In [1]:
import pandas as pd
import numpy as np

In [2]:
def transform(data):
    """Reshape a long-format count table into a word list and a count matrix.

    Transforms the current dataframes (as in gn_gender_all.csv) into the
    ``counts`` format accepted by ``tfidf``: one row per group, one column
    per word.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``'group'``, ``'word'`` and ``'counts'``.

    Returns
    -------
    words : list
        The distinct values of ``data['word']``, in ``list(set(...))`` order.
    counts : list of lists
        ``counts[g][w]`` is the ``'counts'`` value of the first row whose
        group is the g-th distinct group and whose word is ``words[w]``, or 0
        when no such row exists.

    Notes
    -----
    The groups themselves are not returned. Rows of ``counts`` follow the
    order of ``list(set(data['group']))``, so a caller has to evaluate that
    same expression to learn which row belongs to which group.
    """
    words = list(set(data['word']))
    groups = list(set(data['group']))

    # One pass over the table instead of re-filtering the whole DataFrame for
    # every (group, word) pair.
    first_count = {}
    for gr, word, cnt in zip(data['group'], data['word'], data['counts']):
        # A missing group or word never matches an equality filter, so such
        # rows contribute no counts.
        if pd.isnull(gr) or pd.isnull(word):
            continue
        # setdefault keeps the first occurrence when a pair is duplicated.
        first_count.setdefault((gr, word), cnt)

    counts = [[first_count.get((gr, word), 0) for word in words]
              for gr in groups]
    return words,counts

In [3]:
def sumColumn(m, column,exp=1):
    """Add up one column of a matrix given as a sequence of rows.

    Each entry ``m[r][column]`` is raised to the power ``exp`` before being
    added, so the result is sum(column_data ** exp). With the default
    ``exp=1`` this is the plain column total. An empty ``m`` gives 0.
    """
    return sum(line[column]**exp for line in m)

def sumRow(m, row,exp=1):
    """Add up one row of a matrix given as a sequence of rows.

    Each entry ``m[row][c]`` is raised to the power ``exp`` before being
    added, so the result is sum(row_data ** exp). With the default ``exp=1``
    this is the plain row total. The number of columns visited is taken from
    the first row, ``len(m[0])``.
    """
    width = len(m[0])
    return sum(m[row][pos]**exp for pos in range(width))


In [4]:
from sklearn.feature_extraction.text import TfidfTransformer


def tfidf(counts,method='scikit-learn'):
    """Weight a group-by-word count matrix.

    Parameters
    ----------
    counts : sequence of sequences of numbers
        One row per group and one column per word, e.g. the second value
        returned by ``transform``.
    method : str
        ``'scikit-learn'`` (default)
            Delegate to ``TfidfTransformer()`` with its default settings.
        ``'conditional'``
            Conditional probability of the entry being in a certain group
            given that a certain word is observed: count / column total.
        ``'log-conditional'``
            Like ``'conditional'``, but the count is divided by
            ``log(1 + column total)`` so that the result is less extreme.
            The ``1 +`` keeps the divisor non-zero for words whose total
            count is 1, where a plain ``log`` would be 0.

    Returns
    -------
    list of lists
        Same shape as ``counts``.

    Raises
    ------
    ValueError
        If ``method`` is not one of the three names above.

    Notes
    -----
    For the two hand-written methods every row is rescaled afterwards: each
    entry becomes its square divided by the row's sum of squares, so every
    non-zero row sums to 1.
    NOTE(review): that is the *square* of the usual L2-normalised value
    (x / sqrt(sum of squares)), which is what TfidfTransformer applies by
    default -- confirm which of the two is intended before comparing the
    methods' outputs directly.
    """
    if method=='scikit-learn':
        # The scikit-learn method
        transformer = TfidfTransformer()
        dense = transformer.fit_transform(counts).toarray()
        return [list(row) for row in dense]

    if method not in ('conditional', 'log-conditional'):
        # Fail loudly instead of handing back a matrix of empty rows.
        raise ValueError("method not found: %r" % (method,))

    if len(counts) == 0:
        return []

    result = [[] for _ in counts]
    for col in range(len(counts[0])):
        colSum = sum(row[col] for row in counts)
        if method=='conditional':
            denom = colSum
        else:
            denom = float(np.log1p(colSum))
        for row_idx, row in enumerate(counts):
            # A word that never occurs gets weight 0 rather than a
            # division-by-zero error.
            result[row_idx].append(float(row[col])/denom if denom else 0.0)

    # Then rescale every row by its sum of squares (see Notes).
    for row in result:
        norm = sum(value**2 for value in row)
        if norm:
            row[:] = [value**2/norm for value in row]
        # An all-zero row has nothing to rescale and is left as zeros.

    return result

In [5]:
def wordCloudString2(frequency,words,scale=1000):
    """Build the text fed to a word-cloud generator from per-word weights.

    ``words[i]`` is written ``int(frequency[i]*scale)`` times, each copy
    followed by a single space, and the words appear in the order given.
    ``scale`` is the constant that turns a weight (e.g. a row produced by
    tf-idf) into a repeat count; a word whose scaled weight truncates to 0
    or less is left out entirely.
    """
    pieces = []
    for idx in range(len(words)):
        token = words[idx]
        repeats = int(frequency[idx]*scale)
        # Every copy contributes the word and its trailing blank.
        pieces.extend([token, " "] * repeats)
    return "".join(pieces)

In [55]:
# Load the long-format count table. The path is relative, so the CSV is
# looked up one directory above the notebook's working directory.
# transform() reads the 'group', 'word' and 'counts' columns of this frame.
Data = pd.read_csv("../content_gender_all.csv")
# Display the distinct group labels. transform() orders its rows with this
# same list(set(...)) expression, so the position of a label here is the row
# index of that group in the matrices computed below.
# NOTE(review): a set has no guaranteed iteration order -- confirm the
# positions after every kernel restart before indexing rows by number.
list(set(Data['group']))

['M', 'U', 'T', 'F']

In [56]:
# Build the word list and the group-by-word count matrix, then weight the
# matrix twice so the two schemes can be compared:
#   x1 -- scikit-learn's TfidfTransformer
#   x2 -- per-word conditional weighting followed by tfidf()'s row rescaling
words,counts = transform(Data)
x1 = tfidf(counts,method='scikit-learn')
x2 = tfidf(counts,method='conditional')
# The log-conditional variant is disabled.
# NOTE(review): presumably because tfidf() divides by np.log(colSum), which
# is 0 for any word whose total count is 1 -- verify before re-enabling.
#x3 = tfidf(counts,method='log-conditional')

In [60]:
# Word-cloud text for row 3 of the conditional weights, using scale=300
# instead of the default 1000.
# NOTE(review): row 3 is whichever group sits at index 3 of
# list(set(Data['group'])) -- 'F' in the listing shown earlier. That order
# comes from a set, so confirm it still holds before trusting this output.
wordCloudString2(x2[3],words,300)

'right right right love love love love help help people people feel feel feel feel back back back years years years need need see see something something something something something something something something something year year year year year year year year year year year go go go still still still still little little little little little little little little little said said said everything everything everything everything everything everything everything everything everything everything everything things things things away away away away away away away away away away long long going going going take take way way got got got ever ever ever ever ever ever ever ever ever friend friend friend friend friend friend friend friend friend friend told told told told told told told told told younger younger younger younger younger younger younger younger younger younger younger life life life someone someone someone someone someone someone someone someone someone around around never never