In [1]:
import os
from collections import Counter
import codecs
import warnings
warnings.filterwarnings('ignore')

In [2]:
with codecs.open('./ted-gender-annotated/ted-male-female-en.txt', 'rb',encoding="utf-8") as f:
    ted_male_female = f.read().strip()
with open('./ted-gender-annotated/ted-gender-annotations-en.csv') as f:
    ted_gender_annotation = f.read().strip()

ted_male_female = ted_male_female.split('\n\n')
ted_gender_annotation = ted_gender_annotation.split('\n')

print "number of talks: %d"%len(ted_male_female)
print "number of gender annotations: %d"%len(ted_gender_annotation)

number of talks: 1445
number of gender annotations: 1415


In [3]:
def get_talk(ted_male_female, url):
    url = '<url>%s</url>'%url.strip()
    for talk in ted_male_female:
        talk_url = talk.split('\n')[0]
        if talk_url == url:
            output =  talk.split('\n')[1:]
            return output
    return None
#get_talk(ted_male_female,'http://www.ted.com/talks/jessa_gamble_how_to_sleep.html')

In [4]:
# limit talk to ones that are annotated
talks = dict()
talk_id = 0
count_discard = 0
count_no_content = 0
count_males = 0
count_females = 0
for gender_annotation in ted_gender_annotation:
    gender = gender_annotation.split(',')[0].lower()
    url = gender_annotation.split(',')[1]
    
    if gender in ['male','female']:
        #identify the label
        if gender == 'male':
            label = 0
            count_males += 1
        else:
            label =1 
            count_females += 1 
        # get the content
        content = get_talk(ted_male_female, url)
        if content != None:
            talks[talk_id] = {'gender':gender,'label':label,'url':url.strip(),'content':content} 
            talk_id += 1
        else:
            count_no_content +=1 
    else:
        count_discard += 1
            

In [5]:
print "number of discards: %d"%count_discard
print "number of males: %d"%count_males
print "number of females: %d"%count_females
print "number of talks in dataset: %d"% len(talks)

number of discards: 59
number of males: 1012
number of females: 344
number of talks in dataset: 1356


In [6]:
male_texts = []
female_texts = []

for talk_id,talk in talks.items():
    if talk['gender'] == 'male':
        male_texts.append(talk['content'])
    else:
        female_texts.append(talk['content'])
print "number of male texts:%d"%len(male_texts)
print "number of female texts:%d"%len(female_texts)

number of male texts:1012
number of female texts:344


In [17]:
# male_content = []
# for text in male_texts:
#     male_content.append('\n'.join(text))

# with codecs.open('./ted-gender-annotated/male.texts','wb','utf-8') as f:
#     content = '\n\n'.join(male_content)
#     f.write(content)

In [18]:
# female_content = []
# for text in female_texts:
#     female_content.append('\n'.join(text))

# with codecs.open('./ted-gender-annotated/female.texts','wb','utf-8') as f:
#     content = '\n\n'.join(female_content)
#     f.write(content)

In [16]:
with codecs.open('./ted-gender-annotated/male.texts','rb','utf-8') as f:
    content = f.read()

count = 0
for text in content.split('\n\n'):
    if text!='\n\n':
        count +=1
print count
        
print len(content.split('\n\n')[0])

with codecs.open('./ted-gender-annotated/female.texts','rb','utf-8') as f:
    content = f.read()
print len(content.split('\n\n'))


1012
15328
344


In [6]:
from collections import defaultdict
def voc_count(corpus):
    d = defaultdict(int)
    for p in corpus:
        for sent in p:
            for t in sent:
                d[t] += 1
    return d

In [7]:
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
import string

#load list of stop words
with open('./snowball_stopwords.txt','rb') as sw:
    stop_words = [line.strip() for line in sw]

#load punctuations 
punctuations = string.punctuation


#extra characters,
extra = [u"'s", 
         u"'m", 
         u"'re",
         u"'ve",
         u"'t",
         u"'d",
         u"'ll",
         u",",
         u"!",
         u"(",
         u")",
         u"?",
         u'"',
         u'--',
         u"''",
         u'``'
        ]

def pre_process_par(par):
    """
    input: 
       list of sentences
    output:
       list of sentences. Each sentence is a list of tokens.
    """
    output = []
    
    sent_par = par

    # remove meta data
    i = len(sent_par)-1
    while(i>0):
        sent = sent_par[i]
        if ("<talkid>" in sent):
            break
        i = i - 1
        
    if i>0:
        sent_par = sent_par[0:i]
    
     
    # make the sentences lowecase
    sent_par = [sent.lower() for sent in sent_par]
    
    
      # tokenize and clean all sentences
    for i,sent in enumerate(sent_par):
        # remove extra things
        for item in extra:
            sent = sent.replace(item,'')
        try:
            #tokenize each sentence
            tokens = word_tokenize(sent)
        except ValueError:
            print i
            raise ValueError(sent)
            return
        # remove repetitve words in a sentenece
        tokens = list(set(tokens))
        
        #remove stop words and clean texts
        tokens = [tok for tok in tokens if 
                                          (tok not in stop_words) and 
                                          (tok not in punctuations)]
        if len(tokens)>0:
            # put it in the output
            output.append(tokens)
    return output

In [8]:
w2v_path = './glove.840B.300d.txt'

import numpy as np
rng = np.random.RandomState(seed=1)

cn = 0
word2vec = {}
with open(w2v_path,'rb') as w2v:
    content = w2v.read().strip()
    for line in content.split('\n'):
        cn +=1
        line = line.strip().split()
        v = line[0]
        
        vector = line[1:]
        vector = np.matrix(vector,dtype='float32')
        
        word2vec[v] = vector

In [9]:
def overlap(ds, word2vec):
    voc_in_word2vec = set(word2vec.keys())
    num_voc_in_word2vec = len(voc_in_word2vec)
    print "num_voc_in_word2vec: %d"%num_voc_in_word2vec

    voc_in_ds = []
    for par in ds:
        for sent in par:
            voc_in_ds += sent
    
    num_voc_in_ds = len(voc_in_ds)
    print "num_voc_in_ds: %d"%num_voc_in_ds
    
    print "remove duplicates ... "
    voc_in_ds = set(voc_in_ds)
    
    num_voc_in_ds = len(voc_in_ds)
    print "num_voc_in_ds: %d"%num_voc_in_ds
    
    intersection = set(voc_in_word2vec).intersection(voc_in_ds)
    count_overlap = len(intersection)
    print "count_overlap: %d"%count_overlap
    
    precent_overlap = 100*(count_overlap / float(num_voc_in_ds))
    print "precent_overlap: %.2f%%"%precent_overlap

In [10]:
def statistics_ds(ds):
    print "data size (# talks): %d"%len(ds)
    
    par_lens = [len(par) for par in ds]
    avg_par_len = np.average(par_lens)
    print "avg_talk_len: %.2f"%avg_par_len
    
    min_par_len = np.min(par_lens)
    print "min_talk_len (sent based): %2.f"%min_par_len
    
    max_par_len = np.max(par_lens)
    print "max_talk_len (sent based): %2.f"%max_par_len

    std_par_len = np.std(par_lens)
    print "std_talk_len (sent based): %2.f"%std_par_len


    
    sent_lens = []
    for par in ds:
        for sent in par:
            sent_lens.append(len(sent))
    
    avg_sent_len = np.average(sent_lens)
    print "avg_sent_len: %.2f"%avg_sent_len
    
    min_sent_len = np.min(sent_lens)
    print "min_sent_len: %2.f"%min_sent_len

    max_sent_len = np.max(sent_lens)
    print "max_sent_len: %2.f"%max_sent_len

    std_sent_len = np.std(sent_lens)
    print "std_sent_len: %2.f"%std_sent_len
    


In [11]:
import sys
def drawProgressBar(shell_out, 
                    begin, k, out_of, end, barLen =25):
    percent = k/float(out_of)
    sys.stdout.write("\r")
    progress = ""
    for i in range(barLen):
        if i < int(barLen * percent):
            progress += "="
        elif i==int(barLen * percent):
            progress +='>'
        else:
            progress += "_"
    text = "%s%d/%d[%s](%.2f%%)%s"%(begin,k,out_of,progress,percent * 100, end)
    if shell_out== True:
        sys.stdout.write(text)
        sys.stdout.flush()
    return text

In [12]:
# import copy as cp
# tmp = cp.deepcopy(talks)

In [13]:
#talks = cp.deepcopy(tmp)

In [14]:
for talk_id, talk in talks.items():
    text = talk['content']
    talks[talk_id]['content'] = pre_process_par(text)
    drawProgressBar(True,"",talk_id+1,len(talks),"")



In [15]:
import pickle
with open('./ted-gender-annotated/dataset.pkl','wb') as f:
    pickle.dump(talks,f)

In [16]:
texts = [talk['content'] for talk in talks.values() if talk['gender']=='female']
statistics_ds(texts)

data size (# talks): 344
avg_talk_len: 118.77
min_talk_len (sent based): 10
max_talk_len (sent based): 267
std_talk_len (sent based): 52
avg_sent_len: 8.01
min_sent_len:  1
max_sent_len: 69
std_sent_len:  5


In [17]:
texts = [talk['content'] for talk in talks.values() if talk['gender']=='male']
statistics_ds(texts)

data size (# talks): 1012
avg_talk_len: 133.24
min_talk_len (sent based): 11
max_talk_len (sent based): 396
std_talk_len (sent based): 63
avg_sent_len: 7.69
min_sent_len:  1
max_sent_len: 138
std_sent_len:  5


In [18]:
texts = [talk['content'] for talk in talks.values()]
statistics_ds(texts)

data size (# talks): 1356
avg_talk_len: 129.57
min_talk_len (sent based): 10
max_talk_len (sent based): 396
std_talk_len (sent based): 61
avg_sent_len: 7.76
min_sent_len:  1
max_sent_len: 138
std_sent_len:  5


In [19]:
texts = [talk['content'] for talk in talks.values() if talk['gender']=='female']
overlap(texts,word2vec)

num_voc_in_word2vec: 2196016
num_voc_in_ds: 327140
remove duplicates ... 
num_voc_in_ds: 27440
count_overlap: 25931
precent_overlap: 94.50%


In [20]:
texts = [talk['content'] for talk in talks.values() if talk['gender']=='male']
overlap(texts,word2vec)

num_voc_in_word2vec: 2196016
num_voc_in_ds: 1037038
remove duplicates ... 
num_voc_in_ds: 47174
count_overlap: 42724
precent_overlap: 90.57%


In [21]:
texts = [talk['content'] for talk in talks.values()]
overlap(texts,word2vec)

num_voc_in_word2vec: 2196016
num_voc_in_ds: 1364178
remove duplicates ... 
num_voc_in_ds: 54317
count_overlap: 48459
precent_overlap: 89.22%


In [2]:
import pickle
with open("./ted-gender-annotated/dataset.pkl",'rb') as f:
    talks = pickle.load(f)

In [3]:
male_texts = []
female_texts = []

for talk_id, talk in talks.items():
    if talk['gender'] == 'male':
        male_texts.append(talk['content'])
    else:
        female_texts.append(talk['content'])
print "number of male texts:%d"%len(male_texts)
print "number of female texts:%d"%len(female_texts)

number of male texts:1012
number of female texts:344


In [4]:
male_texts[0]

[[u'ocean', u'thing', u'complicated', u'can'],
 [u'thing', u'complicated', u'human', u'health', u'can'],
 [u'daunting',
  u'simple',
  u'move',
  u'themes',
  u'say',
  u'seem',
  u'even',
  u'two',
  u'going',
  u'forward',
  u'really',
  u'might',
  u'understand',
  u'bringing',
  u'task',
  u'together',
  u'try',
  u'complexity',
  u'can',
  u'think'],
 [u'simple',
  u'themes',
  u'really',
  u'aren',
  u'things',
  u'complex',
  u'going',
  u'pretty',
  u'know',
  u'science',
  u'well'],
 [u'nobody', u'ain', u'one', u'start', u'momma', u'going', u'happy'],
 [u'right', u'experienced', u'know'],
 [u'just',
  u'go',
  u'next',
  u'take',
  u'build',
  u'nobody',
  u'step',
  u'ocean',
  u'ain',
  u'can',
  u'happy'],
 [u'theme', u'talk'],
 [u'lot', u'ways', u'different', u'unhappy', u'ocean', u'pretty', u'making'],
 [u'shot', u'cannery', u'1932', u'row'],
 [u'industrial',
  u'west',
  u'cannery',
  u'coast',
  u'time',
  u'biggest',
  u'canning',
  u'operation',
  u'row'],
 [u'amounts