# Machine-learning classifiers to classify word-vectors with respect to gender, morality, health, and socio-economic status

In [3]:
from sklearn import tree
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold 
from sklearn import svm
from sklearn.neural_network import MLPClassifier
import sklearn
import csv
import statistics
from sklearn import decomposition
from sklearn import datasets
import gensim
np.set_printoptions(threshold=np.inf) #do this if you want to print full output
import os
cwd= os.getcwd()

# Upload a Trained Word2Vec Model

*Don't have a model? Use a pretrained Word2Vec Model from Google, trained on Google News*
* Read and download here: https://code.google.com/archive/p/word2vec/
* File is called "GoogleNews-vectors-negative300.bin.gz"

*What you will need to change in this code includes:*
* currentmodel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  
* Some of the vocabulary words used to extract subspace may not exist, and you will get errors and need to find substitutes for these words. 

In [4]:
# Load the trained Word2Vec model and keep only its KeyedVectors (`.wv`).
# Every cell below only looks words up by key (currentmodel[word]); indexing the full
# Word2Vec object directly is deprecated in gensim 3.x and removed in gensim 4, whereas
# KeyedVectors supports it and raises KeyError for out-of-vocabulary words.
currentmodel = Word2Vec.load("modelA_ALLYEARS_300dim_10CW").wv  #load up the word2vec model of choice, model 2 used for all decision tree training

# Get a Dataset of Word-Vectors

In [5]:
# Gender training vocabulary, variant 3.
# Ordering matters: the 81 feminine-coded terms come first, followed by the 81 masculine-coded
# terms. class_word_data labels the first half of a training list 1 and the second half 0,
# so both halves must keep the same length and this order.
gendertrain_words3=[ 
            'madame', 'ladies', 'lady',
          'mother', 'mothers', 'mom', 'mama', 'granddaughter', 'daughter', 'daughters', 'aunt', 'godmother', 
          'grandma', 'grandmothers', 'grandmother', 'sister', 'sisters', 'aunts', 'stepmother', 'granddaughters', 'niece',
        'fiancee', 'ex_girlfriend', 'girlfriends', 'wife', 'wives', 'girlfriend', 'bride', 'brides', 'widow',
           'twin_sister', 'younger_sister', 'teenage_girl', 'teenage_girls', 'eldest_daughter','estranged_wife', 'schoolgirl',
        'businesswoman', 'congresswoman' , 'chairwoman', 'councilwoman', 'waitress', 'hostess', 'convent', 'heiress', 
           'saleswoman', 'queen', 'queens', 'princess', 'nun' , 'nuns', 'heroine', 'actress', 'actresses', 'uterus', 'vagina', 'ovarian_cancer',
        'maternal', 'maternity', 'motherhood', 'sisterhood', 'girlhood', 'matriarch', 'sorority', 'mare', 'hen', 'hens', 'filly', 'fillies',
          'deer', 'older_sister', 'oldest_daughter', 'stepdaughter', 'pink', 'cute', 'dependent', 'nurturing', 'hysterical', 'bitch',  'dance', 'dancing', 
               'sir', 'guys', 'gentleman','father', 'fathers', 'dad', 'papa', 'grandson' , 'son', 'sons', 'uncle', 'godfather', 
        'grandpa', 'grandfathers', 'grandfather', 'brother', 'brothers' , 'uncles', 'stepfather', 'grandsons', 'nephew',
           'fiance', 'ex_boyfriend', 'boyfriends', 'husband', 'husbands', 'boyfriend', 'groom', 'grooms', 'widower',
            'twin_brother', 'younger_brother', 'teenage_boy', 'teenage_boys', 'eldest_son', 'estranged_husband', 'schoolboy',
            'businessman', 'congressman', 'chairman', 'councilman', 'waiter', 'host', 'monastery', 'heir', 'salesman', 
            'king', 'kings', 'prince', 'monk', 'monks', 'hero', 'actor', 'actors', 'prostate', 'penis', 'prostate_cancer', 
        'paternal', 'paternity', 'fatherhood', 'brotherhood', 'boyhood', 'patriarch', 'fraternity', 'stallion', 'rooster', 'roosters', 'colt',
           'colts', 'bull', 'older_brother', 'oldest_son', 'stepson', 'blue' ,'manly', 'independent', 'aggressive', 'angry', 'jerk', 'wrestle', 'wrestling'  ]

# Gender training vocabulary, variant 2: adds some more implicit (and thus arguably labeled, too)
# gender words for training, and removes some of the explicit ones.
# Ordering matters: 93 feminine-coded terms first, then 93 masculine-coded terms, because
# class_word_data labels the first half of a training list 1 and the second half 0.
# Note that 'lady' appears twice in the feminine half.
gendertrain_words2=[ 'girl', 'girls', 'her', 'hers', 'herself', 'she', 
            'lady', 'gal', 'gals', 'madame', 'ladies', 'lady',
          'mother', 'mothers', 'mom', 'moms', 'mommy', 'mama', 'ma', 'granddaughter', 'daughter', 'daughters', 'aunt', 'godmother', 
          'grandma', 'grandmothers', 'grandmother', 'sister', 'sisters', 'aunts', 'stepmother', 'granddaughters', 'niece',
        'fiancee', 'ex_girlfriend', 'girlfriends', 'wife', 'wives', 'girlfriend', 'bride', 'brides', 'widow',
           'twin_sister', 'younger_sister', 'teenage_girl', 'teenage_girls', 'eldest_daughter','estranged_wife', 'schoolgirl',
        'businesswoman', 'congresswoman' , 'chairwoman', 'councilwoman', 'waitress', 'hostess', 'convent', 'heiress', 
           'saleswoman', 'queen', 'queens', 'princess', 'nun' , 'nuns', 'heroine', 'actress', 'actresses', 'uterus', 'vagina', 'ovarian_cancer',
        'maternal', 'maternity', 'motherhood', 'sisterhood', 'girlhood', 'matriarch', 'sorority', 'mare', 'hen', 'hens', 'filly', 'fillies',
          'deer', 'older_sister', 'oldest_daughter', 'stepdaughter', 'pink',  'cute', 'dependent', 'nurturing', 'hysterical', 'bitch',  'dance', 'dancing', 
                'boy', 'boys', 'him', 'his', 'himself', 'he', 'guy', 'dude',
            'dudes', 'sir', 'guys', 'gentleman','father', 'fathers', 'dad', 'dads', 'daddy', 'papa', 'pa', 'grandson' , 'son', 'sons', 'uncle', 'godfather', 
        'grandpa', 'grandfathers', 'grandfather', 'brother', 'brothers' , 'uncles', 'stepfather', 'grandsons', 'nephew',
           'fiance', 'ex_boyfriend', 'boyfriends', 'husband', 'husbands', 'boyfriend', 'groom', 'grooms', 'widower',
            'twin_brother', 'younger_brother', 'teenage_boy', 'teenage_boys', 'eldest_son', 'estranged_husband', 'schoolboy',
            'businessman', 'congressman', 'chairman', 'councilman', 'waiter', 'host', 'monastery', 'heir', 'salesman', 
            'king', 'kings', 'prince', 'monk', 'monks', 'hero', 'actor', 'actors', 'prostate', 'penis', 'prostate_cancer', 
        'paternal', 'paternity', 'fatherhood', 'brotherhood', 'boyhood', 'patriarch', 'fraternity', 'stallion', 'rooster', 'roosters', 'colt',
           'colts', 'bull', 'older_brother', 'oldest_son', 'stepson', 'blue' ,'manly', 'independent', 'aggressive', 'angry', 'jerk', 'wrestle', 'wrestling'  ]

# Gender training vocabulary, base variant: explicitly gendered terms only.
# Feminine-coded terms are listed first, then their masculine-coded counterparts.
# class_word_data labels the first half of a training list 1 and the second half 0, so the
# two halves must stay the same length and in this order.
gendertrain_words=['womanly', 'my_wife', 'my_mom', 'my_grandmother', 'woman', 'women', 'girl', 'girls', 'her', 'hers', 'herself', 'she', 
            'lady', 'gal', 'gals', 'madame', 'ladies', 'lady',
          'mother', 'mothers', 'mom', 'moms', 'mommy', 'mama', 'ma', 'granddaughter', 'daughter', 'daughters', 'aunt', 'godmother', 
          'grandma', 'grandmothers', 'grandmother', 'sister', 'sisters', 'aunts', 'stepmother', 'granddaughters', 'niece',
          'fiancee', 'ex_girlfriend', 'girlfriends', 'wife', 'wives', 'girlfriend', 'bride', 'brides', 'widow',
           'twin_sister', 'younger_sister', 'teenage_girl', 'teenage_girls', 'eldest_daughter','estranged_wife', 'schoolgirl',
          'businesswoman', 'congresswoman' , 'chairwoman', 'councilwoman', 'waitress', 'hostess', 'convent', 'heiress', 
           'saleswoman', 'queen', 'queens', 'princess', 'nun' , 'nuns', 'heroine', 'actress', 'actresses', 'uterus', 'vagina', 'ovarian_cancer',
           'maternal', 'maternity', 'motherhood', 'sisterhood', 'girlhood', 'matriarch', 'sorority', 
         'older_sister', 'oldest_daughter', 'stepdaughter',
                    'manly', 'my_husband', 'my_dad','my_grandfather', 'man', 'men', 'boy', 'boys', 'him', 'his', 'himself', 'he', 'guy', 'dude',
            'dudes', 'sir', 'guys', 'gentleman','father', 'fathers', 'dad', 'dads', 'daddy', 'papa', 'pa', 'grandson' , 'son', 'sons', 'uncle', 'godfather', 
           'grandpa', 'grandfathers', 'grandfather', 'brother', 'brothers' , 'uncles', 'stepfather', 'grandsons', 'nephew',
           'fiance', 'ex_boyfriend', 'boyfriends', 'husband', 'husbands', 'boyfriend', 'groom', 'grooms', 'widower',
            'twin_brother', 'younger_brother', 'teenage_boy', 'teenage_boys', 'eldest_son', 'estranged_husband', 'schoolboy',
            'businessman', 'congressman', 'chairman', 'councilman', 'waiter', 'host', 'monastery', 'heir', 'salesman', 
            'king', 'kings', 'prince', 'monk', 'monks', 'hero', 'actor', 'actors', 'prostate', 'penis', 'prostate_cancer', 
           'paternal', 'paternity', 'fatherhood', 'brotherhood', 'boyhood', 'patriarch', 'fraternity', 
           'older_brother', 'oldest_son', 'stepson']

# Unlabelled probe words: these are not used for training or testing; later in the notebook
# each one is passed to the fitted classifier to see which gender class it is assigned.
genderinteresting_words= [ 'blonde', 'blond', 'politician', 'programmer', 'nurse', 'doctor', 'estrogen', 'testosterone', 'soldier', 'army', 'drafted', 'military', 
                          'pregnancy', 'pregnant', 'beard', 'nanny', 'pink', 'lipstick', 'mustache', 'bride', 'groom', 'lady', 'guy', 'sewing', 'modeling', 'actress', 'actor', 'genius', 'brilliant']

# Implicitly gendered (stereotype) words: stereotypically feminine terms first ('petite' ...
# 'submissive'), then stereotypically masculine terms ('soldier' ... 'dominant').
# NOTE(review): presumably the source of the "stereotypes" rows read back from the CSV further
# down; this list is not referenced by any other visible code cell — confirm.
genderinteresting_words_implicit=['petite', 'cooking', 'graceful',  'housework', 'soft', 'whisper', 'flirtatious', 'accepting', 'blonde', 'blond', 'doll', 'dolls','nurse',  'estrogen', 'lipstick','pregnant', 'nanny', 'pink', 
                                'sewing', 'modeling', 'dainty', 'gentle', 'children','pregnancy', 'nurturing', 'depressed', 'nice', 'emotional','depression', 'home', 'kitchen', 'quiet', 'submissive',
                                'soldier', 'army', 'drafted', 'military',   'beard', 'mustache', 'genius', 'engineering', 'math', 
                                'brilliant', 'strong', 'strength',  'politician', 'programmer','doctor', 'sexual', 'aggressive', 
                                'testosterone', 'tall', 'competitive', 'big', 'powerful', 'mean', 'sports', 'fighting', 'confident', 'rough', 'loud', 'worldly',
                                  'experienced', 'insensitive', 'ambitious', 'dominant']

# Held-out gender test vocabulary (60 words).
# The first 32 entries ('goddess' ... 'comedienne') are feminine-coded and the remaining 28
# ('boyish' ... 'strongman') are masculine-coded. Test labels are assigned purely by position
# in class_word_data, so any edit here must keep the feminine block first and the class counts
# used there in sync with this list.
gendertest_words=['goddess', 'single_mother', 'girlish', 'feminine', 'young_woman', 'little_girl', 'ladylike', 'my_mother', 
           'teenage_daughter', 'mistress', 'great_grandmother', 'adopted_daughter', 'femininity', 'motherly', 'matronly', 
           'showgirl', 'housewife', 'vice_chairwoman', 'co_chairwoman', 'spokeswoman', 'governess', 'divorcee', 'spinster', 
           'maid', 'countess', 'pregnant_woman', 'landlady', 'seamstress', 'young_girl', 'waif', 'femme_fatale','comedienne',
            'boyish', 'masculine',  'lad', 'policeman', 'macho', 'gentlemanly', 'machismo',  'teenage_son', 
            'beau', 'great_grandfather', 'tough_guy', 'masculinity', 'bad_boy', 'spokesman', 'baron', 'adult_male', 'landlord', 'fireman', 'mailman', 'vice_chairman', 
           'co_chairman','young_man', 'bearded', 'mustachioed', 'con_man', 'homeless_man', 'gent', 'strongman']

# Morality training vocabulary: moral/virtuous terms first ('good' ... 'pure'), then
# immoral terms ('bad' ... 'impure'). class_word_data labels the first half of a training
# list 1 and the second half 0, so the two halves must stay the same length and in this order.
# Several words are repeated (e.g. 'virtue', 'pleasant', 'decent', 'hero', 'dirty', 'rude').
# NOTE(review): 'complement' sits where the immoral half has 'insult'; presumably
# 'compliment' was intended — confirm before changing, since it alters the training data.
moraltrain_words= ['good', 'benevolent', 'nice', 'caring', 'conscientious', 'polite', 'fair', 'virtue', 'respect', 'responsible', 
            'selfless', 'unselfish', 'sincere', 'truthful', 'wonderful', 'justice', 'innocent', 'innocence',
           'complement', 'sympathetic', 'virtue', 'right', 'proud', 'pride','respectful', 'appropriate', 'pleasing', 'pleasant', 
            'pure', 'decent', 'pleasant', 'compassion' , 'compassionate', 'constructive','graceful', 'gentle', 'reliable',
           'careful', 'help', 'decent' , 'moral', 'hero', 'heroic', 'heroism', 'honest', 'honesty',
           'selfless', 'humility', 'humble', 'generous', 'generosity', 'faithful', 'fidelity', 'worthy', 'tolerant',
            'obedient', 'pious', 'saintly', 'angelic', 'virginal', 'sacred', 'reverent', 'god', 'hero', 'heroic', 
            'forgiving', 'saintly','holy', 'chastity', 'grateful', 'considerate', 'humane', 
            'trustworthy', 'loyal', 'loyalty', 'empathetic', 'empathy', 'clean', 'straightforward', 'pure',
                    'bad', 'evil', 'mean', 'uncaring', 'lazy', 'rude', 'unfair', 'sin', 'disrespect','irresponsible', 
           'self_centered', 'selfish', 'insincere', 'lying', 'horrible', 'injustice', 'guilty', 'guilt', 
            'insult', 'unsympathetic', 'vice', 'wrong', 'ashamed', 'shame', 'disrespectful', 'inappropriate', 'vulgar', 'crude', 
            'dirty', 'obscene', 'offensive', 'cruelty','brutal', 'destructive', 'rude', 'harsh', 'unreliable',
            'careless', 'harm', 'indecent', 'immoral', 'coward', 'cowardly', 'cowardice', 'dishonest', 'dishonesty',
            'narcissistic', 'arrogance', 'arrogant', 'greedy', 'greed', 'betray', 'betrayal', 'unworthy', 'intolerant', 
             'defiant', 'rebellious', 'demonic','devilish', 'promiscuous', 'profane', 'irreverent', 'devil', 'villain', 'villainous', 
            'vindictive', 'diabolical', 'unholy', 'promiscuity', 'ungrateful', 'thoughtless', 'inhumane',
            'untrustworthy', 'treacherous', 'treachery', 'callous', 'indifference', 'dirty', 'manipulative', 'impure' ]

# Held-out morality test vocabulary (60 words).
# The first 27 entries ('great' ... 'forthright') are moral and the remaining 33
# ('depraved' ... 'deviant') are immoral. Test labels are assigned purely by position in
# class_word_data, so keep the moral block first and the class counts used there in sync.
moraltest_words=['great', 'best', 'faith', 'chaste', 'wholesome', 'noble', 'honorable', 'immaculate', 'gracious', 
           'courteous', 'delightful', 'earnest', 'amiable', 'admirable', 'disciplined', 'patience', 'integrity',
            'restraint', 'upstanding', 'diligent', 'dutiful', 'loving', 'righteous','respectable', 'praise', 'devout', 'forthright',
            'depraved', 'repulsive', 'repugnant', 'corruption', 'vicious', 'unlawful', 'outrage',  'shameless', 'perverted',
            'filthy', 'lewd', 'subversive', 'sinister', 'murderous', 'perverse', 
           'monstrous', 'homicidal', 'indignant', 'misdemeanor', 'degenerate', 'malevolent', 'illegal','terrorist','terrorism',  
             'cheated', 'vengeful', 'culpable','vile', 'hateful', 'abuse', 'abusive', 'criminal', 'deviant']

# Health training vocabulary: healthy terms first ('fertile' ... 'salubrious'), then
# unhealthy terms ('infertile' ... 'deleterious'). class_word_data labels the first half of a
# training list 1 and the second half 0, so the halves must stay equal in length and in order.
# Many words are repeated (e.g. 'healthy', 'nutritious', 'sedentary', 'processed_foods');
# NOTE(review): presumably deliberate, to keep the two halves the same length and to weight
# core terms — confirm before de-duplicating.
healthtrain_words= ['fertile', 'help_prevent', 'considered_safe', 'safer', 'healthy', 'healthy', 'healthy', 'healthy', 'healthy',
            'healthful', 'well_balanced', 'natural', 'healthy', 'athletic','physically_active', 'health',
            'health', 'nutritious','nourishing', 'stronger', 'strong','wellness', 'safe', 'nutritious_food','exercise',
            'physically_fit', 'unprocessed', 'healthier_foods', 'nutritious_foods', 'nutritious', 'nutritious',
           'healthy_eating', 'healthy_diet', 'healthy_diet', 'nourishing', 'nourished', 'regular_exercise', 'safety', 'safe', 
            'helpful', 'beneficial', 
            'healthy', 'healthy', 'sturdy', 'lower_risk', 'reduced_risk', 'decreased_risk', 'nutritious_foods', 'whole_grains', 'healthier_foods',
            'healthier_foods', 'physically_active', 'physical_activity', 'nourished', 'vitality', 'energetic', 'able_bodied',
            'resilience', 'strength', 'less_prone', 'sanitary', 'clean',  'healing', 'heal', 'salubrious', 
                    'infertile', 'cause_harm','potentially_harmful','riskier', 'unhealthy', 'sick', 'ill', 'frail', 'sickly', 
            'unhealthful','unbalanced', 'unnatural', 'dangerous', 'sedentary', 'inactive', 'illness', 
            'sickness', 'toxic', 'unhealthy', 'weaker', 'weak', 'illness', 'unsafe', 'unhealthy_foods', 'sedentary',
            'inactive', 'highly_processed', 'processed_foods', 'junk_foods', 'unhealthy_foods', 'junk_foods',
           'processed_foods', 'processed_foods', 'fast_food', 'unhealthy_foods', 'deficient', 'sedentary', 'hazard','hazardous', 
            'harmful', 'injurious', 
            'chronically_ill', 'seriously_ill', 'frail', 'higher_risk', 'greater_risk', 'increased_risk', 'fried_foods', 'fried_foods',
            'fatty_foods', 'sugary_foods', 'sedentary', 'physical_inactivity', 'malnourished', 'lethargy', 'lethargic', 'disabled',
            'susceptibility', 'weakness', 'more_susceptible', 'filthy', 'dirty', 'harming', 'hurt', 'deleterious'
              ]

# Held-out health test vocabulary (60 words).
# The first 27 entries ('balanced_diet' ... 'immune') are healthy and the remaining 33
# ('deadly' ... 'smoking') are unhealthy. Test labels are assigned purely by position in
# class_word_data, so keep the healthy block first and the class counts used there in sync.
# NOTE(review): 'filthy' also appears in healthtrain_words, so it is not truly held out.
healthtest_words=[ 'balanced_diet', 'healthfulness', 'fiber', 'jogging', 'stopping_smoking', 'vigor', 
          'active', 'fit', 'flourishing', 'sustaining', 'hygienic', 'hearty', 'enduring', 'energized', 'wholesome', 
           'holistic', 'healed', 'fitter', 'health_conscious', 'more_nutritious', 'live_longer',  'exercising_regularly',
           'healthier_choices', 'healthy_habits', 'healthy_lifestyle', 'healthful_eating', 'immune', 
            'deadly', 'diseased',  'adverse', 'risky', 'fatal', 'filthy', 'epidemic', 'crippling', 'carcinogenic', 'carcinogen',
           'crippled', 'afflicted', 'contaminated', 'fatigued', 'detrimental', 'bedridden', 'incurable', 'hospitalized',
           'infected', 'ailing', 'debilitated', 'poisons', 'disabling', 'life_threatening', 'debilitating', 
           'chronic_illness', 'artery_clogging', 'hypertension','disease', 'stroke',
            'plague', 'fatty', 'smoking'] 

# Socio-economic-status (SES) training vocabulary: wealth-related terms first
# ('wealth' ... 'suburbanites'), then poverty-related terms ('poverty' ... 'homeless').
# class_word_data labels the first half of a training list 1 and the second half 0, so the
# halves must stay equal in length and in this order. Several words are repeated.
# NOTE(review): 'lack' is in the wealth half and 'abundance' is in the poverty half, next to
# 'abundant' and 'lacking' respectively — these two look swapped; confirm before changing,
# since it alters the training labels.
sestrain_words=['wealth', 'wealthier', 'wealthiest', 'affluence', 'prosperity', 'wealthy', 'affluent', 'affluent', 'prosperous',
                'prosperous','prosperous','disposable_income',  'wealthy','suburban','luxurious','upscale','upscale', 'luxury', 
                'richest', 'privileged', 'moneyed', 'privileged', 'privileged', 'educated', 'employed', 
                'elite', 'upper_income', 'upper_class', 'employment', 'riches', 'millionaire', 'aristocrat', 'college_educated',
                'abundant', 'lack', 'luxury', 'profitable', 'profit', 'well_educated', 'elites', 'heir', 'well_heeled', 
                'white_collar', 'higher_incomes', 'bourgeois', 'fortunate', 'successful','economic_growth', 'prosper', 'suburbanites', 
                        'poverty', 'poorer', 'poorest', 'poverty', 'poverty', 'impoverished', 'impoverished',  'needy',  'impoverished',
                 'poor', 'needy', 'broke', 'needy', 'slum', 'ghetto', 'slums', 'ghettos', 'poor_neighborhoods', 
                'poorest', 'underserved', 'disadvantaged','marginalized', 'underprivileged', 'uneducated', 'unemployed', 
                'marginalized', 'low_income', 'underclass','unemployment', 'rags', 'homeless', 'peasant', 'college_dropout', 
                'lacking', 'abundance', 'squalor', 'bankrupt', 'debt', 'illiterate' ,'underclass', 'orphan',  'destitute', 
                'blue_collar', 'low_income', 'neediest', 'less_fortunate', 'unsuccessful', 'economic_crisis', 'low_wage', 'homeless'
                  ]

# Held-out SES test vocabulary (60 words).
# The first 34 entries ('rich' ... 'genteel') are wealth-related and the remaining 26
# ('homelessness' ... 'poor_families') are poverty-related. Test labels are assigned purely by
# position in class_word_data, so keep the wealth block first and the class counts used
# there in sync with this list.
sestest_words= ['rich', 'billionaire', 'banker',  'fortune', 'heiress', 'cosmopolitan', 'ornate', 'entrepreneur', 'sophisticated',
                'aristocratic', 'investor', 'highly_educated', 'better_educated',  'splendor', 
               'businessman', 'opulent', 'multimillionaire', 'philanthropist', 'estate', 'estates', 'chateau', 'fortunes', 
               'financier', 'young_professionals','tycoon', 'baron', 'grandeur', 'magnate', 
               'investment_banker', 'venture_capitalist', 'upwardly_mobile', 'highly_skilled', 'yuppies', 'genteel',
                         'homelessness', 'ruin', 'ruined', 'downtrodden', 'less_affluent',
                'housing_project', 'homeless_shelters', 'indigent', 'jobless', 'welfare',  
                'temporary_shelters','housing_projects', 'subsidized_housing', 'starving', 'beggars', 'orphanages',
                'dispossessed', 'uninsured', 'welfare_recipients', 'food_stamps', 
                'malnutrition',  'underemployed', 'disenfranchised', 'servants', 'displaced', 'poor_families'] 

# Unlabelled obesity/body-weight keywords. Their word-vectors are exported to a separate CSV
# further down so they can be scored with the fitted classifiers; no class labels are attached.
obesity_words= ['obese', 'obesity', 'diabetic', 'diabetes', 'weight', 'overweight', 'thin', 'slender', 'burly',
                'muscular', 'diet', 'dieting', 'health', 'healthy', 'unhealthy', 'fat', 'anorexic', 'anorexia', 'bulimia', 
                'beautiful', 'handsome', 'overeating', 'exercise', 'sedentary', 'bulimic', 'morbidly_obese', 'normal_weight',
                'seriously_overweight'] 


Make sure these words are actually in your Word2Vec model vocabulary:

In [6]:
# Every vocabulary list whose words are later looked up in the model. Each list appears
# exactly once, and the two "interesting words" lists are included because their words are
# fed to currentmodel as well.
allwords = [gendertrain_words, gendertrain_words2, gendertrain_words3, gendertest_words,
            genderinteresting_words, genderinteresting_words_implicit,
            moraltrain_words, moraltest_words, healthtrain_words, healthtest_words,
            sestrain_words, sestest_words, obesity_words]

# Many words occur in several lists (or several times in one list); report each missing
# word only once.
missing_words = []
for word_list in allwords:
    for word in word_list:
        if word in missing_words:
            continue
        try:
            currentmodel[word]
        except KeyError:
            missing_words.append(word)
            print(str(word) + ' was not in this Word2Vec model vocab, try a replacement, noting which class (i.e., feminine or masculine) the word corresponds to')

Get the Classes (Labels) Data for Training and Testing Data

If you had to change the vocabulary used above and the word was a training/testing word, ensure the replacement word is of the same class (e.g., feminine vs. masculine).

In [7]:
def _vectors_or_placeholder(words):
    """Return one CSV row per word: its vector from currentmodel, or ['nope'] if it is missing."""
    rows = []
    for word in words:
        try:
            rows.append(currentmodel[word])
        except KeyError:
            # One-cell placeholder keeps row i aligned with word i. (Passing the bare string
            # 'nope' to csv.writer.writerow would be split into the cells n, o, p, e.)
            rows.append(['nope'])
    return rows


def class_word_data(train_words, test_words, scale, filename, n_positive_test=None):
    """Write words, class labels and word-vectors for one train/test split to a CSV file.

    Labels are assigned purely by position. Label 1 is feminine/moral/healthy/rich and
    label 0 is masculine/immoral/unhealthy/poor.

    CSV layout, one csv row each: the training words, the training labels, one row per
    training vector, the test words, the test labels, one row per test vector. A word that is
    not in ``currentmodel`` gets the single-cell row ``nope``.

    Parameters
    ----------
    train_words : list of str
        Training vocabulary; the first half is labelled 1 and the rest 0.
    test_words : list of str
        Test vocabulary; the first ``n_positive_test`` words are labelled 1 and the rest 0.
    scale : str
        One of 'gender', 'moral', 'health', 'ses'. Only used to pick the default
        ``n_positive_test``.
    filename : str
        Path of the CSV file to (over)write.
    n_positive_test : int, optional
        Number of leading class-1 words in ``test_words``. Defaults to the count that matches
        this notebook's own test list for ``scale`` (gender 32, moral 27, health 27, ses 34).
        Pass it explicitly whenever a test list has been edited.

    Raises
    ------
    ValueError
        If ``scale`` is unknown and ``n_positive_test`` is not given, or if
        ``n_positive_test`` is negative or larger than ``len(test_words)``.
    """
    # Leading class-1 counts of gendertest_words / moraltest_words / healthtest_words /
    # sestest_words as defined in this notebook (each list has 60 words in total).
    default_positive_counts = {'gender': 32, 'moral': 27, 'health': 27, 'ses': 34}

    # Validate before touching the output file so a bad call never leaves a partial CSV.
    if n_positive_test is None:
        if scale not in default_positive_counts:
            raise ValueError('not clear which set of words to use: scale=' + repr(scale))
        n_positive_test = default_positive_counts[scale]
    if not 0 <= n_positive_test <= len(test_words):
        raise ValueError('n_positive_test=' + str(n_positive_test) +
                         ' does not fit a test list of ' + str(len(test_words)) + ' words')

    # Training labels: first half 1, remainder 0. Integer arithmetic keeps the label count
    # equal to the word count (a float repeat count is rejected by current numpy).
    n_positive_train = len(train_words) // 2
    train_classes = [1] * n_positive_train + [0] * (len(train_words) - n_positive_train)

    # Test labels: derive the 0-count from the list length so labels always match the words.
    test_classes = [1] * n_positive_test + [0] * (len(test_words) - n_positive_test)

    train_vecs = _vectors_or_placeholder(train_words)
    test_vecs = _vectors_or_placeholder(test_words)

    # newline='' is what the csv module expects for files it writes; the with-block closes
    # the file even if a write fails.
    with open(filename, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(train_words)
        writer.writerow(train_classes)
        writer.writerows(train_vecs)
        writer.writerow(test_words)
        writer.writerow(test_classes)
        writer.writerows(test_vecs)

In [8]:
# Export the gender split built from gendertrain_words2 and gendertest_words.
# Run this for gender words, moral words, health words, and SES words. Easiest to keep track
# of if run separately and CSVs are labeled accordingly.
class_word_data(gendertrain_words2, gendertest_words, 'gender',  'Word_Vectors_gender_ALLYEARS_30010CW_loosertrainingwords.csv')

In [None]:
#Write CSV with Obesity-keywords wordvectors
# Layout: the first row holds the words; each following row holds the vector of the word at
# the same position in that first row.

obesity_vecs = []
for word in obesity_words:
    try:
        obesity_vecs.append(currentmodel[word])
    except KeyError:
        # Report the gap and keep a one-cell placeholder row so row i still lines up with
        # word i. (A bare 'nope' string would be written as the four cells n, o, p, e.)
        print(str(word) + ' was not in this Word2Vec model vocab')
        obesity_vecs.append(['nope'])

# newline='' is what the csv module expects; the with-block closes the file even on error.
with open('Word_Vectors_Obesity.csv', 'w', newline='') as obesity_file:
    writer = csv.writer(obesity_file)
    writer.writerow(obesity_words)
    writer.writerows(obesity_vecs)

# Now that you've saved CSVs for each training and testing set, and obesity word-vector set, import and shape this data

In [39]:
# Load the gender word-vector table (no header row). Column 1 holds the split label,
# column 2 the class label, and columns 3 onward the vector components.
# NOTE(review): column 0 presumably holds the word itself — confirm against the CSV.
dat = pd.read_csv('Word_Vectors_gender_ALLYEARS_300_10CW.csv', header=None)
split_labels = dat[1]

# Full rows for each split; the *_all frames keep the word/split/class columns.
training_all = dat.loc[split_labels == "training3"]
testing_all = dat.loc[split_labels == "testing"]  #can swap out testing or stereotypes for exploring gender here
stereotypes_all = dat.loc[split_labels == "stereotypes"]  #can swap out testing or stereotypes for exploring gender here

# Feature-only views: drop the three leading non-feature columns.
training = training_all.iloc[:, 3:]
#training= training.loc[dat[2]==0] to get training data for just feminine wordvectors
testing = testing_all.iloc[:, 3:]
stereotypes = stereotypes_all.iloc[:, 3:]


In [40]:
# Load the obesity word-vector table (no header row), drop its first column and scale
# every remaining row to unit L2 norm.
dat2 = pd.read_csv('Word_Vectors_Obesity_ALLYEARS_300_10CW.csv', header=None)
obesity_features = dat2.iloc[:, 1:]
subdat_obesity = sklearn.preprocessing.normalize(obesity_features, norm='l2')

In [41]:
# normalize() returned a plain array; wrap it in a DataFrame again so the .index / .iloc
# access used further down works.
subdat_obesity = pd.DataFrame(data=subdat_obesity)

If desired, reduce data dimensionality with Principal Component Analysis

In [23]:
# Fit a 50-component PCA on the training feature columns.
n_pca_components = 50
pca = decomposition.PCA(n_components=n_pca_components)
pca.fit(training)

PCA(copy=True, iterated_power='auto', n_components=50, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [24]:
# Project the training vectors onto the fitted principal components.
PCAd_training = pca.transform(training)

In [25]:
# Share of the total variance captured by each retained component, largest first.
print(pca.explained_variance_ratio_)

[ 0.1005174   0.07629745  0.05802303  0.05646248  0.04796207  0.04127004
  0.03426234  0.02977129  0.02935428  0.02547648  0.02049176  0.01893327
  0.01821458  0.01648444  0.01569333  0.01286961  0.01266341  0.01246324
  0.01175406  0.01089521  0.01059884  0.00966333  0.00958397  0.00880871
  0.00855983  0.00844487  0.0082911   0.00787601  0.00781576  0.0075975
  0.00726766  0.00701461  0.00650561  0.00635682  0.00616906  0.00609852
  0.00583818  0.00577576  0.00553722  0.00529397  0.00499118  0.00488846
  0.0048296   0.00473739  0.00470892  0.00456641  0.00436864  0.00426812
  0.00422873  0.00410697]


In [26]:
PCAd_training.shape #now 50 features instead of the original 300 word-vector dimensions

(170, 50)

# Do this chunk of code for k-fold cross-validation rather than merely 2-fold of testing/training, like below lines

Note that this is cross validation on training set, meaning the "training" below is part of the training set, and the "testing" is really a validation set, and still part of the larger training set.

In [9]:
#Split training into k folds (now have multiple validation/training sets)
# `training` already holds only the feature columns, so take all of them as a plain array
# for sklearn.
subdat_training = np.array(training.iloc[:, :])
#subdat_training= PCAd_training

# Scale every word-vector (row) to unit L2 norm.
subdat_training = sklearn.preprocessing.normalize(subdat_training, norm='l2')
classes_training = np.array(training_all[2])  #classes data is in column 2 of the data. make into array to use sklearn

# Seed the shuffle so fold membership, and therefore the accuracies reported below, are
# reproducible across kernel restarts (234 is the seed used elsewhere in this notebook).
kf = KFold(n_splits=10, shuffle=True, random_state=234)  #try 3 and 4 for n_splits for resonable amount of data in testing/training sets.

In [10]:
# Sanity check: shape of a single row, i.e. the number of features per word-vector.
subdat_training[1].shape

(300,)

In [11]:
# Sanity check: after L2 normalisation the Euclidean norm of a row should be 1.
np.linalg.norm(subdat_training[18]) 

1.0

In [12]:
# Number of training word-vectors (rows).
len(subdat_training)

188

In [13]:
# Accuracy per fold: trainacc on the folds used for fitting, testacc on the held-out
# (validation) fold.
testacc = []
trainacc = []
for train_index, test_index in kf.split(subdat_training):
    fold_train_vecs = subdat_training[train_index]
    fold_valid_vecs = subdat_training[test_index]

    #clf = RandomForestClassifier(n_estimators=50, max_depth=4,random_state=234) 
    clf = svm.SVC(kernel='linear', C=1) #Use linear kernel, since not much data. More complex kernels performed worse and very high SD on accuracy. #C is 1 by default and it’s a reasonable default choice. If you have a lot of noisy observations you should decrease it. It corresponds to regularize more the estimation.
    #clf = MLPClassifier(hidden_layer_sizes=(5)) #to try neural network
    clf = clf.fit(fold_train_vecs, classes_training[train_index])

    # predict() expects a 2-D (n_samples, n_features) array, so score each fold in a single
    # call instead of passing one 1-D vector at a time.

    #first on training set
    predictions_training = clf.predict(fold_train_vecs)
    trainacc.append(accuracy_score(classes_training[train_index], predictions_training))

    #second on mini testing set (i.e., validation set)
    predictions_test = clf.predict(fold_valid_vecs)
    testacc.append(accuracy_score(classes_training[test_index], predictions_test))

















In [14]:
# Median and sample standard deviation of the per-fold accuracies: training folds first,
# then validation folds.
cv_summary = (
    statistics.median(trainacc),
    statistics.stdev(trainacc),
    statistics.median(testacc),
    statistics.stdev(testacc),
)
print(*cv_summary)

0.94674556213 0.006231313415079559 0.894736842105 0.052762273654691993


In [15]:
# Per-fold accuracies: first the "training" folds, then the "test" (i.e., validation) folds
# of the cross-validation.
print(trainacc, testacc)

[0.94674556213017746, 0.9349112426035503, 0.94674556213017746, 0.9526627218934911, 0.9526627218934911, 0.94674556213017746, 0.9526627218934911, 0.94082840236686394, 0.94117647058823528, 0.95294117647058818] [0.89473684210526316, 0.89473684210526316, 0.89473684210526316, 0.84210526315789469, 0.89473684210526316, 0.89473684210526316, 0.78947368421052633, 0.84210526315789469, 0.94444444444444442, 0.77777777777777779]


### Next set of code is step-by-step explained version, no cross validation among training sample and more functionalities to explore model beyond accuracy. 

Extract just the testing/training data, and respective classes as separate objects

In [None]:
# `training` and `testing` were already cut down to the feature columns (column 3 onward)
# when the CSV was loaded, so they are used whole here; slicing off three more columns would
# discard real features. The class labels live in column 2 of the unsliced *_all frames,
# because the feature-only frames no longer contain that column.
subdat_training = training.iloc[:, :]
classes_training = training_all[2]

subdat_testing = testing.iloc[:, :]
classes_testing = testing_all[2]

Restructure for format needed for a decision tree in python: input X = [[0, 0], [1, 1]]

In [44]:
# Turn each feature frame into a list with one entry (a pandas Series) per row, i.e. one
# entry per word-vector.
wordvecs_training = [
    subdat_training.iloc[row_pos, :] for row_pos in range(len(subdat_training.index))
]

wordvecs_testing = [
    subdat_testing.iloc[row_pos, :] for row_pos in range(len(subdat_testing.index))
]

wordvecs_obesity = [
    subdat_obesity.iloc[row_pos, :] for row_pos in range(len(subdat_obesity.index))
]


To Try SVM

In [108]:
# Linear-kernel SVM. After grid search, for Gender it seems that between C=1 to C=5 is ideal,
# and C=3 is best. fit() returns the fitted estimator, so build and fit in one expression.
clf = svm.SVC(kernel='linear', C=3).fit(wordvecs_training, classes_training)

Fit a Single Decision Tree

In [255]:
#fit tree with chosen depth, for training set
# random_state is fixed because the tree permutes features at every split, so ties between
# equally good splits could otherwise be broken differently from run to run (234 is the seed
# used elsewhere in this notebook).
clf = tree.DecisionTreeClassifier(max_depth=3, random_state=234) #tried betwen max depth of 2-4; 3 seems best for gender but on smaller sets consider 2.
clf = clf.fit(wordvecs_training, classes_training)

Or, fit a Random Forest

In [48]:
# Random forest of 100 depth-3 trees with a fixed seed.
# max_features=None means all features are tried, rather than a sample. So the only randomness
# is the data. Default is that a sample of sqrt(n_features) is tried out for each tree, but
# this doesn't perform as well on training data, and doesn't make sense theoretically since I
# expect there is a few specific features that carry most of gender information.
# I tried betwen max depth of 2-4; 3 seems best for gender but on smaller sets consider 2.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    max_features=None,
    random_state=234,
).fit(wordvecs_training, classes_training)

Get raw predictions from fitted tree or forest and accuracy

In [53]:
#first on training set
# predict() expects a 2-D (n_samples, n_features) input, so score all training vectors in one
# call instead of passing one 1-D vector at a time.
predictions_training = clf.predict(wordvecs_training)

accuracy_score(classes_training, predictions_training)



0.88690476190476186

## To get raw predictions from fitted tree/forest and accuracy, for obesity words set, test set, and write predicted classes to csv if you want

In [42]:
#train on ALL training words with hyperparameters selected by Cross Validation
# `training` already holds only the feature columns; make into array to use sklearn and scale
# every word-vector (row) to unit L2 norm.
subdat_training = np.array(training.iloc[:, :])
subdat_training = sklearn.preprocessing.normalize(subdat_training, norm='l2')
classes_training = np.array(training_all[2]) #classes data is in column 2 of the data. make into array to use sklearn

# probability=True enables predict_proba on the fitted SVM.
clf = svm.SVC(kernel='linear', C=1, probability=True)
clf = clf.fit(subdat_training, classes_training)

# predict() / predict_proba() expect a 2-D (n_samples, n_features) array, so score the whole
# training set in one call: one label per row and one row of class probabilities per row.
predictions_training = clf.predict(subdat_training)
predictions_prob_training = clf.predict_proba(subdat_training)

accuracy_score(classes_training, predictions_training)



0.94444444444444442

In [20]:
# Append three rows to the results file: the first column of the training rows, the predicted
# class probabilities, and the predicted classes.
# newline='' is what the csv module expects for files it writes; without it, platforms that
# translate line endings get an extra blank line after every row.
with open(r'test_11-3.csv', 'a', newline='') as f:
    writer = csv.writer(f) 
    writer.writerow(np.array(training_all.iloc[:,0]))
    writer.writerow(predictions_prob_training)
    writer.writerow(predictions_training)

In [None]:
# Class-probability estimates for the unlabelled probe words in genderinteresting_words
# (defined with the other vocabularies near the top of the notebook).
tracking = []
for word in genderinteresting_words:
    # predict_proba expects a 2-D (n_samples, n_features) array, so reshape the single vector
    # into one row. The row is scaled to unit L2 norm because the probability-enabled SVM
    # fitted above was trained on L2-normalised vectors.
    word_row = np.asarray(currentmodel[word]).reshape(1, -1)
    word_row = sklearn.preprocessing.normalize(word_row, norm='l2')
    tracking.append(clf.predict_proba(word_row))

for probabilities, word in zip(tracking, genderinteresting_words):
    print(probabilities, word)

In [8]:
#clf.predict_proba(currentmodel['']) 

In [156]:
#nurture masc, nurturing fem  #seems like nouns tend to masc adjectives tend to fem??
#obesity masc, obese fem, morbidly_obese slightly masc, morbid_obesity very masc , severely obese slightly fem, underweight fem 60%, 
#fatness very masc, fat masc, fatter very masc
#thinness very fem, thin fem, but slender just slightly masc, dainty quite fem, graceful quite masc, bitch masc, pretty masc (65%), cute slightly fem (58%),  
#chiseled very masc, hunky and hunk very masc
#overweight slightly fem, seriously overweight quite masc (70%), slightly overweight slightly fem, being_overweight very masc (89%), 
#flabby quite masc as is flab
#healthful very fem, healthfulness quite masc
#illness is very masc! as is ill!! sick/sickness same, although unhealthy is fem
#love is half half exactly! lovely fem, loving masc, 

In [43]:
#NEED TO DECIDE FINAL ALGORITHM WITH CROSS VALIDATION, ABOVE, FIRST.

# Score the test words with the fitted classifier: build the L2-normalised
# feature matrix, then collect class probabilities and predicted classes
# one sample at a time.
subdat_testing=np.array(testing.iloc[: , :]) #make into array to use sklearn
subdat_testing= sklearn.preprocessing.normalize(subdat_testing, norm='l2')
predictions_testing=[]
predictions_prob_testing=[]
for i in range(0,len(subdat_testing)):
    # scikit-learn expects 2-D (n_samples, n_features) input; a bare 1-D row
    # raises ValueError from version 0.19 on.
    sample=subdat_testing[i].reshape(1, -1)
    try:
        trial3=clf.predict_proba(sample) #predict or predict_proba
        trial4=clf.predict(sample) #predict or predict_proba
    except Exception as err:
        # Best-effort: report the failing row and the actual error, then skip
        # it. Both results are computed before either is appended, so the two
        # lists can never get out of step with each other.
        # A skipped row leaves the lists shorter than classes_testing, so
        # accuracy_score will then complain about mismatched lengths.
        print(i, repr(err))
        continue
    predictions_prob_testing.append(trial3)
    predictions_testing.append(trial4)

classes_testing= np.array(testing_all[2]) #classes data is in column 2 of the data. make into array to use sklearn
#accuracy_score(classes_testing, predictions_testing)

#with open(r'ModelA_ALLYEARS_300_Gender.csv', 'a') as f:
   # writer = csv.writer(f) 
    #writer.writerow(np.array(testing_all.iloc[:,0]))
    #writer.writerow(predictions_testing) 



In [45]:
# Fraction of test words whose predicted class matches the true class.
accuracy_score(y_true=classes_testing, y_pred=predictions_testing)

0.96666666666666667

In [46]:
# Score the stereotype words with the fitted classifier: build the
# L2-normalised feature matrix, then collect class probabilities and predicted
# classes one sample at a time.
subdat_stereotypes=np.array(stereotypes.iloc[: , :]) #make into array to use sklearn
subdat_stereotypes= sklearn.preprocessing.normalize(subdat_stereotypes, norm='l2')
predictions_stereotypes=[]
predictions_prob_stereotypes=[]
for i in range(0,len(subdat_stereotypes)):
    # scikit-learn expects 2-D (n_samples, n_features) input; a bare 1-D row
    # raises ValueError from version 0.19 on.
    sample=subdat_stereotypes[i].reshape(1, -1)
    try:
        trial3=clf.predict_proba(sample) #predict or predict_proba
        trial4=clf.predict(sample) #predict or predict_proba
    except Exception as err:
        # Best-effort: report the failing row and the actual error, then skip
        # it. Both results are computed before either is appended, so the two
        # lists can never get out of step with each other.
        # A skipped row leaves the lists shorter than classes_stereotypes, so
        # accuracy_score will then complain about mismatched lengths.
        print(i, repr(err))
        continue
    predictions_prob_stereotypes.append(trial3)
    predictions_stereotypes.append(trial4)

classes_stereotypes= np.array(stereotypes_all[2]) #classes data is in column 2 of the data. make into array to use sklearn



In [47]:
# Fraction of stereotype words whose predicted class matches the true class.
accuracy_score(y_true=classes_stereotypes, y_pred=predictions_stereotypes)

0.89393939393939392

## PROJECT OBESITY WORDS

In [48]:
# Collect every row of `subdat_obesity` as one word-vector.
# NOTE(review): the training/test/stereotype matrices are L2-normalised before
# prediction; confirm that `subdat_obesity` was normalised where it is built.
wordvecs_obesity=[]
for row in range(0,len(subdat_obesity.index)):
    mini4=subdat_obesity.iloc[row,:]
    wordvecs_obesity.append(mini4)

# Project each obesity word-vector through the fitted classifier, keeping both
# the class probabilities and the predicted class.
predictions_obesity=[]
predictions_prob_obesity=[]
for i in range(0,len(wordvecs_obesity)):
    # Each row is a 1-D pandas object; scikit-learn expects a 2-D
    # (n_samples, n_features) array and raises ValueError on 1-D input from
    # version 0.19 on, so convert it into a single-row matrix.
    sample=np.asarray(wordvecs_obesity[i]).reshape(1, -1)
    trial2=clf.predict_proba(sample)
    predictions_prob_obesity.append(trial2)
    trial3=clf.predict(sample)
    predictions_obesity.append(trial3)


#with open(r'ModelA_ALLYEARS_implicitytraining.csv', 'a') as f:
    #writer = csv.writer(f)
    #writer.writerow(np.array(dat2.iloc[:,0]))
    #writer.writerow(predictions_obesity)



In [49]:
# Append, for each word set in turn (training, testing, stereotypes, obesity),
# three rows to test_11-3.csv: the first column of the set's source frame,
# the class probabilities, and the predicted classes.
result_sets = [
    (training_all, predictions_prob_training, predictions_training),
    (testing_all, predictions_prob_testing, predictions_testing),
    (stereotypes_all, predictions_prob_stereotypes, predictions_stereotypes),
    (dat2, predictions_prob_obesity, predictions_obesity),
]
with open(r'test_11-3.csv', 'a') as f:
    writer = csv.writer(f)
    for source_frame, set_probs, set_preds in result_sets:
        writer.writerow(np.array(source_frame.iloc[:,0]))
        writer.writerow(set_probs)
        writer.writerow(set_preds)

To get a visual tree: this works for a single fitted tree; otherwise you will need to adjust the code and select a single tree from the forest.

In [None]:
# Export the fitted decision tree as Graphviz "dot" source.
# export_graphviz only understands a single decision tree, so check the
# estimator type BEFORE opening the file: opening with 'w' truncates any
# existing visualtreecode.dot, and a failed export would leave it empty.
if isinstance(clf, (tree.DecisionTreeClassifier, tree.DecisionTreeRegressor)):
    with open("visualtreecode.dot", 'w') as f:
        tree.export_graphviz(clf, out_file=f) #then copy code in this file into http://webgraphviz.com/ to get a tree visualization
else:
    print("clf is a %s, not a single decision tree; skipping graphviz export" % type(clf).__name__)

To write CSV with predicted vs actual classes 

In [257]:
# Append two rows to dectree2_predicts_training_gender.csv: the predicted
# training classes, then the actual training classes.
temp = np.asarray(classes_training)

with open(r'dectree2_predicts_training_gender.csv', 'a') as f:
    writer = csv.writer(f)
    for csv_row in (predictions_training, temp):
        writer.writerow(csv_row)