In [1]:
import gensim
import pandas as pd
import os
import re
import numpy as np
from tqdm import tqdm

directory = os.getcwd()
file_names = [file for file in os.listdir(directory) if file.endswith('.model')]

use_similarities = True

terms_df = pd.read_csv('gendered_terms.csv')

vocab = np.loadtxt('model_vocab.txt', dtype=str).tolist()

with open('negative-words.txt', 'r', encoding='ISO-8859-1') as file:
    negative_words = [line.strip() for line in file]

with open('positive-words.txt', 'r', encoding='ISO-8859-1') as file:
    positive_words = [line.strip() for line in file]

def preprocess_text(text):
    return re.sub(r'[^\w\s]', '', str(text).lower())

results = []

for file_name in file_names:
    model = gensim.models.Word2Vec.load(file_name)
    print(f'Processing {file_name}...')
    
    male_pos = 0
    male_neg = 0
    male_count = 0
    female_pos = 0
    female_neg = 0
    female_count = 0

    for term in tqdm(terms_df['Masculine'], desc='\tmasculine terms'):
        term = preprocess_text(term)
        if term not in vocab:
            continue
        male_count += 1
        temp_neg = 0
        neg_count = 0
        temp_pos = 0
        pos_count = 0
        for word in negative_words:
            if word not in vocab:
                continue
            neg_count += 1
            temp_neg += model.wv.similarity(term, word) if use_similarities else np.linalg.norm(model.wv[term] - model.wv[word])
        for word in positive_words:
            if word not in vocab:
                continue
            pos_count += 1
            temp_pos += model.wv.similarity(term, word) if use_similarities else np.linalg.norm(model.wv[term] - model.wv[word])
        male_neg += temp_neg / neg_count
        male_pos += temp_pos / pos_count

    for term in tqdm(terms_df['Feminine'], desc='\tfeminine terms'):
        term = preprocess_text(term)
        if term not in vocab:
            continue
        female_count += 1
        temp_neg = 0
        neg_count = 0
        temp_pos = 0
        pos_count = 0
        for word in negative_words:
            if word not in vocab:
                continue
            neg_count += 1
            temp_neg += model.wv.similarity(term, word) if use_similarities else np.linalg.norm(model.wv[term] - model.wv[word])
        for word in positive_words:
            if word not in vocab:
                continue
            pos_count += 1
            temp_pos += model.wv.similarity(term, word) if use_similarities else np.linalg.norm(model.wv[term] - model.wv[word])
        female_neg += temp_neg / neg_count
        female_pos += temp_pos / pos_count

    row = {
        'Model': file_name,
        'Male_Pos': male_pos / male_count,
        'Male_Neg': male_neg / male_count,
        'Female_Pos': female_pos / female_count,
        'Female_Neg': female_neg / female_count
    }
    results.append(row)

    print('\n')

results_df = pd.DataFrame(results)
print(results_df.head())
results_df.to_csv('results.csv', index=False)


Processing reddit_cbow.model...


	masculine terms: 100%|██████████| 32/32 [02:33<00:00,  4.80s/it]
	feminine terms: 100%|██████████| 32/32 [01:50<00:00,  3.44s/it]




Processing reddit_sg.model...


	masculine terms: 100%|██████████| 32/32 [02:30<00:00,  4.70s/it]
	feminine terms: 100%|██████████| 32/32 [01:54<00:00,  3.59s/it]




Processing twitter_cbow.model...


	masculine terms: 100%|██████████| 32/32 [02:32<00:00,  4.76s/it]
	feminine terms: 100%|██████████| 32/32 [02:05<00:00,  3.91s/it]




Processing twitter_sg.model...


	masculine terms: 100%|██████████| 32/32 [02:36<00:00,  4.88s/it]
	feminine terms: 100%|██████████| 32/32 [01:54<00:00,  3.57s/it]




Processing wikipedia_cbow.model...


	masculine terms: 100%|██████████| 32/32 [02:31<00:00,  4.73s/it]
	feminine terms: 100%|██████████| 32/32 [01:36<00:00,  3.01s/it]




Processing wikipedia_sg.model...


	masculine terms: 100%|██████████| 32/32 [02:40<00:00,  5.01s/it]
	feminine terms: 100%|██████████| 32/32 [02:00<00:00,  3.77s/it]



                  Model  Male_Pos  Male_Neg  Female_Pos  Female_Neg
0     reddit_cbow.model  0.099056  0.095118    0.082766    0.090360
1       reddit_sg.model  0.770178  0.757278    0.722455    0.711081
2    twitter_cbow.model  0.037624  0.029863    0.027258    0.020980
3      twitter_sg.model  0.233407  0.221800    0.219218    0.207518
4  wikipedia_cbow.model  0.031755  0.048783    0.045039    0.064625





In [2]:
with open('processed_results.txt', 'w') as f:
    for r in results:
        print(f'{r["Model"]}:', file=f) 
        temp = r['Male_Neg'] / r['Male_Pos']
        print(f'\tMale neg/pos: {temp}', file=f)
        temp = r['Female_Neg'] / r['Female_Pos']
        print(f'\tFemale neg/pos: {temp}', file=f)
        temp = r['Male_Neg'] / r['Female_Neg']
        print(f'\tNeg male/female: {temp}', file=f)
        temp = r['Male_Pos'] / r['Female_Pos']
        print(f'\tPos male/female: {temp}', file=f)

In [3]:
with open('negative-words.txt', 'r', encoding='ISO-8859-1') as file:
    negative_words = [line.strip() for line in file]

with open('positive-words.txt', 'r', encoding='ISO-8859-1') as file:
    positive_words = [line.strip() for line in file]

print(f'length of negative words: {len(negative_words)}')
print(f'length of positive words: {len(positive_words)}')

length of negative words: 4783
length of positive words: 2006
