In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

# Load the uploaded CSV files
preprocessed_data = pd.read_csv('v3_preprocessed_data.csv')
adjectives_data = pd.read_csv('adjectives_from_corpus.csv')

# Prepare the data for Word2Vec: one list of lowercase tokens per document.
# read_csv returns NaN (a float) for empty cells and simple_preprocess raises
# a TypeError on non-string input, so drop missing rows and coerce the rest
# to str before tokenizing.
sentences = (
    preprocessed_data['body']
    .dropna()
    .astype(str)
    .apply(simple_preprocess)
    .tolist()
)

# Train a skip-gram (sg=1) Word2Vec model with 100-dimensional vectors.
# min_count=1 keeps every token in the vocabulary. seed=42 together with
# workers=1 removes thread-scheduling jitter; gensim's Word2Vec documentation
# notes that reproducibility across interpreter launches additionally requires
# a fixed PYTHONHASHSEED.
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, seed=42, workers=1)

# Tabulate the model's vocabulary alongside each word's vector
def create_embedding_dataframe(model):
    """Return a DataFrame with one row per vocabulary word of ``model``.

    Args:
        model: Object exposing ``wv.index_to_key`` (iterable of words) and
            ``wv[word]`` lookup, e.g. a trained gensim ``Word2Vec``.

    Returns:
        DataFrame with a ``word`` column and an ``embedding`` column, the
        latter holding whatever ``model.wv[word]`` returns for that word.
        Rows follow the order of ``index_to_key``.
    """
    vocabulary = list(model.wv.index_to_key)

    vectors = []
    for token in vocabulary:
        vectors.append(model.wv[token])

    return pd.DataFrame({"word": vocabulary, "embedding": vectors})

# Materialize the trained model's vocabulary as a DataFrame: one row per word
# ("word" column) paired with its vector ("embedding" column).
gendered_word_embeddings = create_embedding_dataframe(word2vec_model)

# Mean embedding over every DataFrame row that matches a single word
def calculateAverageEmbedding(gendered_word_embeddings, target_word):
    """Average the embeddings stored for ``target_word``.

    Args:
        gendered_word_embeddings: DataFrame with a ``word`` column and an
            ``embedding`` column holding one vector per row.
        target_word: Word whose rows are selected by exact equality.

    Returns:
        ``(target_word, mean_vector)`` where ``mean_vector`` is the
        element-wise mean of the matching embeddings, or
        ``(target_word, None)`` when no row matches.
    """
    is_target = gendered_word_embeddings['word'] == target_word
    # .tolist() yields a fresh Python list of the per-row vectors
    vectors = gendered_word_embeddings.loc[is_target, 'embedding'].tolist()

    # Guard clause: nothing to average for a word that is not present
    if not vectors:
        return target_word, None

    # Stack the per-row vectors into a 2D array and average across rows
    stacked = np.array(vectors)
    return target_word, stacked.mean(axis=0)


# Look up the reference vectors for the male ("he") and female ("she") keywords.
# create_embedding_dataframe emits one row per vocabulary entry, so the
# "average" is taken over the rows matching that single word; the vector is
# None when the word is absent from the model's vocabulary.
_, male_vector = calculateAverageEmbedding(gendered_word_embeddings, "he")
_, female_vector = calculateAverageEmbedding(gendered_word_embeddings, "she")

# Prepare target adjectives from the corpus and filter out unwanted words.
# pandas.read_csv parses cells such as "null", "NA" or an empty field as NaN
# by default; a float NaN has no .lower(), so drop missing entries and coerce
# the remainder to str before the case-insensitive exclusion check.
excluded_adjectives = {"french", "british", "female"}
adjectives = adjectives_data['adjective'].dropna().astype(str).tolist()
filtered_adjectives = [adj for adj in adjectives if adj.lower() not in excluded_adjectives]

# Function to find top N adjectives based on cosine similarity to gender vectors
def find_top_adjectives_by_similarity(model, adjectives, gender_vector, top_n=10):
    """Rank adjectives by cosine similarity to ``gender_vector``.

    Args:
        model: Object exposing ``wv`` with ``word in wv`` membership and
            ``wv[word]`` vector lookup (e.g. a trained gensim ``Word2Vec``).
        adjectives: Iterable of candidate words. Repeated entries are scored
            once so a duplicated word cannot occupy several top-N slots.
        gender_vector: 1-D reference vector, or ``None``.
        top_n: Maximum number of results to return.

    Returns:
        List of ``(word, similarity)`` tuples sorted by descending similarity,
        at most ``top_n`` long. Similarities are Python floats. Returns ``[]``
        when ``gender_vector`` is ``None`` or no adjective is in the
        vocabulary. Ties keep the adjectives' first-seen input order.
    """
    if gender_vector is None:
        return []

    keyed_vectors = model.wv
    # dict.fromkeys de-duplicates while preserving first-seen order; words
    # missing from the model's vocabulary are skipped.
    candidates = [word for word in dict.fromkeys(adjectives) if word in keyed_vectors]
    if not candidates:
        return []

    # Score all candidates in one vectorized pass instead of one call per word.
    target = np.asarray(gender_vector, dtype=np.float64).ravel()
    matrix = np.asarray([keyed_vectors[word] for word in candidates], dtype=np.float64)
    dots = matrix @ target
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(target)

    # A zero-length vector has no direction: report similarity 0.0 for it
    # rather than dividing by zero and propagating NaN into the ranking.
    similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Sort by similarity (stable, so ties keep input order) and keep the top_n.
    ranked = sorted(
        zip(candidates, similarities.tolist()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:top_n]

# Rank the filtered adjectives against each gender vector (ten best per gender)
top_male_adjectives = find_top_adjectives_by_similarity(
    word2vec_model, filtered_adjectives, male_vector, top_n=10
)
top_female_adjectives = find_top_adjectives_by_similarity(
    word2vec_model, filtered_adjectives, female_vector, top_n=10
)

# Report both rankings: a heading line followed by "<adjective>: <score>" rows,
# with the score formatted to four decimal places
report_sections = (
    ("Top 10 Adjectives Closest to Male Keywords:", top_male_adjectives),
    ("\nTop 10 Adjectives Closest to Female Keywords:", top_female_adjectives),
)
for heading, ranked_adjectives in report_sections:
    print(heading)
    for word, similarity in ranked_adjectives:
        print(f"{word}: {similarity:.4f}")


Top 10 Adjectives Closest to Male Keywords:
wrong: 0.6695
young: 0.6612
whole: 0.6551
mean: 0.6491
real: 0.6474
fresh: 0.6434
little: 0.6383
pole: 0.6352
bad: 0.6325
smart: 0.6314

Top 10 Adjectives Closest to Female Keywords:
consistent: 0.6662
impressive: 0.6502
strong: 0.6495
whole: 0.6491
solid: 0.6478
smart: 0.6458
young: 0.6417
fresh: 0.6400
bad: 0.6364
hard: 0.6350
