In [6]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

# Load the uploaded CSV files
preprocessed_data = pd.read_csv('v3_preprocessed_data.csv')
adjectives_data = pd.read_csv('adjectives_from_corpus.csv')

# Prepare the data for Word2Vec: one list of tokens per document.
# Empty CSV cells come back from read_csv as NaN (a float), which
# simple_preprocess cannot tokenize, so rows without a body are dropped and
# the remaining values are coerced to str before tokenizing.
sentences = (
    preprocessed_data['body']
    .dropna()
    .astype(str)
    .apply(simple_preprocess)
    .tolist()
)

# Train a Word2Vec model (sg=1 selects skip-gram; min_count=1 keeps every token).
# A fixed seed with a single worker thread is what gensim asks for to make
# runs repeatable.
# NOTE(review): gensim also mentions PYTHONHASHSEED for reproducibility across
# interpreter launches - confirm whether that matters for this analysis.
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1, seed=42, workers=1)

# Create DataFrame from Word2Vec model's embeddings
def create_embedding_dataframe(model):
    """Build a two-column table of the model's vocabulary and vectors.

    Args:
        model: Object exposing ``wv.index_to_key`` (the vocabulary, in order)
            and ``wv[word]`` (the vector for a word).

    Returns:
        A DataFrame with a ``word`` column and an ``embedding`` column, one
        row per vocabulary entry, in ``index_to_key`` order.
    """
    keyed_vectors = model.wv
    vocabulary = list(keyed_vectors.index_to_key)
    table = {
        "word": vocabulary,
        "embedding": [keyed_vectors[token] for token in vocabulary],
    }
    return pd.DataFrame(table)

# Word/embedding table for the trained model. Despite the name it holds every
# word in the model's vocabulary, not only gendered terms.
gendered_word_embeddings = create_embedding_dataframe(word2vec_model)

# Function to calculate average embedding for a list of words
def calculateAverageEmbedding(gendered_word_embeddings, target_words):
    """Average the embeddings of the target words found in the table.

    Args:
        gendered_word_embeddings: DataFrame with a ``word`` column and an
            ``embedding`` column holding one vector per row.
        target_words: Words whose embeddings should be averaged. Words with
            no matching row are ignored.

    Returns:
        A ``(target_words, mean_vector)`` tuple. ``mean_vector`` is the
        element-wise mean over every matching row's embedding, or ``None``
        when no target word matches any row.
    """
    collected = []
    for token in target_words:
        matches = gendered_word_embeddings[gendered_word_embeddings['word'] == token]
        if matches.empty:
            continue
        collected += matches['embedding'].tolist()

    if not collected:
        return target_words, None

    # Stack the vectors into a 2D array so the mean runs down the rows
    stacked = np.array(collected)
    return target_words, stacked.mean(axis=0)

# Define male and female keywords
male_keywords = ["he", "him", "his"]
female_keywords = ["she", "her", "hers"]

# Calculate average embeddings for male and female keywords.
# Either vector is None when none of its keywords has a row in the table.
_, male_vector = calculateAverageEmbedding(gendered_word_embeddings, male_keywords)
_, female_vector = calculateAverageEmbedding(gendered_word_embeddings, female_keywords)

# Prepare target adjectives from the corpus.
adjectives = adjectives_data['adjective'].tolist()
# The model's vocabulary comes from simple_preprocess, which lower-cases its
# tokens, so adjectives are normalized the same way before lookup. The old
# filter (`if adj.lower()`) only tested truthiness: it kept mixed-case words
# that could never match the vocabulary and raised on NaN cells. Non-string
# and blank cells are skipped; dict.fromkeys removes the duplicates that
# normalization can create while keeping first-seen order.
filtered_adjectives = list(dict.fromkeys(
    adj.strip().lower()
    for adj in adjectives
    if isinstance(adj, str) and adj.strip()
))

# Function to compute and save cosine similarities for all adjectives, sorted by similarity
def save_cosine_similarities(model, adjectives, gender_vector, filename):
    """Score adjectives against a gender vector and write the ranking to CSV.

    Args:
        model: Object exposing ``wv`` with membership (``word in model.wv``)
            and lookup (``model.wv[word]``) of word vectors.
        adjectives: Words to score. Words not present in ``model.wv`` are
            left out of the output.
        gender_vector: 1-D vector to compare against, or ``None``.
        filename: Destination CSV path.

    Returns:
        None. When ``gender_vector`` is ``None`` nothing is written.
        Otherwise a CSV with ``word`` and ``similarity`` columns is written,
        sorted by similarity in descending order. If no adjective is in the
        vocabulary the file contains only the header row.
    """
    if gender_vector is None:
        return  # Nothing to compare against; deliberately skip writing a file

    known_words = [word for word in adjectives if word in model.wv]
    if known_words:
        # Score every adjective in one batched call instead of paying the
        # sklearn input-validation overhead once per word.
        word_matrix = np.array([model.wv[word] for word in known_words])
        similarities = cosine_similarity(word_matrix, gender_vector.reshape(1, -1))[:, 0]
    else:
        similarities = []

    # Name the columns explicitly: a frame built from an empty list has no
    # 'similarity' column, which made sort_values raise KeyError whenever no
    # adjective was found in the vocabulary.
    df = pd.DataFrame(
        {"word": known_words, "similarity": similarities},
        columns=["word", "similarity"],
    )
    df = df.sort_values(by='similarity', ascending=False)
    df.to_csv(filename, index=False)

# Compute and save cosine similarities for male and female vectors.
# A vector is None when none of its keywords was found, and in that case
# save_cosine_similarities writes nothing; track what was actually written so
# the final message never claims a file that does not exist.
similarity_outputs = [
    (male_vector, "male_cosine_similarities_word2vec.csv"),
    (female_vector, "female_cosine_similarities_word2vec.csv"),
]
saved_files = []
for gender_vector, output_file in similarity_outputs:
    if gender_vector is None:
        print(f"Skipped '{output_file}': no gender keyword vector is available")
        continue
    save_cosine_similarities(word2vec_model, filtered_adjectives, gender_vector, output_file)
    saved_files.append(output_file)

if saved_files:
    print("Cosine similarity files saved: " + " and ".join(f"'{name}'" for name in saved_files))


Cosine similarity files saved: 'male_cosine_similarities_word2vec.csv' and 'female_cosine_similarities_word2vec.csv'
