In [13]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

# Load the uploaded CSV files
preprocessed_data = pd.read_csv('v2_preprocessed_data.csv')
adjectives_data = pd.read_csv('adjectives_from_corpus.csv')

# Define male and female keywords
male_keywords = ["he", "him", "his", "man", "male", "men", "boy", "guy"]
female_keywords = ["she", "her", "hers", "woman", "female", "women", "girl", "lady"]

# Prepare the data for Word2Vec: one token list per row of the 'text' column.
# read_csv parses empty cells as NaN (a float), which simple_preprocess cannot
# tokenize, so drop those rows and coerce the remainder to str first.
sentences = (
    preprocessed_data['text']
    .dropna()
    .astype(str)
    .apply(simple_preprocess)
    .tolist()
)

# Train a Word2Vec model (sg=1 selects skip-gram; min_count=1 keeps every token)
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

# Function to get the average vector of a list of words
def get_average_vector(words, model):
    """Return the element-wise mean vector of the in-vocabulary *words*.

    Args:
        words: Iterable of candidate words.
        model: Object exposing ``wv``, a mapping-like container that supports
            ``word in model.wv`` and ``model.wv[word]``.

    Returns:
        The mean of the vectors of every word found in ``model.wv``, or
        ``None`` when none of the words is present.
    """
    embeddings = model.wv
    found_vectors = [embeddings[token] for token in words if token in embeddings]

    # Guard clause: nothing to average when no word is in the vocabulary.
    if not found_vectors:
        return None
    return np.mean(found_vectors, axis=0)

# Get average vectors for male and female keywords
# (either is None when none of its keywords is in the model vocabulary)
male_vector = get_average_vector(male_keywords, word2vec_model)
female_vector = get_average_vector(female_keywords, word2vec_model)

# Prepare target adjectives from the corpus and filter out unwanted words.
# Empty CSV cells arrive as NaN (a float without .lower()), so drop them and
# coerce the rest to str before the case-insensitive comparison.
excluded_adjectives = {"french", "british", "female"}
adjectives = adjectives_data['adjective'].dropna().astype(str).tolist()
filtered_adjectives = [adj for adj in adjectives if adj.lower() not in excluded_adjectives]

# Function to find top N adjectives based on cosine similarity to gender vectors
def find_top_adjectives_by_similarity(model, adjectives, gender_vector, top_n=10):
    """Rank adjectives by cosine similarity to a reference vector.

    Args:
        model: Object exposing ``wv``, a container that supports
            ``word in model.wv`` and ``model.wv[word]``.
        adjectives: Iterable of candidate words. Words missing from
            ``model.wv`` are skipped; repeated words are scored once.
        gender_vector: 1-D reference vector, or ``None`` when no reference
            vector could be built.
        top_n: Maximum number of results to return.

    Returns:
        A list of ``(word, similarity)`` tuples sorted by descending
        similarity, truncated to ``top_n``. Empty when ``gender_vector`` is
        ``None`` or no adjective is in the vocabulary.
    """
    # Without a reference vector there is nothing to compare against; return
    # an empty ranking instead of failing on ``None.reshape``.
    if gender_vector is None:
        return []

    vectors = model.wv
    # dict.fromkeys removes repeats while keeping first-seen order, so one
    # adjective cannot occupy several of the top_n slots.
    candidates = [word for word in dict.fromkeys(adjectives) if word in vectors]
    if not candidates:
        return []

    # Score every candidate in a single call: one row per adjective against
    # the single-row reference matrix.
    adjective_matrix = np.vstack([vectors[word] for word in candidates])
    reference = np.asarray(gender_vector).reshape(1, -1)
    similarities = cosine_similarity(adjective_matrix, reference)[:, 0]

    # sorted() is stable, so ties keep their original input order.
    ranked = sorted(zip(candidates, similarities), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Rank the filtered adjectives against each gender vector (top 10 apiece)
top_male_adjectives = find_top_adjectives_by_similarity(
    word2vec_model, filtered_adjectives, male_vector, top_n=10
)
top_female_adjectives = find_top_adjectives_by_similarity(
    word2vec_model, filtered_adjectives, female_vector, top_n=10
)

# Print both rankings: a heading, then one "word: score" line per adjective.
# The second heading carries the leading newline that separates the sections.
for heading, ranking in (
    ("Top 10 Adjectives Closest to Male Keywords:", top_male_adjectives),
    ("\nTop 10 Adjectives Closest to Female Keywords:", top_female_adjectives),
):
    print(heading)
    for adjective, score in ranking:
        print(f"{adjective}: {score:.4f}")


Top 10 Adjectives Closest to Male Keywords:
entire: 0.9365
notable: 0.9301
similar: 0.9266
consistent: 0.9149
huge: 0.9141
slower: 0.9140
incredible: 0.9118
impressive: 0.9104
complete: 0.9028
tight: 0.9028

Top 10 Adjectives Closest to Female Keywords:
notable: 0.9546
complete: 0.9122
entire: 0.9091
international: 0.9051
recent: 0.9050
huge: 0.8985
main: 0.8958
competitive: 0.8948
impressive: 0.8940
similar: 0.8901
