In [6]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.metrics.pairwise import cosine_similarity

# Load the uploaded CSV files
preprocessed_data = pd.read_csv('v2_preprocessed_data.csv')
adjectives_data = pd.read_csv('adjectives_from_corpus.csv')

# Prepare the data for Word2Vec: one list of tokens per row of the 'text'
# column. Missing cells are dropped and the remaining values coerced to str
# first, because simple_preprocess only accepts text and raises a TypeError
# when handed a NaN (which pandas represents as a float).
sentences = (
    preprocessed_data['text']
    .dropna()
    .astype(str)
    .apply(simple_preprocess)
    .tolist()
)

# Train a Word2Vec model (sg=1 selects skip-gram; min_count=1 keeps every
# token that appears at least once in the corpus)
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

# Define male and female keywords
male_keywords = ["he", "him", "his", "man", "male", "men", "boy", "guy", "father", "brother", "son", "gentleman"]
female_keywords = ["she", "her", "hers", "woman", "female", "women", "girl", "lady", "mother", "sister", "daughter", "gentlewoman"]

# Ultra running vocabulary, grouped by the category each term is reported
# under. Every category maps to a tuple of terms; "strong" and "weak" are
# deliberately listed under Physical, Performance-Related and Emotional.
# NOTE(review): some entries contain a space or a hyphen; whether those can
# match a model vocabulary depends on how the corpus was tokenized.
ultra_running_terms = {
    # Bodily capacity and the conditions the body is exposed to
    "Physical": (
        "endurance", "stamina", "strength",
        "agility", "recovery", "nutrition",
        "hydration", "fatigue", "injury",
        "conditioning", "adaptation", "muscle",
        "pace", "terrain", "distance",
        "strong", "weak", "fitness",
    ),
    # Preparation, racing and results
    "Performance-Related": (
        "speed", "strategy", "training", "goal-setting",
        "efficiency", "technique", "timing", "monitoring",
        "pace management", "endurance tests", "personal best", "race day",
        "competition", "support crew", "equipment",
        "strong", "weak", "athlete", "performance", "power",
    ),
    # Feelings and mental states
    "Emotional": (
        "resilience", "motivation", "determination",
        "euphoria", "frustration", "anxiety",
        "joy", "isolation", "fear",
        "accomplishment", "grit", "community",
        "self-discovery", "perseverance", "confidence",
        "strong", "weak",
    ),
    # Appearance of scenery and of people
    "Aesthetic": (
        "scenic", "serene", "majestic", "beautiful", "tranquil",
        "inspiring", "rugged", "vibrant", "picturesque", "challenging",
        "wild", "adventurous", "ethereal", "breath-taking", "harmonious",
        "beauty", "elegance", "style", "grace", "charm",
        "design", "visual", "artistic", "pleasing", "attractive",
    ),
}

def get_average_vector(words, model):
    """Return the element-wise mean of the vectors of ``words``.

    Args:
        words: Iterable of candidate vocabulary entries.
        model: Object exposing a ``wv`` attribute that supports ``in``
            membership tests and ``[]`` lookup by word.

    Returns:
        The mean (along axis 0) of the vectors of those words that are
        present in ``model.wv``, or ``None`` when none of them are present.
        Words missing from ``model.wv`` are ignored.
    """
    embeddings = model.wv
    found = [embeddings[token] for token in words if token in embeddings]
    if not found:
        return None
    return np.mean(found, axis=0)

# Get average vectors for male and female keywords
male_vector = get_average_vector(male_keywords, word2vec_model)
female_vector = get_average_vector(female_keywords, word2vec_model)


def _term_vector(term, model):
    """Return a vector for ``term``, or ``None`` if it cannot be represented.

    An exact vocabulary match is used when available. Otherwise the term is
    tokenized with ``simple_preprocess`` -- the same tokenizer used to build
    the training sentences -- and the token vectors are averaged. This lets
    multi-word or hyphenated terms such as "pace management" or
    "self-discovery" be scored, since the vocabulary only holds single tokens.
    The fallback is used only when every token is in the vocabulary, so a
    phrase is never silently represented by just part of itself.
    """
    if term in model.wv:
        return model.wv[term]
    tokens = simple_preprocess(term)
    if tokens and all(token in model.wv for token in tokens):
        return np.mean([model.wv[token] for token in tokens], axis=0)
    return None


# Calculate and print cosine similarities for each word in the ultra running terms
for category, words in ultra_running_terms.items():
    print(f"\nCosine Similarities for Category '{category}':")

    for word in words:
        word_vector = _term_vector(word, word2vec_model)
        if word_vector is None:
            print(f"  Word: '{word}' not found in the model vocabulary.")
            continue

        # cosine_similarity expects 2-D inputs, hence the (1, n) reshape
        query = word_vector.reshape(1, -1)

        # Similarity to male gendered words (skipped if no male keyword was in the vocabulary)
        if male_vector is not None:
            male_similarity = cosine_similarity(query, male_vector.reshape(1, -1))[0][0]
            print(f"  Word: '{word}' -> Similarity to Male Keywords: {male_similarity:.4f}")

        # Similarity to female gendered words (skipped if no female keyword was in the vocabulary)
        if female_vector is not None:
            female_similarity = cosine_similarity(query, female_vector.reshape(1, -1))[0][0]
            print(f"  Word: '{word}' -> Similarity to Female Keywords: {female_similarity:.4f}")



Cosine Similarities for Category 'Physical':
  Word: 'endurance' -> Similarity to Male Keywords: 0.6835
  Word: 'endurance' -> Similarity to Female Keywords: 0.7637
  Word: 'stamina' not found in the model vocabulary.
  Word: 'strength' -> Similarity to Male Keywords: 0.7962
  Word: 'strength' -> Similarity to Female Keywords: 0.8492
  Word: 'agility' -> Similarity to Male Keywords: 0.8645
  Word: 'agility' -> Similarity to Female Keywords: 0.8798
  Word: 'recovery' -> Similarity to Male Keywords: 0.6722
  Word: 'recovery' -> Similarity to Female Keywords: 0.6915
  Word: 'nutrition' -> Similarity to Male Keywords: 0.6219
  Word: 'nutrition' -> Similarity to Female Keywords: 0.7011
  Word: 'hydration' -> Similarity to Male Keywords: 0.7360
  Word: 'hydration' -> Similarity to Female Keywords: 0.7722
  Word: 'fatigue' -> Similarity to Male Keywords: 0.7793
  Word: 'fatigue' -> Similarity to Female Keywords: 0.8212
  Word: 'injury' -> Similarity to Male Keywords: 0.7647
  Word: 'injury' 