In [None]:
!pip install pandas

In [None]:
!pip install nltk

In [None]:
!pip install scikit-learn

In [None]:
!pip install gensim

In [None]:
!pip install pytrec_eval

In [1]:
import pandas as pd
import nltk
from nltk.corpus import brown, reuters
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from gensim.models import Word2Vec
import pytrec_eval
import numpy as np

def load_simlex999(path='SimLex-999.txt'):
    simlex999 = pd.read_csv(path, sep='\t')
    return simlex999

simlex999 = load_simlex999()

nltk.download('brown')
nltk.download('reuters')
corpus = brown.sents() + reuters.sents()
corpus_words = [word.lower() for sentence in corpus for word in sentence if word.isalpha()]

def build_tfidf_model(corpus):
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([' '.join(sentence) for sentence in corpus])
    return tfidf_vectorizer, tfidf_matrix

tfidf_vectorizer, tfidf_matrix = build_tfidf_model(corpus)

def find_top_k_tfidf(word, k=10):
    word_index = tfidf_vectorizer.vocabulary_.get(word)
    if word_index is not None:
        cosine_similarities = linear_kernel(tfidf_matrix[word_index], tfidf_matrix).flatten()
        related_docs_indices = cosine_similarities.argsort()[:-min(k, len(cosine_similarities)) - 1:-1]
        related_docs_indices = [idx for idx in related_docs_indices if idx < len(tfidf_vectorizer.get_feature_names_out())]
        related_words = [tfidf_vectorizer.get_feature_names_out()[i] for i in related_docs_indices]
        cosine_scores = cosine_similarities[related_docs_indices]
        return related_words, cosine_scores
    else:
        return [], []

def train_word2vec_model(corpus):
    model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)
    model.train(corpus, total_examples=len(corpus), epochs=10)
    return model

word2vec_model = train_word2vec_model(corpus)

def find_top_k_word2vec(word, k=10):
    if word in word2vec_model.wv.key_to_index:
        similar_words = [word for word, similarity in word2vec_model.wv.most_similar(word, topn=k)]
        cosine_scores = [similarity for word, similarity in word2vec_model.wv.most_similar(word, topn=k)]
        return similar_words, cosine_scores
    else:
        return [], []

def evaluate_model(simlex999, model_retrieval_func):
    qrel = {}
    for _, row in simlex999.iterrows():
        qrel[row['word1']] = {row['word2']: int(row['SimLex999'] > 3.0)}
    run = {}
    for _, row in simlex999.iterrows():
        run[row['word1']] = {word: 0 for word in model_retrieval_func(row['word1'])[0]}
    evaluator = pytrec_eval.RelevanceEvaluator(qrel, {'ndcg'})
    metrics = evaluator.evaluate(run)
    ndcg_scores = [metrics[query]['ndcg'] for query in metrics]
    avg_ndcg = np.mean(ndcg_scores)
    return avg_ndcg

def calculate_avg_cosine_similarity(cosine_scores):
    return np.mean(cosine_scores)

related_words_tfidf, cosine_scores_tfidf = find_top_k_tfidf("example", k=10)
avg_cosine_similarity_tfidf = calculate_avg_cosine_similarity(cosine_scores_tfidf)
print("Average Cosine Similarity (TF-IDF):", avg_cosine_similarity_tfidf)
avg_ndcg_tfidf = evaluate_model(simlex999, lambda word: find_top_k_tfidf(word, k=10))
print("Average nDCG (TF-IDF):", avg_ndcg_tfidf)
related_words_word2vec, cosine_scores_word2vec = find_top_k_word2vec("example", k=10)
avg_cosine_similarity_word2vec = calculate_avg_cosine_similarity(cosine_scores_word2vec)
print("Average Cosine Similarity (Word2Vec):", avg_cosine_similarity_word2vec)
avg_ndcg_word2vec = evaluate_model(simlex999, lambda word: find_top_k_word2vec(word, k=10))
print("Average nDCG (Word2Vec):", avg_ndcg_word2vec)


[nltk_data] Downloading package brown to /Users/xyz/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package reuters to /Users/xyz/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


Average Cosine Similarity (TF-IDF): 0.3948066831521956
Average nDCG (TF-IDF): 0.0
Average Cosine Similarity (Word2Vec): 0.5503238439559937
Average nDCG (Word2Vec): 0.025154747035756493
