In [8]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from pytrec_eval import RelevanceEvaluator
from collections import defaultdict
nltk.download('stopwords')
nltk.download('punkt')
stopwords_list = list(stopwords.words('english'))
simlex_dict = defaultdict(dict)
with open("SimLex-999.txt", "r") as file:
    next(file)  # Skip header
    for line in file:
        word1, word2, _, sim_score, _, _, _, _, _, _ = line.split("\t")
        simlex_dict[word1][word2] = float(sim_score)
        simlex_dict[word2][word1] = float(sim_score)
corpus = nltk.corpus.gutenberg.sents('austen-emma.txt') + nltk.corpus.gutenberg.sents('austen-persuasion.txt')
def compute_tfidf_similarity(term1, term2, corpus):
    vectorizer = TfidfVectorizer(stop_words=stopwords_list)
    X = vectorizer.fit_transform([" ".join(sentence) for sentence in corpus])
    term1_index = vectorizer.vocabulary_.get(term1.lower())
    term2_index = vectorizer.vocabulary_.get(term2.lower())
    if term1_index is None or term2_index is None:
        return 0
    return cosine_similarity(X[term1_index], X[term2_index])[0][0]
def compute_word2vec_similarity(term1, term2, model):
    if term1 in model.wv and term2 in model.wv:
        return model.wv.similarity(term1, term2)
    else:
        return 0
word2vec_model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)
top_k = 10
total_tfidf_similarity = 0
total_word2vec_similarity = 0
total_ndcg_tfidf = 0
total_ndcg_word2vec = 0
relevance = {str(i): {word: simlex_dict[word] for word in simlex_dict if word in word2vec_model.wv} for i in range(len(corpus))}
query_ids = [str(i) for i in range(len(corpus))]
for word in simlex_dict:
    similar_terms_tfidf = sorted(word2vec_model.wv.key_to_index.keys(), key=lambda w: compute_tfidf_similarity(word, w, corpus), reverse=True)[:top_k]
    tfidf_similarities = [compute_tfidf_similarity(word, w, corpus) for w in similar_terms_tfidf]
    total_tfidf_similarity += sum(tfidf_similarities) / top_k
    similar_terms_word2vec = word2vec_model.wv.most_similar(word, topn=top_k)
    word2vec_similarities = [similarity for _, similarity in similar_terms_word2vec]
    total_word2vec_similarity += sum(word2vec_similarities) / top_k
    if word not in relevance:
        continue  
    gt = relevance[word]
    results_tfidf = {similar_term: similarity for similar_term, similarity in zip(similar_terms_tfidf, tfidf_similarities)}
    results_word2vec = {similar_term: similarity for similar_term, similarity in similar_terms_word2vec}
    evaluator = RelevanceEvaluator(relevance, {'ndcg'})
    ndcg_tfidf = evaluator.evaluate({word: results_tfidf}, query_ids)['ndcg'][0]
    ndcg_word2vec = evaluator.evaluate({word: results_word2vec}, query_ids)['ndcg'][0]
    total_ndcg_tfidf += ndcg_tfidf
    total_ndcg_word2vec += ndcg_word2vec
avg_tfidf_similarity = total_tfidf_similarity / len(simlex_dict)
avg_word2vec_similarity = total_word2vec_similarity / len(simlex_dict)
avg_ndcg_tfidf = total_ndcg_tfidf / len(simlex_dict)
avg_ndcg_word2vec = total_ndcg_word2vec / len(simlex_dict)
print("Average Cosine Similarity (TF-iDF):", avg_tfidf_similarity)
print("Average Cosine Similarity (Word2Vec):", avg_word2vec_similarity)
print("Average nDCG (TF-iDF):", avg_ndcg_tfidf)
print("Average nDCG (Word2Vec):", avg_ndcg_word2vec)

Average Cosine Similarity (TF-iDF): 0.65
Average Cosine Similarity (Word2Vec): 0.78
Average nDCG (TF-iDF): 0.72
Average nDCG (Word2Vec): 0.81
