In [59]:
pip install nltk gensim scikit-learn numpy



In [60]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

import string
import re
import pandas as pd

# Fetch the NLTK data used below: tokenizer models, the stopword list and
# WordNet. Newer NLTK releases load 'punkt_tab' (not the pickled 'punkt'
# models) for word_tokenize, so both are requested.
# NOTE(review): on older NLTK versions 'punkt_tab' may be reported as an
# unknown package by the downloader - confirm against the installed version.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

from gensim.models import Phrases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from nltk.corpus.reader import PlaintextCorpusReader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [61]:
# Read the whole corpus file into memory as one string.
with open('./data/WikiText-103.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

# Pattern for a heading line: newline + space, one or more "= " markers, the
# heading text (no "=" inside), one or more closing "= " markers, newline +
# space. The groups are non-capturing on purpose: with capturing groups
# re.split() interleaves the captured heading fragments with the text pieces,
# and the documents can then only be recovered by slicing at magic offsets.
my_heading = r'\n (?:= )+[^=]*[^=](?:= )+\n '

# Split the raw text at every heading. Element 0 is whatever precedes the
# first heading and is not treated as a document.
doc_split = re.split(my_heading, corpus)

# Trim surrounding whitespace from the text that follows each heading.
docs = [section.strip() for section in doc_split[1:]]

# Remove empty articles/documents
documents = [doc for doc in docs if doc]

print(f"Number of documents: {len(documents)}")

Number of documents: 89698


In [62]:
# Shared resources for preprocessing, built once and reused for every document:
# a WordNet lemmatizer and the English stopword list as a set (fast membership tests).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Preprocessing function
def preprocess_document(text):
    """Turn one raw document into a list of lemmatized content words.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in ``stop_words``, are dropped.
    Each surviving token is passed through ``lemmatizer.lemmatize`` with its
    default part of speech.

    Args:
        text: raw document text.

    Returns:
        list of lemmatized tokens, in their original order.
    """
    lemmas = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation, numbers, mixed tokens and stopwords.
        if not word.isalpha() or word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

# Run the preprocessing over every entry of 'documents' (the list of raw text
# documents); the result holds one token list per document, in the same order.
preprocessed_documents = list(map(preprocess_document, documents))


In [63]:
# Fit a phrase model on the tokenized documents, then fit a second one on the
# bigram-transformed corpus so that an already merged pair can be joined with
# a neighbouring token.
bigram = Phrases(preprocessed_documents, min_count=5, threshold=10)
trigram = Phrases(bigram[preprocessed_documents], threshold=10)


def make_bigrams(texts):
    """Apply the bigram model to each tokenized document.

    Args:
        texts: iterable of token lists that have not been phrase-merged yet.

    Returns:
        list with one bigram-transformed token sequence per input document.
    """
    return [bigram[doc] for doc in texts]


def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to each document.

    Args:
        texts: iterable of token lists that have not been phrase-merged yet.
            The bigram pass happens inside this function, so the output of
            make_bigrams() should not be passed in.

    Returns:
        list with one phrase-transformed token sequence per input document.
    """
    return [trigram[bigram[doc]] for doc in texts]


# Apply bigrams and trigrams
corpus_bigrams = make_bigrams(preprocessed_documents)
# corpus_bigrams has already been through the bigram model, so only the
# trigram model is applied here. Calling make_trigrams(corpus_bigrams) would
# run the bigram pass a second time on already-merged tokens, which is not how
# the trigram model was trained.
corpus_trigrams = [trigram[doc] for doc in corpus_bigrams]

In [67]:
# Step 1: Use the phrase-merged token lists (corpus_trigrams) as training sentences
corpus_tokens = corpus_trigrams

# Step 2: Train Word2Vec (Skip-gram) Model
# sg=1 selects the skip-gram objective named in this step; sg=0 would train
# CBOW instead. min_count=1 keeps every token in the vocabulary.
word2vec_model = Word2Vec(
    sentences=corpus_tokens,
    vector_size=300,
    window=8,
    sg=1,
    min_count=1,
    workers=4,
)
vocab_size = len(word2vec_model.wv)
print(f"Vocabulary size: {vocab_size}")

# Step 3: Read the Test CSV File (with 'Gold' column, which we will later drop)
# The file has no header row, so the column names are supplied here.
test_file_path = './data/CW-1-testdata.csv'  # Path to your test pairs CSV file
test_pairs = pd.read_csv(test_file_path, header=None, names=['PairID', 'Term1', 'Term2', 'GoldSimilarity'])

# Step 4: Create a function to calculate cosine similarity between two terms using Word2Vec
def calculate_word2vec_similarity(word1, word2, model):
    """Return the cosine similarity between the vectors of two terms.

    Each term is first looked up exactly as given. If that fails and the term
    is a string, a normalised form is tried: surrounding whitespace removed,
    lower-cased, inner whitespace runs replaced by "_". The corpus tokens are
    lower-cased during preprocessing, so a capitalised term can never match
    verbatim. The "_" join assumes gensim's default Phrases delimiter.

    Args:
        word1: first term.
        word2: second term.
        model: object whose ``wv`` attribute supports ``term in wv`` and
            ``wv[term]`` (e.g. a trained gensim Word2Vec model).

    Returns:
        float: cosine similarity of the two vectors, or 0.0 when either term
        is out of vocabulary or either vector has zero length.
    """
    import numpy as np  # local import: numpy is not imported at file level

    keyed_vectors = model.wv

    def _vector_for(term):
        """Return the stored vector for `term`, or None if it is unknown."""
        if term in keyed_vectors:
            return keyed_vectors[term]
        if isinstance(term, str):
            normalised = '_'.join(term.lower().split())
            if normalised and normalised in keyed_vectors:
                return keyed_vectors[normalised]
        return None

    vec1 = _vector_for(word1)
    vec2 = _vector_for(word2)
    if vec1 is None or vec2 is None:
        return 0.0

    vec1 = np.asarray(vec1, dtype=np.float64)
    vec2 = np.asarray(vec2, dtype=np.float64)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0.0:
        # Cosine similarity is undefined for a zero vector; report "no similarity".
        return 0.0
    return float(np.dot(vec1, vec2) / norm_product)

# Step 5: Score every (Term1, Term2) pair with the Word2Vec similarity function
computed_similarities = [
    calculate_word2vec_similarity(term_a, term_b, word2vec_model)
    for term_a, term_b in zip(test_pairs['Term1'], test_pairs['Term2'])
]

# Step 6: Store the scores next to the pairs they belong to (adds a column to test_pairs)
test_pairs['computed_similarity'] = computed_similarities

# Step 7: Select the columns for the results file; 'GoldSimilarity' is left out
result_columns = ['PairID', 'Term1', 'Term2', 'computed_similarity']
output_df = test_pairs[result_columns]

# Step 8: Write the results to a new CSV file, without index or header row
output_file_path = './data/11028972_task2_results.csv'
output_df.to_csv(output_file_path, header=False, index=False)

print(f"Final results written to {output_file_path}")

Vocabulary size: 280087
Final results written to ./data/11028972_task2_results.csv
