In [70]:
pip install nltk gensim scikit-learn numpy




In [71]:
!pip install --upgrade gensim




In [72]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import string
import re
import pandas as pd

# Fetch the NLTK resources used by the preprocessing step: tokenizer models,
# the English stopword list and WordNet for lemmatization.
# 'punkt_tab' is the tokenizer data loaded by word_tokenize on newer NLTK
# releases; 'punkt' is kept so older releases keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

from gensim.models import Phrases
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Phrases


from nltk.corpus.reader import PlaintextCorpusReader

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [73]:
# Read the complete WikiText-103 dump into one string.
with open('./data/WikiText-103.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

# Heading lines are titles wrapped in "= " markers. The pattern contains three
# capturing groups, so re.split() emits three extra items per heading in
# addition to the text found between headings.
my_heading = r'(\n (= )+[^=]*[^=](= )+\n )'
doc_split = re.split(my_heading, corpus)

# Layout of doc_split: [text, g1, g2, g3, text, g1, g2, g3, text, ...].
# Starting at index 2 with step 2 alternates between group 2 (whose captured
# value is "= ") and the text that follows each heading. The group captures
# are filtered out and the text pieces are trimmed. The text in front of the
# first heading (index 0) is not selected.
docs = [piece.strip("\n ") for piece in doc_split[2::2] if piece != "= "]

# Keep only the bodies that still contain something after trimming.
documents = [body for body in (doc.strip() for doc in docs) if body != '']

print(f"Number of documents: {len(documents)}")

Number of documents: 89698


In [74]:
# Shared preprocessing resources, built once and reused for every document
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_document(text):
    """Convert one raw document string into a list of cleaned tokens.

    The text is lowercased and tokenized. Tokens that are not purely
    alphabetic, or that appear in ``stop_words``, are discarded. Each
    remaining token is lemmatized without a POS tag (the lemmatizer's
    default, i.e. treated as a noun).

    Args:
        text: The document as a single string.

    Returns:
        The lemmatized tokens, in their original order.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation, numbers, mixed tokens and stopwords.
        if not token.isalpha() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Run the preprocessing function over every entry of 'documents'
preprocessed_documents = list(map(preprocess_document, documents))

# Show the token list produced for the first document as a sanity check
print(f"First preprocessed document: {preprocessed_documents[0]}")


First preprocessed document: ['previous', 'unk', 'chronicle', 'game', 'valkyria', 'chronicle', 'iii', 'tactical', 'role', 'playing', 'game', 'player', 'take', 'control', 'military', 'unit', 'take', 'part', 'mission', 'enemy', 'force', 'story', 'told', 'comic', 'book', 'like', 'panel', 'animated', 'character', 'portrait', 'character', 'speaking', 'partially', 'voiced', 'speech', 'bubble', 'partially', 'unvoiced', 'text', 'player', 'progress', 'series', 'linear', 'mission', 'gradually', 'unlocked', 'map', 'freely', 'scanned', 'replayed', 'unlocked', 'route', 'story', 'location', 'map', 'varies', 'depending', 'individual', 'player', 'approach', 'one', 'option', 'selected', 'sealed', 'player', 'outside', 'mission', 'player', 'character', 'rest', 'camp', 'unit', 'customized', 'character', 'growth', 'occurs', 'alongside', 'main', 'story', 'mission', 'character', 'specific', 'sub', 'mission', 'relating', 'different', 'squad', 'member', 'game', 'completion', 'additional', 'episode', 'unlocked'

In [75]:
# Train the phrase models used to detect multi-word terms.
# First model: learns which adjacent token pairs should be merged.
bigram = Phrases(sentences=preprocessed_documents, min_count=7, threshold=10)

# Second model: trained on the bigram-merged stream so it can extend merged
# pairs into longer phrases. min_count is left at the library default here.
bigram_stream = bigram[preprocessed_documents]
trigram = Phrases(sentences=bigram_stream, threshold=10)

# Apply bigram and trigram models to detect multi-word terms
def make_bigrams(texts):
    """Apply the trained ``bigram`` model to every token list in ``texts``.

    Args:
        texts: Iterable of tokenized documents (each a list of tokens).

    Returns:
        A list with one bigram-transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply the ``bigram`` model and then the ``trigram`` model to each document.

    The bigram model is applied inside this function, so callers do not need
    to pre-apply it to ``texts``.

    Args:
        texts: Iterable of tokenized documents (each a list of tokens).

    Returns:
        A list with one transformed document per input document.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram[tokens]
        merged_docs.append(trigram[with_bigrams])
    return merged_docs

# Apply bigrams and trigrams
corpus_bigrams = make_bigrams(preprocessed_documents)

# corpus_bigrams has already been through the bigram model. make_trigrams()
# applies that model itself, so calling it here would run the bigram pass a
# second time over already-merged documents. Apply only the trigram model,
# which matches how it was trained (on the bigram-transformed corpus).
corpus_trigrams = [trigram[doc] for doc in corpus_bigrams]

# Convert Preprocessed Documents (Trigrams) Back to String Format
# (one space-separated string per document, as expected by the vectorizer)
corpus_preprocessed = [' '.join(doc) for doc in corpus_trigrams]


In [82]:
# Step 1: Fit a TF-IDF model on the phrase-merged corpus
vectorizer = TfidfVectorizer(
    max_df=0.95,
    min_df=1,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf_matrix = vectorizer.fit_transform(corpus_preprocessed)
vocab_size = len(vectorizer.vocabulary_)
print(f"Vocabulary size: {vocab_size}")

# Step 2: Load the test pairs. The file is read without a header row and the
# four columns are named here. No column is dropped at this point.
test_file_path = './data/CW-1-testdata.csv'  # Path to the test pairs CSV file
test_pairs = pd.read_csv(
    test_file_path,
    header=None,
    names=['PairID', 'Term1', 'Term2', 'GoldSimilarity'],
)

# Step 3: Join multi-word terms with underscores so they have the same form
# as the phrase tokens in the corpus
for term_column in ('Term1', 'Term2'):
    test_pairs[term_column] = test_pairs[term_column].str.replace(' ', '_')

# Step 4: Cosine similarity between the two terms of a test pair
def calculate_cosine_similarity(row):
    """Return the cosine similarity between a row's ``Term1`` and ``Term2``.

    Both terms are looked up with ``get_term_vector`` against the fitted
    ``vectorizer`` and ``tfidf_matrix``.

    Args:
        row: Mapping-like object with ``'Term1'`` and ``'Term2'`` entries.

    Returns:
        The cosine similarity of the two term vectors, or 0 when either term
        is not found in the vocabulary.
    """
    first_vec = get_term_vector(row['Term1'], vectorizer, tfidf_matrix)
    second_vec = get_term_vector(row['Term2'], vectorizer, tfidf_matrix)

    # A missing term cannot be compared; score the pair as 0.
    if first_vec is None or second_vec is None:
        return 0
    return cosine_similarity([first_vec], [second_vec])[0][0]

# Function to get term vector from TF-IDF matrix
def get_term_vector(term, vectorizer, tfidf_matrix):
    """Return the dense TF-IDF column for ``term``, or None if it is unknown.

    The term is first looked up exactly as given. If that fails and the term
    is a string, its lowercased form is tried as well: the corpus is
    lowercased before the vocabulary is built, so a term containing capital
    letters could otherwise never match.

    Args:
        term: Term to look up. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as not found unless they are literally a key.
        vectorizer: Object exposing a ``vocabulary_`` mapping from term to
            column index.
        tfidf_matrix: Sparse matrix whose columns are addressed by the
            indices stored in ``vocabulary_``.

    Returns:
        A 1-D array with one entry per row of ``tfidf_matrix``, or None when
        the term is not in the vocabulary.
    """
    vocabulary = vectorizer.vocabulary_
    term_index = vocabulary.get(term)
    if term_index is None and isinstance(term, str):
        # Fall back to the lowercased spelling used by the vocabulary.
        term_index = vocabulary.get(term.lower())
    if term_index is None:
        return None  # Term is not in the vocabulary
    return tfidf_matrix[:, term_index].toarray().flatten()

# Step 5: Apply the Cosine Similarity Calculation to Each Row in Test Pairs
# calculate_cosine_similarity() already does the lookup, the out-of-vocabulary
# handling (score 0) and the cosine computation, so it is reused here instead
# of repeating that logic inline.
pair_scores = [calculate_cosine_similarity(row) for _, row in test_pairs.iterrows()]

# Assign the whole column at once with an explicit float dtype. The column is
# created even when there are no rows, so the export step can always select it.
test_pairs['computed_similarity'] = pd.Series(pair_scores, index=test_pairs.index, dtype=float)

# Step 6: Save the pair IDs, both terms and the computed similarity.
# The file is written without a header row and without the index column.
output_file_path = './data/11028972_task1_results.csv'  # Path to save the updated CSV
result_columns = ['PairID', 'Term1', 'Term2', 'computed_similarity']
output_df = test_pairs[result_columns]
output_df.to_csv(output_file_path, header=False, index=False)

print(f"Updated test pairs written to {output_file_path}")

Vocabulary size: 272445
Updated test pairs written to ./data/11028972_task1_results.csv
