`Word Similarity Scores:` 
Given a pair of words, predict their similarity score. The
focus is how do you convert a word to its numerical representation, on which
learning algorithms (like Regression, classification etc) can be applied. Download
the dataset from this link. You have to come up with an unsupervised / semi
supervised method to achieve the task. Assume that you don't have any
supervised training data at your disposal. The whole dataset will be used as a
test set. Choose an appropriate metric that is suitable to assess the task and
report the results. You have to come up with a solution for each of the following
conditions:

i. Constraints on Data Resources: You can only use the following resources
(any one or all) to solve the problem (DON’T USE PRE-TRAINED MODELS!) :
- any monolingual English corpus - Maximum 1 million tokens

In [11]:
pip install wikipedia-api nltk pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.1 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.0.3 pytz-2024.1 tzdata-2024.1
Note: you may need to restart the kernel to use updated packages.


## Generating Corpus

In [1]:
import wikipediaapi
import re

def scrape_wikipedia_articles(max_tokens=1000000):
    wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    extract_format=wikipediaapi.ExtractFormat.WIKI,
    user_agent='WordEmbeddingScraper/1.0 (https://github.com/yourusername)')
    pages = [
        "Natural_language_processing",
        "Machine_learning",
        "Artificial_intelligence",
        "Data_science",
        "Deep_learning",
        "Computer_vision",
        "Neural_network",
        "Reinforcement_learning",
        "Big_data",
        "Statistics"
    ]
    corpus = ""
    total_tokens = 0

    for page_title in pages:
        page = wiki_wiki.page(page_title)
        if page.exists():
            text = page.text
            text = re.sub(r'\[\d+\]', '', text)  # Remove references
            tokens = text.split()
            total_tokens += len(tokens)
            if total_tokens > max_tokens:
                break
            corpus += text + "\n"

    return corpus

corpus = scrape_wikipedia_articles()


In [2]:
import requests

def download_gutenberg_book(url):
    response = requests.get(url)
    response.raise_for_status()
    text = response.text
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    text = re.sub(r'\[.*?\]', '', text)  # Remove square bracketed text
    start = re.search(r"\*\*\* START OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*", text)
    end = re.search(r"\*\*\* END OF THIS PROJECT GUTENBERG EBOOK .* \*\*\*", text)
    if start and end:
        text = text[start.end():end.start()]
    return text

gutenberg_url = "https://www.gutenberg.org/files/1342/1342-0.txt"  # URL for "Pride and Prejudice"
gutenberg_text = download_gutenberg_book(gutenberg_url)


In [3]:
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

def create_combined_corpus(wiki_corpus, gutenberg_text, max_tokens=1000000):
    combined_text = wiki_corpus + "\n" + gutenberg_text
    tokens = word_tokenize(combined_text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    return ' '.join(tokens)

combined_corpus = create_combined_corpus(corpus, gutenberg_text)

with open('corpus.txt', 'w', encoding='utf-8') as f:
    f.write(combined_corpus)


[nltk_data] Downloading package punkt to /home/fiftyfive/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Trial 1

In [24]:
import numpy as np
from collections import defaultdict, Counter
import nltk
from nltk.tokenize import word_tokenize
import string

with open('corpus.txt', 'r', encoding='utf-8') as file:  # Loading and preprocessing the corpus
    corpus = file.read()

tokens = word_tokenize(corpus)
tokens = [word.lower() for word in tokens if word.isalpha()]
# print("len tokens ", tokens[0:10])

vocab = set(tokens)
# print("len vocab", len(vocab))

word_to_id = {word: i for i, word in enumerate(vocab)}
# print("word_to_id", word_to_id)

id_to_word = {i: word for word, i in word_to_id.items()}
# print("id_to_word", id_to_word)

vocab_size = len(vocab)
# print(vocab_size)

In [25]:
transition_matrix = np.zeros((vocab_size, vocab_size))

for i in range(len(tokens) - 1):
    current_word = tokens[i]
    next_word = tokens[i + 1]
    if current_word in word_to_id and next_word in word_to_id:
        current_id = word_to_id[current_word]
        next_id = word_to_id[next_word]
        transition_matrix[current_id, next_id] += 1

row_sums = transition_matrix.sum(axis=1)  # Convert counts to probabilities
transition_matrix = transition_matrix / row_sums[:, np.newaxis]
transition_matrix = np.nan_to_num(transition_matrix)

In [26]:
word_vectors = transition_matrix  # We will use the rows of the transition matrix as word vectors for the input words

In [27]:
import pandas as pd
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
import pandas as pd
from sklearn import preprocessing

simlex_df = pd.read_csv('SimLex-999.txt', delimiter='\t')

def cosine_similarity(v1, v2):
    return 1 - cosine(v1, v2)

def get_vector(word):
    return word_vectors[word_to_id[word]]
    

def add_word_to_corpus(word):
    global vocab_size, transition_matrix, word_to_id, id_to_word, word_vectors

    if word in word_to_id:
        return word_vectors[word_to_id[word]]
    
    word_to_id[word] = vocab_size  # Updating word_to_id and id_to_word for new word
    id_to_word[vocab_size] = word
    vocab_size += 1

    transition_matrix = np.pad(transition_matrix, ((0, 1), (0, 1)), 'constant') # Extending the transition matrix

    new_vector = np.mean(transition_matrix, axis=0)
    transition_matrix[-1, :] = new_vector
    row_sum = np.sum(transition_matrix[-1, :]) # Normalize the new row to ensure it sums to 1
    transition_matrix[-1, :] /= row_sum if row_sum != 0 else 1
    word_vectors = transition_matrix
    return word_vectors[word_to_id[word]]

simlex_df['vector1'] = simlex_df['word1'].apply(get_vector)
simlex_df['vector2'] = simlex_df['word2'].apply(get_vector)
simlex_df = simlex_df.dropna(subset=['vector1', 'vector2'])
simlex_df['predicted_similarity'] = simlex_df.apply(lambda row: cosine_similarity(row['vector1'], row['vector2']), axis=1)


x = simlex_df[["SimLex999", "predicted_similarity"]].values 
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)

correlation, _ = spearmanr(df[0], df[1])  # Calculating Spearman correlation
print(f"Spearman correlation: {correlation}")

Spearman correlation: 0.06174202418589323


In [28]:
def word_similarity(word1, word2):
    vector1 = get_vector(word1) if get_vector(word1) is not None else add_word_to_corpus(word1)
    vector2 = get_vector(word2) if get_vector(word2) is not None else add_word_to_corpus(word2)

    if vector1 is None or vector2 is None:
        return 0
    
    similarity = cosine_similarity(vector1, vector2)
    return similarity

# Inference function
word1 = "smart"
word2 = "intelligent"
similarity = word_similarity(word1, word2)

print(f"The similarity between '{word1}' and '{word2}' is {similarity}")

The similarity between 'smart' and 'intelligent' is 0.10721125348377947


## Trial 2

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input, concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

simlex_path = 'SimLex-999.txt'
simlex_data = pd.read_csv(simlex_path, delimiter='\t')
word_pairs = simlex_data[['word1', 'word2']]
similarity_scores = simlex_data['SimLex999']
similarity_scores = (similarity_scores - similarity_scores.min()) / (similarity_scores.max() - similarity_scores.min())  # Normalizing similarity scores to the range [0, 1]

corpus_path = 'corpus.txt'
with open(corpus_path, 'r') as file:
    corpus = file.readlines()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
vocab_size = len(tokenizer.word_index) + 1

def create_input_sequences(word_pairs, tokenizer):
    sequences = []
    for i, row in word_pairs.iterrows():
        seq1 = tokenizer.texts_to_sequences([row['word1']])[0]
        seq2 = tokenizer.texts_to_sequences([row['word2']])[0]
        sequences.append((seq1, seq2))
    return sequences

input_sequences = create_input_sequences(word_pairs, tokenizer)

max_seq_length = max(max(len(seq1), len(seq2)) for seq1, seq2 in input_sequences)
input_sequences = [(pad_sequences([seq1], maxlen=max_seq_length)[0],
                    pad_sequences([seq2], maxlen=max_seq_length)[0])
                   for seq1, seq2 in input_sequences]

input_sequences = np.array(input_sequences)
similarity_scores = np.array(similarity_scores)

X_train, X_test, y_train, y_test = train_test_split(input_sequences, similarity_scores, test_size=0.2, random_state=42)

X_train_1 = np.array([seq1 for seq1, seq2 in X_train])
X_train_2 = np.array([seq2 for seq1, seq2 in X_train])
X_test_1 = np.array([seq1 for seq1, seq2 in X_test])
X_test_2 = np.array([seq2 for seq1, seq2 in X_test])


2024-06-06 01:50:58.329871: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-06 01:50:58.331118: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-06 01:50:58.358472: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Build the neural network model

In [5]:
embedding_dim = 100

input_1 = Input(shape=(max_seq_length,))
input_2 = Input(shape=(max_seq_length,))

embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length)
embedded_1 = embedding_layer(input_1)
lstm_1 = Bidirectional(LSTM(64))(embedded_1)
embedded_2 = embedding_layer(input_2)
lstm_2 = Bidirectional(LSTM(64))(embedded_2)

merged = concatenate([lstm_1, lstm_2])
dense_1 = Dense(64, activation='relu')(merged)
dropout_1 = Dropout(0.5)(dense_1)
output = Dense(1, activation='sigmoid')(dropout_1)

model = Model(inputs=[input_1, input_2], outputs=output)
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

In [6]:
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    [X_train_1, X_train_2], y_train,
    validation_data=([X_test_1, X_test_2], y_test),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping]
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


In [7]:
test_loss = model.evaluate([X_test_1, X_test_2], y_test)
print(f'model evaluation test loss: {test_loss}')

from scipy.stats import spearmanr

# Predict similarity scores for test data
predicted_scores = model.predict([X_test_1, X_test_2]).flatten()

# Calculate Spearman correlation coefficient
spearman_corr, _ = spearmanr(y_test, predicted_scores)

print(f"Spearman correlation coefficient: {spearman_corr}")


model evaluation test loss: 0.0721711590886116
Spearman correlation coefficient: 0.1496124481096653


In [8]:
# Inference function
def predict_similarity(word1, word2, tokenizer, model, max_seq_length):
    seq1 = tokenizer.texts_to_sequences([word1])
    seq2 = tokenizer.texts_to_sequences([word2])
    seq1 = pad_sequences(seq1, maxlen=max_seq_length)
    seq2 = pad_sequences(seq2, maxlen=max_seq_length)
    prediction = model.predict([seq1, seq2])
    return prediction[0][0]

word1 = 'smart'
word2 = 'intelligent'
similarity_score = predict_similarity(word1, word2, tokenizer, model, max_seq_length)
print(f'Similarity score between "{word1}" and "{word2}": {similarity_score}')




Similarity score between "smart" and "intelligent": 0.45076942443847656


ii. Unconstrained : Consider that the constraints above are removed and you are allowed to use any data or model.

In [58]:
import gensim
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

with open('corpus.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

tokens = word_tokenize(corpus)
tokens = [word.lower() for word in tokens if word.isalpha()]

sentences = [tokens[i:i+100] for i in range(0, len(tokens), 100)]  # Split tokens into sentences
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

model.save("word2vec.model")

In [59]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine

simlex_df = pd.read_csv('SimLex-999.txt', delimiter='\t')

model = Word2Vec.load("word2vec.model")

def get_vector(word):
    try:
        return model.wv[word]
    except KeyError:
        return np.zeros(model.vector_size)

simlex_df['vector1'] = simlex_df['word1'].apply(get_vector)
simlex_df['vector2'] = simlex_df['word2'].apply(get_vector)
simlex_df = simlex_df.dropna(subset=['vector1', 'vector2'])

In [60]:
def cosine_similarity(v1, v2):
    return 1 - cosine(v1, v2)

simlex_df['predicted_similarity'] = simlex_df.apply(lambda row: cosine_similarity(row['vector1'], row['vector2']), axis=1)


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [61]:
import pandas as pd
from sklearn import preprocessing

x = simlex_df[["SimLex999", "predicted_similarity"]].values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df = pd.DataFrame(x_scaled)


In [62]:
from scipy.stats import spearmanr

correlation, _ = spearmanr(df[0], df[1])
print(f"Spearman correlation: {correlation}")


Spearman correlation: 0.007257090149171784


In [63]:
def word_similarity(word1, word2):
    vector1 = get_vector(word1)
    vector2 = get_vector(word2)
    
    if vector1 is None or vector2 is None:
        return None
    
    similarity = cosine_similarity(vector1, vector2)
    return similarity

word1 = "smart"
word2 = "intelligent"
similarity = word_similarity(word1, word2)

if similarity is not None:
    print(f"The similarity between '{word1}' and '{word2}' is {similarity}")
else:
    print(f"One or both words '{word1}' and '{word2}' are not in the vocabulary.")


The similarity between 'smart' and 'intelligent' is 0.9684575796127319


In Word2Vec, vectors are generated only for words present in the vocabulary or corpus.
 
To obtain a vector for a new word, it needs to be added to the vocabulary, and then its embedding can be derived by training the Word2Vec model on the corpus, ensuring all words have corresponding embeddings. 

Essentially, Word2Vec generates embeddings for words based on their occurrence in the corpus, and adding new words involves updating the model to include them in the vocabulary and retraining to compute their embeddings.

In [48]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import multiprocessing
import nltk
from nltk.corpus import gutenberg
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('gutenberg') # Load Gutenberg Corpus (1 Million tokens)

[nltk_data] Downloading package punkt to /home/fiftyfive/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/fiftyfive/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/fiftyfive/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     /home/fiftyfive/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [49]:
words = gutenberg.words()
max_tokens = 1000000
if len(words) > max_tokens:
    words = words[:max_tokens]
    
# Preprocessing
words_lower = [word.lower() for word in words] # Convert tokens to lowercase
print("tokenized_corpus: ", words_lower)
words_no_punct = [word for word in words_lower if word not in string.punctuation]
stop_words = set(stopwords.words('english'))
words_no_stopwords = [word for word in words_no_punct if word not in stop_words]
lemmatizer = WordNetLemmatizer() # Lemmatization
words_lemmatized = [lemmatizer.lemmatize(word) for word in words_no_stopwords]
tokenized_corpus = [simple_preprocess(line) for line in words_lemmatized]  # Tokenize the corpus



In [50]:
# Word2Vec parameters
vector_size = 100  # Dimensionality of the word vectors
window_size = 5    # Context window size
min_count = 1      # Minimum frequency of words to consider
workers = multiprocessing.cpu_count()  # Number of CPU cores to use during training

# Training Word2Vec model
model = Word2Vec(
    tokenized_corpus,
    vector_size=vector_size,
    window=window_size,
    min_count=min_count,
    workers=workers
)

# Save the trained model
model.save('word2vec.model')

# Inference
# Example of how to use the model to find similarity between two words
word1 = 'dog'
word2 = 'dog'
similarity_score = model.wv.similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")

Similarity between 'dog' and 'dog': 1.0


In [55]:
# Inference
word1 = 'dog'
word2 = 'barks'  # Example words, modify as needed

# Check if words are in vocabulary
word1_in_vocab = word1 in model.wv.key_to_index
word2_in_vocab = word2 in model.wv.key_to_index

# Get vectors for words
vector_word1 = model.wv[word1] if word1_in_vocab else model.wv.vectors.mean(axis=0)
vector_word2 = model.wv[word2] if word2_in_vocab else model.wv.vectors.mean(axis=0)

# Calculate similarity
similarity_score = model.wv.cosine_similarities(vector_word1, [vector_word2])[0]
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")

Similarity between 'dog' and 'barks': -0.13962948322296143
