In [1]:
!pip install seaborn matplotlib




In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
import numpy as np
import nltk
from nltk.corpus import stopwords
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors

nltk.download('stopwords')

# Load SimLex999 data
simlex_data = pd.read_csv("/dgxa_home/se20uari151//SimLex-999.txt", delimiter='\t')

# Load Brown Corpus data
brown_data = pd.read_csv('/dgxa_home/se20uari151/brown.csv')

print("Sample of Brown Corpus data before preprocessing:")
print(brown_data[['tokenized_text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /dgxa_home/se20uari151/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Sample of Brown Corpus data before preprocessing:
                                      tokenized_text
0  Furthermore , as an encouragement to revisioni...
1  The Unitarian clergy were an exclusive club of...
2  Ezra Stiles Gannett , an honorable representat...
3  Even so , Gannett judiciously argued , the Ass...
4  We today are not entitled to excoriate honest ...


In [3]:
# Function for text preprocessing
def preprocess_text(text):
    # Remove punctuation and convert to lowercase
    tokens = [word.lower() for word in text.split() if word.isalpha()]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    return tokens

# Apply text preprocessing to each sentence in the Brown Corpus
brown_data['preprocessed_text'] = brown_data['tokenized_text'].apply(preprocess_text)

# Display the dataset after preprocessing
print("\nAfter Preprocessing:")
print(brown_data['preprocessed_text'].head())



After Preprocessing:
0    [furthermore, encouragement, revisionist, thin...
1    [unitarian, clergy, exclusive, club, cultivate...
2    [ezra, stiles, gannett, honorable, representat...
3    [even, gannett, judiciously, argued, associati...
4    [today, entitled, excoriate, honest, men, beli...
Name: preprocessed_text, dtype: object


In [4]:
import os

glove_dir='/dgxa_home/se20uari151'

embeddings_index={}
f=open(os.path.join(glove_dir,'glove.6B.100d.txt'),encoding="utf8")
for line in f:
  values=line.split()
  word=values[0]
  coefs=np.asarray(values[1:], dtype='float32')
  embeddings_index[word]=coefs
f.close()

print('%s word  vectors'%len(embeddings_index))

# embeddings_index['good']

400000 word  vectors


In [5]:
# Create word embeddings for Brown Corpus using GloVe
embedding_dim = 100  # Assuming you are using GloVe with 100-dimensional vectors

def get_embedding(word):
    return embeddings_index.get(word, np.zeros(embedding_dim))

brown_data['word_embeddings'] = brown_data['preprocessed_text'].apply(lambda words: [get_embedding(word) for word in words])

# Display a few entries to verify the embeddings
print(brown_data[['preprocessed_text', 'word_embeddings']].head())

                                   preprocessed_text  \
0  [furthermore, encouragement, revisionist, thin...   
1  [unitarian, clergy, exclusive, club, cultivate...   
2  [ezra, stiles, gannett, honorable, representat...   
3  [even, gannett, judiciously, argued, associati...   
4  [today, entitled, excoriate, honest, men, beli...   

                                     word_embeddings  
0  [[-0.158, 0.15186, -0.098325, 0.31868, -0.0931...  
1  [[0.14219, 0.68774, -0.84726, -0.027145, -0.41...  
2  [[0.30293, 0.33103, -1.1076, -0.1375, -0.35773...  
3  [[-0.15308, 0.63194, 0.65512, -0.30706, -0.239...  
4  [[-0.19939, 0.37846, 0.52093, 0.28347, -0.1898...  


In [6]:
print(len(brown_data['preprocessed_text'][0]))

print(np.vstack(brown_data['word_embeddings'][0]).shape)


print(len(brown_data['preprocessed_text'][1]))
print(np.vstack(brown_data['word_embeddings'][1]).shape)


print(len(brown_data['preprocessed_text'][55160]))
print(np.vstack(brown_data['word_embeddings'][55160]).shape)

total_rows = len(brown_data['word_embeddings'])
print("Total number of rows in 'word_embeddings':", total_rows)

print(type(np.vstack(brown_data['word_embeddings'][0]).shape[0]))

14
(14, 100)
16
(16, 100)
9
(9, 100)
Total number of rows in 'word_embeddings': 57340
<class 'int'>


In [7]:
rrearranged_data=pd.read_csv('/dgxa_home/se20uari151/token.csv')

# Drop duplicates based on 'preprocessed_text'
rrearranged_data = rrearranged_data.drop_duplicates(subset=['preprocessed_text'])

duplicates = rrearranged_data['preprocessed_text'].duplicated()
print("Duplicates in 'rrrrrpreprocessed_text':", duplicates.any())

# Assuming your new dataset is named 'new_dataset' and has a column 'preprocessed_text'
rrearranged_data['word_embeddings'] = rrearranged_data['preprocessed_text'].apply(lambda text: [get_embedding(word) for word in text.split()] if isinstance(text, str) else [])


# Assuming 'word_embeddings' column is already populated in your DataFrame
rrearranged_data['word_embeddings'] = rrearranged_data['word_embeddings'].apply(lambda embeddings_list: np.concatenate(embeddings_list) if embeddings_list else [])

# Assuming 'rearranged_data' is your DataFrame
rrearranged_data = rrearranged_data.drop(columns=['Unnamed: 1'])

rrearranged_data.head()

Duplicates in 'rrrrrpreprocessed_text': False


Unnamed: 0,preprocessed_text,word_embeddings
0,furthermore,"[-0.158, 0.15186, -0.098325, 0.31868, -0.09317..."
1,encouragement,"[0.72701, 0.58038, -0.12419, -0.027612, 0.4040..."
2,revisionist,"[0.12806, 0.86505, 0.16681, 0.21417, -0.25038,..."
3,thinking,"[0.56354, 0.28124, 0.62225, 0.2077, -0.47417, ..."
4,manifestly,"[0.38598, -0.14472, -0.15539, 0.12179, -0.1323..."


In [8]:
print(np.vstack(rrearranged_data['word_embeddings'][19225]).shape)

print((rrearranged_data['preprocessed_text'][19225]))
print((rrearranged_data['preprocessed_text'][19225:19226]))

(100, 1)
identified
116407    NaN
Name: preprocessed_text, dtype: object


In [9]:
# Check for NaN values in the DataFrame
nan_rows = rrearranged_data[rrearranged_data.isnull().any(axis=1)]

# Drop rows with NaN values
rrearranged_data = rrearranged_data.dropna()

# Display the rows with NaN values (if any)
print("Rows with NaN values:")
print(nan_rows)

# Display the DataFrame after dropping NaN values
print("DataFrame after dropping NaN values:")
print(rrearranged_data.head())


Rows with NaN values:
       preprocessed_text word_embeddings
116407               NaN              []
DataFrame after dropping NaN values:
  preprocessed_text                                    word_embeddings
0       furthermore  [-0.158, 0.15186, -0.098325, 0.31868, -0.09317...
1     encouragement  [0.72701, 0.58038, -0.12419, -0.027612, 0.4040...
2       revisionist  [0.12806, 0.86505, 0.16681, 0.21417, -0.25038,...
3          thinking  [0.56354, 0.28124, 0.62225, 0.2077, -0.47417, ...
4        manifestly  [0.38598, -0.14472, -0.15539, 0.12179, -0.1323...


In [10]:
rrearranged_data = rrearranged_data.reset_index(drop=True)

# Print the modified DataFrame
print(rrearranged_data.head())
print((rrearranged_data.loc[:, 'preprocessed_text']))

  preprocessed_text                                    word_embeddings
0       furthermore  [-0.158, 0.15186, -0.098325, 0.31868, -0.09317...
1     encouragement  [0.72701, 0.58038, -0.12419, -0.027612, 0.4040...
2       revisionist  [0.12806, 0.86505, 0.16681, 0.21417, -0.25038,...
3          thinking  [0.56354, 0.28124, 0.62225, 0.2077, -0.47417, ...
4        manifestly  [0.38598, -0.14472, -0.15539, 0.12179, -0.1323...
0          furthermore
1        encouragement
2          revisionist
3             thinking
4           manifestly
             ...      
40094    pharmacopoeia
40095     redefinition
40096              usp
40097          mussett
40098       equipotent
Name: preprocessed_text, Length: 40099, dtype: object


In [11]:
print((rrearranged_data['preprocessed_text'][40098]))

equipotent


In [12]:
rrearranged_data.head()

Unnamed: 0,preprocessed_text,word_embeddings
0,furthermore,"[-0.158, 0.15186, -0.098325, 0.31868, -0.09317..."
1,encouragement,"[0.72701, 0.58038, -0.12419, -0.027612, 0.4040..."
2,revisionist,"[0.12806, 0.86505, 0.16681, 0.21417, -0.25038,..."
3,thinking,"[0.56354, 0.28124, 0.62225, 0.2077, -0.47417, ..."
4,manifestly,"[0.38598, -0.14472, -0.15539, 0.12179, -0.1323..."


In [13]:
from sklearn.metrics.pairwise import cosine_similarity
upto=40098
# Take the first 14 rows of word embeddings
subset_embeddings = rrearranged_data['word_embeddings'][:upto]

# Convert the subset of embeddings to a NumPy array
subset_embeddings_array = np.vstack(subset_embeddings)

# Calculate cosine similarity matrix for the subset
cosine_sim_matrix = cosine_similarity(subset_embeddings_array)

# Subset the cosine similarity matrix for the first 14 rows
cosine_sim_matrix_subset = cosine_sim_matrix[:upto, :upto]

# Optionally, you can convert the similarity matrix to a DataFrame for better visualization
cosine_sim_df_subset = pd.DataFrame(cosine_sim_matrix_subset, index=range(upto), columns=range(upto))

# Display the cosine similarity DataFrame for the subset
print(rrearranged_data['preprocessed_text'][:upto])
print(cosine_sim_df_subset)

0          furthermore
1        encouragement
2          revisionist
3             thinking
4           manifestly
             ...      
40093       thyrotoxic
40094    pharmacopoeia
40095     redefinition
40096              usp
40097          mussett
Name: preprocessed_text, Length: 40098, dtype: object
          0         1         2         3         4         5         6      \
0      1.000000  0.247114 -0.015581  0.417657  0.161974  0.386030  0.373484   
1      0.247114  1.000000  0.131388  0.348723  0.068426  0.230121  0.272027   
2     -0.015581  0.131388  1.000000  0.225604  0.139131  0.079546  0.134561   
3      0.417657  0.348723  0.225604  1.000000  0.036609  0.373480  0.540338   
4      0.161974  0.068426  0.139131  0.036609  1.000000  0.184054  0.143845   
...         ...       ...       ...       ...       ...       ...       ...   
40093  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
40094  0.020284 -0.052673  0.214876 -0.203851  0.126293  0.04

In [14]:
word1 = "flower"
word2 = "endurance"

# Check if the words exist in the preprocessed_text column
if word1 in rrearranged_data['preprocessed_text'].values and word2 in rrearranged_data['preprocessed_text'].values:
    # Get the index numbers in the preprocessed_text column
    index1 = rrearranged_data.index[rrearranged_data['preprocessed_text'] == word1].tolist()[0]
    index2 = rrearranged_data.index[rrearranged_data['preprocessed_text'] == word2].tolist()[0]

    # Get the similarity value from the cosine_sim_df_subset DataFrame
    similarity_value = cosine_sim_df_subset.iloc[index1, index2]

    # Map the similarity value to the range [0, 10]
    mapped_similarity = similarity_value

    print(f"Similarity between '{word1}' and '{word2}': {mapped_similarity}")
else:
    print(f"One or both of the words '{word1}' and '{word2}' not found in the preprocessed_text column.")

Similarity between 'flower' and 'endurance': -0.01657593563095525


In [15]:
# Linear transformation function
def transform_similarity(similarity):
    # Map from [-1, 1] to [0, 10]
    return (similarity + 1) * 5

def get_similarity_between_words(word1, word2, rearranged_data, cosine_sim_df_subset):
    if word1 in rearranged_data['preprocessed_text'].values and word2 in rearranged_data['preprocessed_text'].values:
        # Get the index numbers in the preprocessed_text column
        index1 = rearranged_data.index[rearranged_data['preprocessed_text'] == word1].tolist()[0]
        index2 = rearranged_data.index[rearranged_data['preprocessed_text'] == word2].tolist()[0]

        # Get the similarity value from the cosine_sim_df_subset DataFrame
        similarity_value = cosine_sim_df_subset.iloc[index1, index2]

        # Map the similarity value to the range [0, 10]
        mapped_similarity = transform_similarity(similarity_value)

        return mapped_similarity
    else:
        print(f"One or both of the words '{word1}' and '{word2}' not found in the preprocessed_text column.")
        return None

# Apply the function to the 'word1' and 'word2' columns of SimLex999
simlex_data['mapped_similarity'] = simlex_data.apply(lambda row: get_similarity_between_words(row['word1'], row['word2'], rrearranged_data, cosine_sim_df_subset), axis=1)

# Display the updated SimLex999 dataset with mapped similarity values
print(simlex_data[['word1', 'word2','SimLex999', 'mapped_similarity']])

# Extract the relevant columns for correlation
simlex_subset_for_correlation = simlex_data[['SimLex999', 'mapped_similarity']]

# Drop rows with NaN values (if any)
simlex_subset_for_correlation = simlex_subset_for_correlation.dropna()

# Calculate the correlation
correlation = np.corrcoef(simlex_subset_for_correlation['SimLex999'], simlex_subset_for_correlation['mapped_similarity'])[0, 1]

print(f'Correlation between SimLex999 and mapped_similarity: {correlation}')

One or both of the words 'disorganize' and 'organize' not found in the preprocessed_text column.
One or both of the words 'do' and 'happen' not found in the preprocessed_text column.
One or both of the words 'do' and 'quit' not found in the preprocessed_text column.
      word1        word2  SimLex999  mapped_similarity
0       old          new       1.58           8.216244
1     smart  intelligent       9.20           8.776366
2      hard    difficult       8.77           8.926273
3     happy     cheerful       9.55           7.729968
4      hard         easy       0.95           8.833839
..      ...          ...        ...                ...
994    join      acquire       2.85           7.323556
995    send       attend       1.67           7.504681
996  gather       attend       4.80           7.701219
997  absorb     withdraw       2.97           6.445788
998  attend       arrive       6.08           7.969202

[999 rows x 4 columns]
Correlation between SimLex999 and mapped_similari

In [16]:
print(simlex_data[['word1', 'word2', 'SimLex999', 'mapped_similarity']].sort_values(by='mapped_similarity'))


           word1      word2  SimLex999  mapped_similarity
723       flower  endurance       0.40           4.917120
744         hymn      straw       0.40           4.920208
82        modest    ashamed       2.65           4.993611
720    endurance       band       0.40           5.231201
521          pot  appliance       2.53           5.231400
..           ...        ...        ...                ...
777           go       come       2.42           9.639308
359      brother        son       3.48           9.688039
855  disorganize   organize       1.45                NaN
865           do     happen       4.23                NaN
924           do       quit       1.17                NaN

[999 rows x 4 columns]


In [17]:
# from sklearn.cluster import KMeans

# # Assuming n_clusters is the number of clusters you want
# n_clusters = 5
# kmeans = KMeans(n_clusters=n_clusters, random_state=42)

# # Fit the KMeans model on the cosine similarity matrix
# clusters = kmeans.fit_predict(cosine_sim_df_subset)

# # Add the cluster labels to the DataFrame
# rrearranged_data['cluster'] = clusters

# # Display the clusters and their representative words
# for cluster_label in range(n_clusters):
#     cluster_words = rrearranged_data.loc[rrearranged_data['cluster'] == cluster_label, 'preprocessed_text']
#     print(f"\nCluster {cluster_label + 1}: {', '.join(cluster_words)}")


In [18]:
# # Assuming you have ground truth similarity scores in simlex_data['SimLex999']
# ground_truth_scores = simlex_data['SimLex999'].values

# # Flatten the upper triangle of the cosine similarity matrix (excluding the diagonal)
# flat_cosine_sim = cosine_sim_matrix[np.triu_indices(40098, k=1)]

# # Evaluate the correlation between ground truth scores and cosine similarity
# correlation = np.corrcoef(flat_cosine_sim, ground_truth_scores)[0, 1]
# print(f'Correlation with ground truth scores: {correlation}')
