In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('SimLex-999.txt', delimiter='\t')

df

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93
...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75


In [3]:
df.POS.unique()

array(['A', 'N', 'V'], dtype=object)

## VAD as a feature

In [4]:
# No textual context data, etc. provided -- lexicon based approach to be followed
nrc = pd.read_csv('NRC-VAD-Lexicon.txt', delimiter='\t', header=None)

nrc.columns = ['word', 'valence', 'arousal', 'dominance']
nrc = nrc.set_index('word')

nrc

Unnamed: 0_level_0,valence,arousal,dominance
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaaaaaah,0.479,0.606,0.291
aaaah,0.520,0.636,0.282
aardvark,0.427,0.490,0.437
aback,0.385,0.407,0.288
abacus,0.510,0.276,0.485
...,...,...,...
zoo,0.760,0.520,0.580
zoological,0.667,0.458,0.492
zoology,0.568,0.347,0.509
zoom,0.490,0.520,0.462


In [5]:
nrc.at['bee', 'valence']

0.52

In [6]:
df2 = df.loc[:, ['word1', 'word2', 'SimLex999']]

In [7]:
def transform_row(row):
  new_row = row.copy()

  return pd.DataFrame(new_row)


In [8]:
rows = []
for i, row in df2.iterrows():
  word1 = row['word1']
  word2 = row['word2']

  score = row['SimLex999']

  try:
    rows.append([
        word1,
        word2,
        nrc.at[word1, 'valence'],
        nrc.at[word1, 'arousal'],
        nrc.at[word1, 'dominance'],
        nrc.at[word2, 'valence'],
        nrc.at[word2, 'arousal'],
        nrc.at[word2, 'dominance'],
        score
    ])
  except KeyError:
    continue

df3 = pd.DataFrame(rows)

In [9]:
df.shape

(999, 10)

In [10]:
df3.shape

(962, 9)

While not ideal, the usage of SimLex has resulted in the loss of 37 out of 999 samples (~3%); which is still acceptable.

Store the SDs of each pair-wise value; this will be used in evaluating the model.

In [11]:
pairwise_sd = {}

for _, row in df.iterrows():
  pairwise_sd[row['word1'], row['word2']] = row['SD(SimLex)']

In [12]:
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,smart,intelligent,0.906,0.607,0.923,0.917,0.541,0.868,9.20
1,hard,difficult,0.302,0.708,0.616,0.235,0.700,0.508,8.77
2,happy,cheerful,1.000,0.735,0.772,0.990,0.720,0.696,9.55
3,hard,easy,0.302,0.708,0.616,0.865,0.194,0.373,0.95
4,happy,glad,1.000,0.735,0.772,0.938,0.760,0.740,9.17
...,...,...,...,...,...,...,...,...,...
957,join,acquire,0.729,0.479,0.684,0.833,0.468,0.700,2.85
958,send,attend,0.663,0.480,0.537,0.650,0.420,0.625,1.67
959,gather,attend,0.648,0.510,0.546,0.650,0.420,0.625,4.80
960,absorb,withdraw,0.469,0.558,0.670,0.260,0.425,0.336,2.97


In [13]:
X = df3.iloc[:, :-1].values
y = df3.iloc[:, -1].values

X

array([['smart', 'intelligent', 0.906, ..., 0.917, 0.541, 0.868],
       ['hard', 'difficult', 0.302, ..., 0.235, 0.7, 0.508],
       ['happy', 'cheerful', 1.0, ..., 0.99, 0.72, 0.696],
       ...,
       ['gather', 'attend', 0.648, ..., 0.65, 0.42, 0.625],
       ['absorb', 'withdraw', 0.469, ..., 0.26, 0.425, 0.336],
       ['attend', 'arrive', 0.65, ..., 0.837, 0.406, 0.585]], dtype=object)

In [14]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets (75% train, 25% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)


Try out various models, to find the best one

In [15]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

cross_val_score(LinearRegression(), X[:, 2:], y, cv=5).mean()

-0.06451575276478638

(Ok, Bad.)

In [16]:
from sklearn.svm import SVR

cross_val_score(SVR(), X[:, 2:], y, cv=5).mean()

0.08234493656036967

In [17]:
from sklearn.ensemble import RandomForestRegressor

cross_val_score(RandomForestRegressor(n_estimators=100), X[:, 2:], y, cv=5).mean()

0.04374554178660062

In [18]:
svr = SVR()

svr.fit(X_train[:, 2:], y_train)

In [19]:
from sklearn.metrics import mean_squared_error

y_pred = svr.predict(X_test[:, 2:])
mean_squared_error(y_test, y_pred)

5.53500159519488

Standard Machine Learning Approaches seem to be performing low; may have to use Neural Nets - however, let's first check the validity using SD.



In [20]:
df4 = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

df4

Unnamed: 0,True,Pred
0,6.90,5.874684
1,4.92,4.137310
2,3.27,2.186759
3,1.60,3.831075
4,6.58,3.354484
...,...,...
188,6.75,3.590077
189,0.95,0.994105
190,9.20,5.308906
191,0.40,1.893833


In [33]:
import numpy as np

df4['word1'] = X_test[:, 0]
df4['word2'] = X_test[:, 1]

df4['abs_error'] = (df4['True'] - df4['Pred']).apply(lambda x: abs(x))

SD_ratio = []
for _, row in df4.iterrows():
  SD_ratio.append(df4['abs_error'] / pairwise_sd[(row.word1, row.word2)])

df4['SD_ratio'] = np.array(SD_ratio)

df4.to_csv('svm_output.csv')

In [34]:
df4['abs_error'].describe()

count    193.000000
mean       1.968001
std        1.292529
min        0.011500
25%        1.035356
50%        1.790184
75%        2.731624
max        6.694511
Name: abs_error, dtype: float64

In [97]:
len(df4[df4['SD_ratio'].apply(lambda x: x<=1)])

138

In [98]:
len(df4[df4['SD_ratio'].apply(lambda x: x<=2)])

182

138 out of 193 samples; accuracy of 71.5% (within 1 Standard Deviation)

By extension, 94.3% of samples occur with a maximum of 2 Standard Deviations

In [37]:
svr.score(X_test[:, 2:], y_test)

0.15577481040576968

In [38]:
df4

Unnamed: 0,True,Pred,word1,word2,abs_error,SD_ratio
0,6.90,5.874684,illness,infection,1.025316,0.801028
1,4.92,4.137310,basketball,baseball,0.782690,1.102490
2,3.27,2.186759,bee,queen,1.083241,0.854430
3,1.60,3.831075,destroy,make,2.231075,0.596114
4,6.58,3.354484,bird,turkey,3.225516,0.640822
...,...,...,...,...,...,...
188,6.75,3.590077,idea,scheme,3.159923,0.697494
189,0.95,0.994105,sad,funny,0.044105,0.801028
190,9.20,5.308906,weird,odd,3.891094,0.807335
191,0.40,1.893833,wife,straw,1.493833,0.670141


SVM model gives predictions with a MSE of 5.53500159519488; can this be improved with a Neual Net?

And will improving MSE also improve SD Ratios?



In [40]:
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,smart,intelligent,0.906,0.607,0.923,0.917,0.541,0.868,9.20
1,hard,difficult,0.302,0.708,0.616,0.235,0.700,0.508,8.77
2,happy,cheerful,1.000,0.735,0.772,0.990,0.720,0.696,9.55
3,hard,easy,0.302,0.708,0.616,0.865,0.194,0.373,0.95
4,happy,glad,1.000,0.735,0.772,0.938,0.760,0.740,9.17
...,...,...,...,...,...,...,...,...,...
957,join,acquire,0.729,0.479,0.684,0.833,0.468,0.700,2.85
958,send,attend,0.663,0.480,0.537,0.650,0.420,0.625,1.67
959,gather,attend,0.648,0.510,0.546,0.650,0.420,0.625,4.80
960,absorb,withdraw,0.469,0.558,0.670,0.260,0.425,0.336,2.97


In [41]:
y_train = y_train.ravel()

In [42]:
X_train.shape

(576, 8)

In [43]:
import numpy as np

X_train[:, 2:].astype(np.float64)

array([[0.667, 0.26 , 0.37 , 0.685, 0.33 , 0.241],
       [0.74 , 0.594, 0.667, 0.771, 0.769, 0.612],
       [0.698, 0.408, 0.649, 0.406, 0.235, 0.339],
       ...,
       [0.812, 0.69 , 0.852, 0.827, 0.612, 0.824],
       [0.729, 0.479, 0.684, 0.728, 0.49 , 0.554],
       [0.323, 0.667, 0.377, 0.792, 0.417, 0.457]])

In [46]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(6,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Use Mean Absolute Error as metric

model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

- SVM: 5.53500159519488
-Neural Net: 5.339602947235107

However, this stark difference may also be the product of randomness in Backpropagation/learning, etc.

In [51]:
nn_output = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

nn_output['word1'] = X_test[:, 0]
nn_output['word2'] = X_test[:, 1]

nn_output['abs_error'] = (nn_output['True'] - nn_output['Pred']).apply(lambda x: abs(x))


SD_ratio = []
for _, row in nn_output.iterrows():
  SD_ratio.append(row['abs_error'] / pairwise_sd[(row.word1, row.word2)])

# print(SD_ratio)

nn_output['SD_ratio'] = np.array(SD_ratio)

nn_output.to_csv('nn_output.csv')

[0.8010277475786365, 0.8416022516203089, 0.9027005416785246, 1.2971363493190735, 2.0159472532243883, 1.7118099073108572, 1.5225256862120755, 0.7207268491320208, 0.5380164451738069, 1.0579055048353445, 3.213069826456428, 2.1903119711893835, 1.05794607231304, 4.60920846178413, 0.8712071644040611, 3.1621682253328127, 1.2909848445372711, 0.5309519412982748, 5.063737042001851, 0.2318532493430614, 2.360652908878899, 2.455504064385973, 0.7673711931050405, 0.6440901388937067, 1.739154257279841, 3.698842279855356, 0.9181896229309958, 0.7007365854829265, 2.236346981202669, 1.1464195362278164, 0.16400138126797034, 2.327988668961508, 0.13282878542583443, 1.6760971326646443, 1.881365868023017, 0.519100291074077, 2.7001900295964356, 1.0518941342014436, 2.728529844119143, 2.495483435126608, 3.5378386850959633, 3.0668361132214637, 0.24748969623380282, 1.5820975111725744, 1.9241764342512535, 1.4996995016845347, 1.0827134841596864, 4.809999691029201, 7.77683956016101, 1.3698907223790524, 0.5728558989721

In [52]:
nn_output

Unnamed: 0,True,Pred,word1,word2,abs_error,SD_ratio
0,6.90,5.874684,illness,infection,1.025316,0.801028
1,4.92,4.137310,basketball,baseball,0.782690,0.841602
2,3.27,2.186759,bee,queen,1.083241,0.902701
3,1.60,3.831075,destroy,make,2.231075,1.297136
4,6.58,3.354484,bird,turkey,3.225516,2.015947
...,...,...,...,...,...,...
188,6.75,3.590077,idea,scheme,3.159923,2.149608
189,0.95,0.994105,sad,funny,0.044105,0.034457
190,9.20,5.308906,weird,odd,3.891094,3.063854
191,0.40,1.893833,wife,straw,1.493833,0.976362


In [53]:
len(nn_output[nn_output['SD_ratio'].apply(lambda x: x<=1)])

57

In [None]:
len(nn_output[nn_output['SD_ratio'].apply(lambda x: x<=2)])

126

In [55]:
len(nn_output[nn_output['SD_ratio'].apply(lambda x: x>2)])

67

In [None]:
nn_output[nn_output['abs_error'] == df4['abs_error']]

Unnamed: 0,True,Pred,word1,word2,abs_error
0,6.90,5.874684,illness,infection,1.025316
1,4.92,4.137310,basketball,baseball,0.782690
2,3.27,2.186759,bee,queen,1.083241
3,1.60,3.831075,destroy,make,2.231075
4,6.58,3.354484,bird,turkey,3.225516
...,...,...,...,...,...
188,6.75,3.590077,idea,scheme,3.159923
189,0.95,0.994105,sad,funny,0.044105
190,9.20,5.308906,weird,odd,3.891094
191,0.40,1.893833,wife,straw,1.493833


In [None]:
df4.abs_error.describe()

count    193.000000
mean       1.924569
std        1.353862
min        0.000166
25%        0.815127
50%        1.703530
75%        2.765143
max        6.765926
Name: abs_error, dtype: float64

This is... interesting. The Neural Network is significantly worse at predicting the word similarity, even though the MSE is lower. (Additionally, the MAE noted from mean of `abs_error` column, is also lower for the NN.)

Can we improve this, by using another embedding, other than the VAD scores?


## Word2Vec

In [56]:
import nltk
nltk.download('brown')

from nltk.corpus import brown


len(brown.sents())

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


57340

In [57]:
import gensim
from gensim.models import Word2Vec

w2v = Word2Vec(brown.sents(), min_count=1, vector_size=100, window=5)

In [58]:
rows = []
for i, row in df2.iterrows():
  word1 = row['word1']
  word2 = row['word2']

  score = row['SimLex999']

  try:
    rows.append([
        word1,
        word2,
        nrc.at[word1, 'valence'],
        nrc.at[word1, 'arousal'],
        nrc.at[word1, 'dominance'],
        nrc.at[word2, 'valence'],
        nrc.at[word2, 'arousal'],
        nrc.at[word2, 'dominance'],
        w2v.wv.similarity(word1, word2),
        score
    ])
  except KeyError:
    continue

df5 = pd.DataFrame(rows)

In [59]:
df5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,smart,intelligent,0.906,0.607,0.923,0.917,0.541,0.868,0.829624,9.20
1,hard,difficult,0.302,0.708,0.616,0.235,0.700,0.508,0.866216,8.77
2,happy,cheerful,1.000,0.735,0.772,0.990,0.720,0.696,0.766488,9.55
3,hard,easy,0.302,0.708,0.616,0.865,0.194,0.373,0.926008,0.95
4,happy,glad,1.000,0.735,0.772,0.938,0.760,0.740,0.832238,9.17
...,...,...,...,...,...,...,...,...,...,...
954,join,acquire,0.729,0.479,0.684,0.833,0.468,0.700,0.933578,2.85
955,send,attend,0.663,0.480,0.537,0.650,0.420,0.625,0.892023,1.67
956,gather,attend,0.648,0.510,0.546,0.650,0.420,0.625,0.928954,4.80
957,absorb,withdraw,0.469,0.558,0.670,0.260,0.425,0.336,0.853783,2.97


In [60]:
df5.shape

(959, 10)

In [61]:
X = df5.iloc[:, :-1].values
y = df5.iloc[:, -1].values

from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets (75% train, 25% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)


In [62]:
svr = SVR()

svr.fit(X_train[:, 2:].astype(np.float64), y_train)

In [63]:
y_pred = svr.predict(X_test[:, 2:].astype(np.float64))

mean_squared_error(y_test, y_pred)

# 5.53500159519488

4.724532153923312

In [64]:
import tensorflow as tf
from tensorflow.keras import layers, models


model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(7,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Use Mean Squared Error as metric

model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [65]:
# SVM output
with_sim = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

with_sim['word1'] = X_test[:, 0]
with_sim['word2'] = X_test[:, 1]


with_sim['abs_error'] = (with_sim['True'] - with_sim['Pred']).apply(lambda x: abs(x))

SD_ratio = []
for _, row in with_sim.iterrows():
  SD_ratio.append(row['abs_error'] / pairwise_sd[(row.word1, row.word2)])

with_sim['SD_ratio'] = SD_ratio


with_sim

Unnamed: 0,True,Pred,word1,word2,abs_error,SD_ratio
0,7.63,7.078555,know,comprehend,0.551445,0.396723
1,7.70,4.750847,wagon,carriage,2.949153,1.602801
2,3.27,2.912047,bee,queen,0.357953,0.298294
3,2.25,2.187917,keep,borrow,0.062083,0.048126
4,6.47,3.519168,ask,plead,2.950832,1.821501
...,...,...,...,...,...,...
187,1.45,7.143261,argument,agreement,5.693261,4.186222
188,8.73,7.138337,attention,awareness,1.591663,0.970526
189,0.95,0.780496,sad,funny,0.169504,0.132425
190,9.20,5.590172,weird,odd,3.609828,2.842384


In [67]:
len(with_sim[with_sim.SD_ratio <= 1])

81

In [68]:
len(with_sim[with_sim.SD_ratio <= 2])

141

In [69]:
len(with_sim[with_sim.SD_ratio > 2])

51

In [70]:
# NN output
y_pred = model.predict(X_test[:, 2:].astype(np.float64))

nn_with_sim = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

nn_with_sim['word1'] = X_test[:, 0]
nn_with_sim['word2'] = X_test[:, 1]


nn_with_sim['abs_error'] = (nn_with_sim['True'] - nn_with_sim['Pred']).apply(lambda x: abs(x))

SD_ratio = []
for _, row in nn_with_sim.iterrows():
  SD_ratio.append(row['abs_error'] / pairwise_sd[(row.word1, row.word2)])

nn_with_sim['SD_ratio'] = SD_ratio

print(len(with_sim[with_sim.SD_ratio <= 1]),
      len(with_sim[with_sim.SD_ratio <= 2]),
      len(with_sim[with_sim.SD_ratio > 2]))

81 141 51


Reducing MSE seems to be making worse (qualitatively) predictions? What?

In [73]:
import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet

def word_similarity(word1, word2):
    # Get synsets for each word
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)

    max_similarity = 0.0

    for synset1 in synsets1:
        for synset2 in synsets2:
            # Calculate similarity between synsets
            similarity = synset1.wup_similarity(synset2)
            if similarity is not None and similarity > max_similarity:
                max_similarity = similarity

    return max_similarity

word1 = "agreement"
word2 = "argument"
similarity_score = word_similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")


Similarity between 'agreement' and 'argument': 0.7777777777777778


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [74]:
with_sim[with_sim.SD_ratio > 2]

Unnamed: 0,True,Pred,word1,word2,abs_error,SD_ratio
6,1.9,4.510613,bar,jail,2.610613,2.330904
7,7.85,3.637121,wine,liquor,4.212879,5.401126
9,7.47,6.063049,realize,discover,1.406951,2.384663
11,9.37,5.883502,bizarre,strange,3.486498,2.26396
13,2.85,7.281345,join,acquire,4.431345,4.476106
14,1.6,5.211891,author,reader,3.611891,2.457069
15,6.53,3.609319,cabin,hut,2.920681,2.131884
21,7.07,4.9483,horse,colt,2.1217,2.525833
22,0.4,3.88427,dollar,people,3.48427,2.453711
23,2.62,5.838239,friend,teacher,3.218239,2.494759


In [77]:
word1 = 'wine'
word2 = 'liquor'
similarity_score = word_similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")

Similarity between 'wine' and 'liquor': 0.9


In [84]:
rows = []
for i, row in df2.iterrows():
  word1 = row['word1']
  word2 = row['word2']

  score = row['SimLex999']

  try:
    rows.append([
        word1,
        word2,
        nrc.at[word1, 'valence'],
        nrc.at[word1, 'arousal'],
        nrc.at[word1, 'dominance'],
        nrc.at[word2, 'valence'],
        nrc.at[word2, 'arousal'],
        nrc.at[word2, 'dominance'],
        word_similarity(word1, word2),
        score
    ])
  except KeyError:
    continue

df5 = pd.DataFrame(rows)

In [89]:
df5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,smart,intelligent,0.906,0.607,0.923,0.917,0.541,0.868,0.500000,9.20
1,hard,difficult,0.302,0.708,0.616,0.235,0.700,0.508,1.000000,8.77
2,happy,cheerful,1.000,0.735,0.772,0.990,0.720,0.696,0.500000,9.55
3,hard,easy,0.302,0.708,0.616,0.865,0.194,0.373,0.500000,0.95
4,happy,glad,1.000,0.735,0.772,0.938,0.760,0.740,1.000000,9.17
...,...,...,...,...,...,...,...,...,...,...
957,join,acquire,0.729,0.479,0.684,0.833,0.468,0.700,0.500000,2.85
958,send,attend,0.663,0.480,0.537,0.650,0.420,0.625,0.333333,1.67
959,gather,attend,0.648,0.510,0.546,0.650,0.420,0.625,0.400000,4.80
960,absorb,withdraw,0.469,0.558,0.670,0.260,0.425,0.336,0.500000,2.97


In [93]:
X = df5.iloc[:, :-1].values
y = df5.iloc[:, -1].values


from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets (75% train, 25% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [94]:
svr = SVR()

svr.fit(X_train[:, 2:].astype(np.float64), y_train)

In [95]:
y_pred = svr.predict(X_test[:, 2:].astype(np.float64))

mean_squared_error(y_test, y_pred)

# 5.53500159519488

4.255252351426032

In [96]:
# SVM output (With WordNet)

svm_with_wn = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

svm_with_wn['word1'] = X_test[:, 0]
svm_with_wn['word2'] = X_test[:, 1]
print(svm_with_wn.word1, svm_with_wn.word2)


svm_with_wn['abs_error'] = (svm_with_wn['True'] - svm_with_wn['Pred']).apply(lambda x: abs(x))

SD_ratio = []
for _, row in svm_with_wn.iterrows():
  SD_ratio.append(row['abs_error'] / pairwise_sd[(row.word1, row.word2)])

svm_with_wn['SD_ratio'] = SD_ratio

print(len(svm_with_wn[svm_with_wn.SD_ratio <= 1]),
      len(svm_with_wn[svm_with_wn.SD_ratio <= 2]),
      len(svm_with_wn[svm_with_wn.SD_ratio > 2]))

0         illness
1      basketball
2             bee
3         destroy
4            bird
          ...    
188          idea
189           sad
190         weird
191          wife
192         bring
Name: word1, Length: 193, dtype: object 0      infection
1       baseball
2          queen
3           make
4         turkey
         ...    
188       scheme
189        funny
190          odd
191        straw
192      restore
Name: word2, Length: 193, dtype: object
83 144 49


In [100]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(7,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Use Mean Absolute Error as metric

model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [101]:
# NN output
y_pred = model.predict(X_test[:, 2:].astype(np.float64))

nn_with_wn = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

nn_with_wn['word1'] = X_test[:, 0]
nn_with_wn['word2'] = X_test[:, 1]


nn_with_wn['abs_error'] = (nn_with_wn['True'] - nn_with_wn['Pred']).apply(lambda x: abs(x))

SD_ratio = []
for _, row in nn_with_wn.iterrows():
  SD_ratio.append(row['abs_error'] / pairwise_sd[(row.word1, row.word2)])

nn_with_wn['SD_ratio'] = SD_ratio

print(len(with_sim[with_sim.SD_ratio <= 1]),
      len(with_sim[with_sim.SD_ratio <= 2]),
      len(with_sim[with_sim.SD_ratio > 2]))

81 141 51


In [102]:
# VAD, Similarity, WordNet

In [104]:
rows = []
for i, row in df2.iterrows():
  word1 = row['word1']
  word2 = row['word2']

  score = row['SimLex999']

  try:
    rows.append([
        word1,
        word2,
        nrc.at[word1, 'valence'],
        nrc.at[word1, 'arousal'],
        nrc.at[word1, 'dominance'],
        nrc.at[word2, 'valence'],
        nrc.at[word2, 'arousal'],
        nrc.at[word2, 'dominance'],
        w2v.wv.similarity(word1, word2),
        word_similarity(word1, word2),
        score
    ])
  except KeyError:
    continue

df6 = pd.DataFrame(rows)

In [105]:
X = df6.iloc[:, :-1].values
y = df6.iloc[:, -1].values


from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training data into training and validation sets (75% train, 25% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [106]:
svr = SVR()

svr.fit(X_train[:, 2:].astype(np.float64), y_train)

In [107]:
y_pred = svr.predict(X_test[:, 2:].astype(np.float64))

mean_squared_error(y_test, y_pred)

# 5.53500159519488

3.6621559184695323

In [108]:
# SVM output (With WordNet)

svm_all = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

svm_all['word1'] = X_test[:, 0]
svm_all['word2'] = X_test[:, 1]
# print(svm_all.word1, svm_all.word2)


svm_all['abs_error'] = (svm_all['True'] - svm_all['Pred']).apply(lambda x: abs(x))

SD_ratio = []
for _, row in svm_all.iterrows():
  SD_ratio.append(row['abs_error'] / pairwise_sd[(row.word1, row.word2)])

svm_all['SD_ratio'] = SD_ratio

print(len(svm_all[svm_all.SD_ratio <= 1]),
      len(svm_all[svm_all.SD_ratio <= 2]),
      len(svm_all[svm_all.SD_ratio > 2]))

0           know
1          wagon
2            bee
3           keep
4            ask
         ...    
187     argument
188    attention
189          sad
190        weird
191         hole
Name: word1, Length: 192, dtype: object 0      comprehend
1        carriage
2           queen
3          borrow
4           plead
          ...    
187     agreement
188     awareness
189         funny
190           odd
191          wife
Name: word2, Length: 192, dtype: object
105 156 36


In [109]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(8,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Use Mean Absolute Error as metric

model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)




Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [110]:
# NN output
y_pred = model.predict(X_test[:, 2:].astype(np.float64))

nn_all = pd.DataFrame(list(zip(y_test, y_pred)), columns=['True', 'Pred'])

nn_all['word1'] = X_test[:, 0]
nn_all['word2'] = X_test[:, 1]


nn_all['abs_error'] = (nn_all['True'] - nn_all['Pred']).apply(lambda x: abs(x))

SD_ratio = []
for _, row in nn_all.iterrows():
  SD_ratio.append(row['abs_error'] / pairwise_sd[(row.word1, row.word2)])

nn_all['SD_ratio'] = SD_ratio

print(len(nn_all[nn_all.SD_ratio <= 1]),
      len(nn_all[nn_all.SD_ratio <= 2]),
      len(nn_all[nn_all.SD_ratio > 2]))

105 151 41
