# Word Similarity (Constrained)

## Setup (General)

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('SimLex-999.txt', delimiter='\t')

df

Unnamed: 0,word1,word2,POS,SimLex999,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.20,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93
...,...,...,...,...,...,...,...,...,...,...
994,join,acquire,V,2.85,2.86,2.93,2,0.00,0,0.99
995,send,attend,V,1.67,2.70,3.17,2,0.00,0,1.44
996,gather,attend,V,4.80,2.75,3.17,2,0.00,0,1.97
997,absorb,withdraw,V,2.97,3.11,3.04,2,0.00,0,1.75


In [None]:
df.POS.unique()

array(['A', 'N', 'V'], dtype=object)

In [None]:
# Setup for accessing VAD scores
nrc = pd.read_csv('NRC-VAD-Lexicon.txt', delimiter='\t', header=None)

nrc.columns = ['word', 'valence', 'arousal', 'dominance']
nrc = nrc.set_index('word')

nrc

Unnamed: 0_level_0,valence,arousal,dominance
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aaaaaaah,0.479,0.606,0.291
aaaah,0.520,0.636,0.282
aardvark,0.427,0.490,0.437
aback,0.385,0.407,0.288
abacus,0.510,0.276,0.485
...,...,...,...
zoo,0.760,0.520,0.580
zoological,0.667,0.458,0.492
zoology,0.568,0.347,0.509
zoom,0.490,0.520,0.462


In [None]:
nrc.at['bee', 'valence']

0.52

In [None]:
# Map every (word1, word2) pair to the SD of its SimLex-999 ratings.
# If a pair occurs more than once, the last occurrence wins.
pairwise_sd = dict(
    zip(zip(df['word1'], df['word2']), df['SD(SimLex)'])
)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% train, 20% test)
train, test = train_test_split(df.loc[:, ['word1', 'word2', 'SimLex999']], test_size=0.2, random_state=42)

# Further split the training data into training and validation sets (75% train, 25% validation)
train, val = train_test_split(train, test_size=0.25, random_state=42)


In [None]:
train

Unnamed: 0,word1,word2,SimLex999
755,meal,waist,0.98
667,belly,abdomen,8.13
704,anger,mood,4.10
608,guy,partner,3.57
605,communication,television,5.60
...,...,...,...
869,get,put,1.98
575,despair,misery,7.22
756,camera,president,0.48
26,happy,angry,1.28


In [None]:
def generate_dataset(df, similarity_functions=None):
  """Turn a frame of word pairs into a positional feature table.

  Each output row has the layout
    [word1, word2, V1, A1, D1, V2, A2, D2, *similarities, SimLex999]
  where V/A/D are the valence, arousal and dominance values read from the
  notebook-level `nrc` lexicon, and `similarities` holds
  `func(word1, word2)` for every callable in `similarity_functions`
  (in the order given).

  A pair is skipped entirely when any lexicon lookup or similarity
  function raises KeyError, so the result can have fewer rows than `df`.

  Args:
    df: DataFrame with 'word1', 'word2' and 'SimLex999' columns.
    similarity_functions: optional iterable of callables taking
      (word1, word2) and returning one value each.

  Returns:
    pd.DataFrame with default integer column labels and a fresh
    0..n-1 index.
  """
  vad_columns = ('valence', 'arousal', 'dominance')
  extra_features = similarity_functions if similarity_functions is not None else []

  records = []
  for first, second, gold in zip(df['word1'], df['word2'], df['SimLex999']):
    try:
      record = [first, second]
      record.extend(nrc.at[first, col] for col in vad_columns)
      record.extend(nrc.at[second, col] for col in vad_columns)
      record.extend(func(first, second) for func in extra_features)
    except KeyError:
      # Word missing from the lexicon (or from a similarity model): drop pair.
      continue

    record.append(gold)
    records.append(record)

  return pd.DataFrame(records)

## Using ONLY VAD

In [None]:
train_VAD = generate_dataset(train)
test_VAD = generate_dataset(test)
val_VAD = generate_dataset(val)

all_VAD = generate_dataset(df.loc[:, ['word1', 'word2', 'SimLex999']])

### SVM

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

cross_val_score(LinearRegression(), all_VAD.iloc[:, 2:-1], all_VAD.iloc[:, -1], cv=5).mean()

-0.06451575276478642

In [None]:
from sklearn.svm import SVR

cross_val_score(SVR(), all_VAD.iloc[:, 2:-1], all_VAD.iloc[:, -1], cv=5).mean()

0.08234493656036967

In [None]:
from sklearn.ensemble import RandomForestRegressor

cross_val_score(RandomForestRegressor(n_estimators=100), all_VAD.iloc[:, 2:-1], all_VAD.iloc[:, -1], cv=5).mean()

0.05008484997250038

In [None]:
svr_VAD = SVR()

svr_VAD.fit(train_VAD.iloc[:, 2:-1], train_VAD.iloc[:, -1])

In [None]:
from sklearn.metrics import mean_squared_error

y_pred = svr_VAD.predict(test_VAD.iloc[:, 2:-1])
y_test = test_VAD.iloc[:, -1].values

mean_squared_error(y_test, y_pred)

5.264556766795256

In [None]:
def report_performance(df, y_pred, y_test, sd_lookup=None):
  """Count predictions lying within 1 and within 2 annotator SDs of the gold score.

  For every pair the absolute error |y_pred - y_test| is divided by the
  standard deviation recorded for that (word1, word2) pair; the function
  reports how many ratios are <= 1 and how many are <= 2.

  Args:
    df: DataFrame whose first two columns (by position) hold word1 and
      word2, one row per prediction and in the same order as the scores.
      The index of `df` is ignored.
    y_pred: predicted scores. May be 1-D or a column vector of shape
      (n, 1); it is flattened before use.
    y_test: gold scores, same length as `y_pred`. Because only the
      absolute difference is used, swapping `y_pred` and `y_test` does
      not change the result.
    sd_lookup: optional mapping (word1, word2) -> SD. Defaults to the
      notebook-level `pairwise_sd` dictionary.

  Returns:
    dict with integer counts under the keys '<=1' and '<=2'.

  Raises:
    ValueError: if `df`, `y_pred` and `y_test` differ in length.
    KeyError: if a word pair is missing from the SD mapping.
  """
  if sd_lookup is None:
    sd_lookup = pairwise_sd

  # Flatten so (n,) and (n, 1) inputs are treated identically.
  predicted = np.asarray(y_pred, dtype=float).ravel()
  gold = np.asarray(y_test, dtype=float).ravel()

  # Positional access: avoids index alignment turning words into NaN when
  # `df` does not carry a 0..n-1 index.
  first_words = df.iloc[:, 0].to_numpy()
  second_words = df.iloc[:, 1].to_numpy()

  if not (len(predicted) == len(gold) == len(first_words)):
    raise ValueError(
        'df, y_pred and y_test must have the same length, got '
        f'{len(first_words)}, {len(predicted)} and {len(gold)}')

  pair_sd = np.array(
      [sd_lookup[(w1, w2)] for w1, w2 in zip(first_words, second_words)],
      dtype=float)

  sd_ratio = np.abs(gold - predicted) / pair_sd

  return {
      '<=1': int(np.count_nonzero(sd_ratio <= 1)),
      '<=2': int(np.count_nonzero(sd_ratio <= 2))
  }

In [None]:
report_performance(test_VAD, y_test, y_pred)

{'<=1': 78, '<=2': 134}

In [None]:
report = {}

In [None]:
report['SVM_VAD'] = report_performance(test_VAD, y_test, y_pred)

In [None]:
report_df = pd.DataFrame.from_dict(report, orient='index')
report_df

Unnamed: 0,<=1,<=2
SVM_VAD,78,134


### NN

In [None]:
X_train = train_VAD.iloc[:, :-1].values
y_train = train_VAD.iloc[:, -1].values

X_test = test_VAD.iloc[:, :-1].values
y_test = test_VAD.iloc[:, -1].values

X_val = val_VAD.iloc[:, :-1].values
y_val = val_VAD.iloc[:, -1].values

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


# Small fully connected regressor over the 6 VAD features
# (valence/arousal/dominance of word1 followed by those of word2).
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(6,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # single linear output: the predicted similarity score
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Also report Mean Squared Error as the metric

# X_*[:, 2:] skips the two leading word-string columns; the remaining
# object-dtype values are cast to float64 before being fed to the network.
model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

# evaluate() returns [loss, metric]; both are MSE here.
test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
y_pred = model.predict(X_test[:, 2:].astype(np.float64))



In [None]:
report_performance(test_VAD, y_test, y_pred)

{'<=1': 75, '<=2': 127}

## VAD + Word2Vec

### Setup

In [None]:
import nltk
nltk.download('brown')

from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


In [None]:
import gensim
from gensim.models import Word2Vec

w2v = Word2Vec(brown.sents(), min_count=1, vector_size=100, window=5)

In [None]:
def wordvec_similarity(word1, word2):
  """Similarity of two words according to the notebook's Word2Vec model.

  Thin wrapper around `w2v.wv.similarity` so it can be passed to
  `generate_dataset` as a (word1, word2) feature function.
  """
  keyed_vectors = w2v.wv
  return keyed_vectors.similarity(word1, word2)

In [None]:
train_VAD_WV = generate_dataset(train, [wordvec_similarity])
test_VAD_WV = generate_dataset(test, [wordvec_similarity])
val_VAD_WV = generate_dataset(val, [wordvec_similarity])


X_train = train_VAD_WV.iloc[:, :-1].values
y_train = train_VAD_WV.iloc[:, -1].values

X_test = test_VAD_WV.iloc[:, :-1].values
y_test = test_VAD_WV.iloc[:, -1].values

X_val = val_VAD_WV.iloc[:, :-1].values
y_val = val_VAD_WV.iloc[:, -1].values

In [None]:
svr_VAD_WV = SVR()

svr_VAD_WV.fit(X_train[:, 2:].astype(np.float64), y_train)

In [None]:
y_pred = svr_VAD_WV.predict(X_test[:, 2:].astype(np.float64))

mean_squared_error(y_test, y_pred)

# 5.53500159519488

5.331598528233896

In [None]:
report_performance(test_VAD_WV, y_test, y_pred)

{'<=1': 80, '<=2': 130}

### NN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


# Same architecture as the VAD-only network, but with 7 inputs:
# the 6 VAD values plus one similarity feature.
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(7,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # single linear output: the predicted similarity score
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Also report Mean Squared Error as the metric

# X_*[:, 2:] skips the two leading word-string columns; the remaining
# object-dtype values are cast to float64 before being fed to the network.
model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

# evaluate() returns [loss, metric]; both are MSE here.
test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
y_pred = model.predict(X_test[:, 2:].astype(np.float64))

report_performance(test_VAD_WV, y_test, y_pred)



{'<=1': 73, '<=2': 132}

## WordNet

In [None]:
import nltk
nltk.download('wordnet')

from nltk.corpus import wordnet

def word_similarity(word1, word2):
    """Best Wu-Palmer similarity over all WordNet sense pairs of two words.

    Every synset of `word1` is compared with every synset of `word2`;
    comparisons for which `wup_similarity` returns None are ignored.
    Returns 0.0 when no comparison yields a value above zero (including
    the case where either word has no synsets).
    """
    senses_a = wordnet.synsets(word1)
    senses_b = wordnet.synsets(word2)

    scores = [a.wup_similarity(b) for a in senses_a for b in senses_b]

    # Seed with 0.0 so an empty / all-None score list falls back to zero.
    return max([0.0] + [s for s in scores if s is not None])

word1 = "agreement"
word2 = "argument"
similarity_score = word_similarity(word1, word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity_score}")


[nltk_data] Downloading package wordnet to /root/nltk_data...


Similarity between 'agreement' and 'argument': 0.7777777777777778


In [None]:
train_VAD_WN = generate_dataset(train, [word_similarity])
test_VAD_WN = generate_dataset(test, [word_similarity])
val_VAD_WN = generate_dataset(val, [word_similarity])


X_train = train_VAD_WN.iloc[:, :-1].values
y_train = train_VAD_WN.iloc[:, -1].values

X_test = test_VAD_WN.iloc[:, :-1].values
y_test = test_VAD_WN.iloc[:, -1].values

X_val = val_VAD_WN.iloc[:, :-1].values
y_val = val_VAD_WN.iloc[:, -1].values

### SVM

In [None]:
svr_VAD_WN = SVR()

svr_VAD_WN.fit(X_train[:, 2:].astype(np.float64), y_train)

In [None]:
y_pred = svr_VAD_WN.predict(X_test[:, 2:].astype(np.float64))

mean_squared_error(y_test, y_pred)


4.078742705415954

In [None]:
report_performance(test_VAD_WN, y_test, y_pred)

{'<=1': 89, '<=2': 151}

### NN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


# Same architecture as the VAD-only network, but with 7 inputs:
# the 6 VAD values plus one similarity feature.
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(7,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # single linear output: the predicted similarity score
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Also report Mean Squared Error as the metric

# X_*[:, 2:] skips the two leading word-string columns; the remaining
# object-dtype values are cast to float64 before being fed to the network.
model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

# evaluate() returns [loss, metric]; both are MSE here.
test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
y_pred = model.predict(X_test[:, 2:].astype(np.float64))

report_performance(test_VAD_WN, y_test, y_pred)



{'<=1': 85, '<=2': 152}

## All Combined

In [None]:
train_ALL = generate_dataset(train, [word_similarity, wordvec_similarity])
test_ALL = generate_dataset(test, [word_similarity, wordvec_similarity])
val_ALL = generate_dataset(val, [word_similarity, wordvec_similarity])


X_train = train_ALL.iloc[:, :-1].values
y_train = train_ALL.iloc[:, -1].values

X_test = test_ALL.iloc[:, :-1].values
y_test = test_ALL.iloc[:, -1].values

X_val = val_ALL.iloc[:, :-1].values
y_val = val_ALL.iloc[:, -1].values

### SVM

In [None]:
svr_ALL = SVR()

svr_ALL.fit(X_train[:, 2:].astype(np.float64), y_train)

In [None]:
y_pred = svr_ALL.predict(X_test[:, 2:].astype(np.float64))

mean_squared_error(y_test, y_pred)

# 5.53500159519488

4.047477715892683

In [None]:
report_performance(test_ALL, y_test, y_pred)

{'<=1': 91, '<=2': 152}

### NN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


# Same architecture as the VAD-only network, but with 8 inputs:
# the 6 VAD values plus two similarity features.
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(8,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # single linear output: the predicted similarity score
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Also report Mean Squared Error as the metric

# X_*[:, 2:] skips the two leading word-string columns; the remaining
# object-dtype values are cast to float64 before being fed to the network.
model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

# evaluate() returns [loss, metric]; both are MSE here.
test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
y_pred = model.predict(X_test[:, 2:].astype(np.float64))

report_performance(test_ALL, y_test, y_pred)



{'<=1': 89, '<=2': 146}

In [None]:
w2v.wv['quick'].shape

(100,)

In [None]:
v1 = list(w2v.wv['quick'])
v2 = list(w2v.wv['rapid'])


np.dot(v1, v2) / ((np.dot(v1, v1) * np.dot(v2, v2)) ** 0.5)

0.8812830330690906

In [None]:
w2v.wv.similarity('quick', 'rapid')

0.8812831

In [None]:
v1_ = v1 + [nrc.at['quick', 'valence'], nrc.at['quick', 'arousal'],
            nrc.at['quick', 'dominance']]
v2_ = v2 + [nrc.at['rapid', 'valence'], nrc.at['rapid', 'arousal'],
            nrc.at['rapid', 'dominance']]

In [None]:
np.dot(v1_, v2_)

5.912193678029531

In [None]:
np.dot(v1_, v2_) / ((np.dot(v1_, v1_) * np.dot(v2_, v2_)) ** 0.5)

0.9083959993035119

In [None]:
def sim(w1, w2):
  """Cosine similarity of two words over embedding + VAD features.

  Each word is represented by its Word2Vec vector with its valence,
  arousal and dominance values (from `nrc`) appended at the end.
  """
  vad_dims = ('valence', 'arousal', 'dominance')

  emb_a, emb_b = list(w2v.wv[w1]), list(w2v.wv[w2])

  # Plain Python lists are kept on purpose so np.dot sees the same
  # operands as before.
  full_a = emb_a + [nrc.at[w1, dim] for dim in vad_dims]
  full_b = emb_b + [nrc.at[w2, dim] for dim in vad_dims]

  numerator = np.dot(full_a, full_b)
  norm_product = (np.dot(full_a, full_a) * np.dot(full_b, full_b)) ** 0.5
  return numerator / norm_product

In [None]:
sim('quick', 'rapid')

0.9083959993035119

In [None]:
sim('sharp', 'dull')

0.923288580141021

In [None]:
def sim_(w1, w2):
  """Cosine similarity of two words using their Word2Vec vectors only.

  Counterpart of `sim` without the appended VAD values.
  """
  emb_a = list(w2v.wv[w1])
  emb_b = list(w2v.wv[w2])

  dot_ab = np.dot(emb_a, emb_b)
  squared_norms = np.dot(emb_a, emb_a) * np.dot(emb_b, emb_b)
  return dot_ab / squared_norms ** 0.5

In [None]:
sim_('sharp', 'dull')

0.9250247800547484

In [None]:
vv1 = nrc.loc['sharp', :].values
vv2 = nrc.loc['dull', :].values

np.dot(vv1, vv2) / ((np.dot(vv1, vv1) * np.dot(vv2, vv2)) ** 0.5)

0.9625563386210116

In [None]:
svr_ALL

In [None]:
test

Unnamed: 0,word1,word2,SimLex999
453,butter,potato,1.22
793,choose,elect,7.62
209,bread,flour,3.33
309,bed,hospital,0.92
740,diet,apple,1.18
...,...,...,...
78,easy,big,1.12
29,old,fresh,0.87
277,rain,mist,5.97
261,blood,flesh,4.28


In [None]:
y_pred

array([[4.273809  ],
       [6.3013854 ],
       [4.147719  ],
       [3.9274855 ],
       [2.2194176 ],
       [3.6126375 ],
       [6.1570983 ],
       [5.418885  ],
       [4.9282107 ],
       [5.1734834 ],
       [4.0329533 ],
       [5.281097  ],
       [7.4195147 ],
       [6.4461904 ],
       [4.7384458 ],
       [3.106462  ],
       [5.8034453 ],
       [3.9371305 ],
       [5.688359  ],
       [5.5936346 ],
       [0.6927015 ],
       [5.3409877 ],
       [4.8496113 ],
       [1.9671574 ],
       [4.58169   ],
       [8.0735235 ],
       [5.8908315 ],
       [6.3010917 ],
       [2.034042  ],
       [6.0348344 ],
       [8.194392  ],
       [5.359603  ],
       [6.6861963 ],
       [6.528412  ],
       [4.749088  ],
       [6.760712  ],
       [6.1333747 ],
       [1.7053702 ],
       [6.3725123 ],
       [6.7240834 ],
       [6.019866  ],
       [3.1648006 ],
       [6.5079045 ],
       [4.227019  ],
       [5.580276  ],
       [5.1445413 ],
       [1.4812629 ],
       [0.145

In [None]:
y_pred = svr_ALL.predict(X_test[:, 2:].astype(np.float64))

mean_squared_error(y_test, y_pred)

# 5.53500159519488

4.047477715892683

In [None]:
rows = []
for row, gold, pred in zip(X_test, y_test, y_pred):
  rows.append(list(row) + [gold, pred, pairwise_sd[row[0], row[1]]])

rows

[['butter',
  'potato',
  0.65,
  0.292,
  0.202,
  0.541,
  0.235,
  0.265,
  0.75,
  0.9063538908958435,
  1.22,
  3.9073856936120146,
  1.19],
 ['choose',
  'elect',
  0.615,
  0.542,
  0.648,
  0.667,
  0.554,
  0.661,
  0.8,
  0.8207060098648071,
  7.62,
  6.614633817214295,
  1.14],
 ['bread',
  'flour',
  0.66,
  0.314,
  0.342,
  0.53,
  0.235,
  0.239,
  0.8,
  0.9174583554267883,
  3.33,
  3.846979297927501,
  1.25],
 ['bed',
  'hospital',
  0.604,
  0.173,
  0.327,
  0.323,
  0.537,
  0.598,
  0.6666666666666666,
  0.8534948825836182,
  0.92,
  3.7713050380348645,
  1.35],
 ['diet',
  'apple',
  0.429,
  0.235,
  0.398,
  0.811,
  0.3,
  0.264,
  0.4,
  0.9201089143753052,
  1.18,
  1.8392010234678744,
  1.56],
 ['argument',
  'criticism',
  0.51,
  0.464,
  0.698,
  0.115,
  0.563,
  0.505,
  0.6153846153846154,
  0.9182789921760559,
  5.08,
  3.575318939809848,
  1.07],
 ['inspect',
  'examine',
  0.479,
  0.603,
  0.793,
  0.615,
  0.623,
  0.786,
  0.5,
  0.8659030795097

In [None]:
output = pd.DataFrame(rows)

output.to_csv('output.csv')

# Unconstrained

In [None]:
!pip install sentence-transformers

from sentence_transformers import SentenceTransformer

bert = SentenceTransformer('bert-base-uncased')





In [None]:
import numpy as np

def bert_similarity(word1, word2):
  """Cosine similarity between the `bert` encodings of two words.

  Both words are encoded in a single `bert.encode` call and the cosine
  of the angle between the two resulting vectors is returned.
  """
  embeddings = bert.encode([word1, word2])
  first, second = embeddings[0], embeddings[1]

  scale = np.linalg.norm(first) * np.linalg.norm(second)
  return np.dot(first, second) / scale

In [None]:
# "Unconstrained" feature set: VAD + WordNet similarity + BERT cosine similarity.
# bert_similarity takes the place of the Brown-corpus Word2Vec feature, so this
# section actually uses the BERT encoder instead of rebuilding the
# "All Combined" datasets. The feature count stays at 8 (6 VAD + 2 similarities).
# NOTE(review): if BERT was meant to be added on top of Word2Vec rather than
# replace it, append wordvec_similarity here and widen the NN input accordingly.
bert_feature_functions = [word_similarity, bert_similarity]

train_BERT = generate_dataset(train, bert_feature_functions)
test_BERT = generate_dataset(test, bert_feature_functions)
val_BERT = generate_dataset(val, bert_feature_functions)


X_train = train_BERT.iloc[:, :-1].values
y_train = train_BERT.iloc[:, -1].values

X_test = test_BERT.iloc[:, :-1].values
y_test = test_BERT.iloc[:, -1].values

X_val = val_BERT.iloc[:, :-1].values
y_val = val_BERT.iloc[:, -1].values

In [None]:
svr_BERT = SVR()

svr_BERT.fit(X_train[:, 2:].astype(np.float64), y_train)
y_pred = svr_BERT.predict(X_test[:, 2:].astype(np.float64))

In [None]:
mean_squared_error(y_pred, y_test)

4.047477715892683

In [None]:
report_performance(test_BERT, y_pred, y_test)

{'<=1': 91, '<=2': 152}

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models


# Same architecture as the VAD-only network, but with 8 inputs:
# the 6 VAD values plus the two similarity features of the *_BERT datasets.
model = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(8,)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # single linear output: the predicted similarity score
])

model.compile(optimizer='adam',
              loss='mean_squared_error',  # Use Mean Squared Error for regression
              metrics=['mean_squared_error'])  # Also report Mean Squared Error as the metric

# X_*[:, 2:] skips the two leading word-string columns; the remaining
# object-dtype values are cast to float64 before being fed to the network.
model.fit(X_train[:, 2:].astype(np.float64), y_train,
          epochs=100, batch_size=16,
          validation_data=(X_val[:, 2:].astype(np.float64), y_val))

# evaluate() returns [loss, metric]; both are MSE here.
test_loss, test_mse = model.evaluate(X_test[:, 2:].astype(np.float64), y_test)
print('Test Mean Squared Error:', test_mse)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
y_pred = model.predict(X_test[:, 2:].astype(np.float64))

report_performance(test_BERT, y_pred, y_test)



{'<=1': 90, '<=2': 138}