In [23]:
# Mount Google Drive at /content/drive (Colab-only helper) so files stored
# under that path can be opened with ordinary file I/O.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import json
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Linear SVR on the "word2vec_without_numbers.json" dataset.
# Reads the per-row query/product vectors and the median_relevance label,
# builds four feature combinations, and prints MAE and MSE on a 40% hold-out
# for each combination (one line per feature set, in the order listed below).
with open("/content/drive/MyDrive/word2vec_without_numbers.json", 'r') as f:
    data = json.load(f)
without_numbers = pd.DataFrame(data)

print(without_numbers.head(1))

# np.float_ was removed in NumPy 2.0; convert the vector columns straight to
# float64 matrices (one row per sample) instead of casting row by row.
query = np.array(without_numbers['query_vector'].tolist(), dtype=np.float64)
product = np.array(without_numbers['product_vector'].tolist(), dtype=np.float64)
label = np.array(without_numbers['median_relevance'])

# Feature sets: [query | product] concatenation, difference, sum, element-wise product.
train = [np.concatenate((query, product), axis = 1), query - product, query + product, query * product]

for tr in train:
    # Same random_state and sample count -> every feature set is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(tr, label, test_size=0.4, random_state=42)
    model = SVR(kernel = 'linear').fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(MAE(y_test, prediction), MSE(y_test, prediction))


   id                                       query_vector  \
0   1  [-0.067871094, 0.122558594, -0.17480469, 0.086...   

                                      product_vector  median_relevance  \
0  [0.010758463, 0.015542602, 0.008837891, 0.0595...                 1   

   relevance_variance  
0                 0.0  
0.5949937826673509 0.7912099329496702
0.6546216357301569 0.8792645065674145
0.6415964858376851 0.8493766579626596
0.6898132629799536 1.1144750047742857


In [25]:
# Linear SVR on the "word2vec_with_numbers.json" dataset.
# Reads the per-row query/product vectors and the median_relevance label,
# builds four feature combinations, and prints MAE and MSE on a 40% hold-out
# for each combination (one line per feature set, in the order listed below).
with open("/content/drive/MyDrive/word2vec_with_numbers.json", 'r') as f:
    data = json.load(f)
with_numbers = pd.DataFrame(data)

# Preview the frame loaded in this cell (previously printed a different
# cell's frame, which also failed on a fresh kernel).
print(with_numbers.head(1))

# np.float_ was removed in NumPy 2.0; convert the vector columns straight to
# float64 matrices (one row per sample) instead of casting row by row.
query = np.array(with_numbers['query_vector'].tolist(), dtype=np.float64)
product = np.array(with_numbers['product_vector'].tolist(), dtype=np.float64)
label = np.array(with_numbers['median_relevance'])

# Feature sets: [query | product] concatenation, difference, sum, element-wise product.
train = [np.concatenate((query, product), axis = 1), query - product, query + product, query * product]

for tr in train:
    # Same random_state and sample count -> every feature set is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(tr, label, test_size=0.4, random_state=42)
    model = SVR(kernel = 'linear').fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(MAE(y_test, prediction), MSE(y_test, prediction))

   id                                       query_vector  \
0   1  [-0.067871094, 0.122558594, -0.17480469, 0.086...   

                                      product_vector  median_relevance  \
0  [0.010758463, 0.015542602, 0.008837891, 0.0595...                 1   

   relevance_variance  
0                 0.0  
0.5944924377735168 0.791422319960048
0.6531430292941272 0.8760865723367223
0.6425975890115347 0.8541168635842786
0.6897938637847952 1.115313813121766


In [26]:
# Linear SVR on the "word2vec_without_numbers_partially.json" dataset.
# Reads the per-row query/product vectors and the median_relevance label,
# builds four feature combinations, and prints MAE and MSE on a 40% hold-out
# for each combination (one line per feature set, in the order listed below).
with open("/content/drive/MyDrive/word2vec_without_numbers_partially.json", 'r') as f:
    data = json.load(f)
withoutp_numbers = pd.DataFrame(data)

# Preview the frame loaded in this cell (previously printed a different
# cell's frame, which also failed on a fresh kernel).
print(withoutp_numbers.head(1))

# np.float_ was removed in NumPy 2.0; convert the vector columns straight to
# float64 matrices (one row per sample) instead of casting row by row.
query = np.array(withoutp_numbers['query_vector'].tolist(), dtype=np.float64)
product = np.array(withoutp_numbers['product_vector'].tolist(), dtype=np.float64)
label = np.array(withoutp_numbers['median_relevance'])

# Feature sets: [query | product] concatenation, difference, sum, element-wise product.
train = [np.concatenate((query, product), axis = 1), query - product, query + product, query * product]

for tr in train:
    # Same random_state and sample count -> every feature set is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(tr, label, test_size=0.4, random_state=42)
    model = SVR(kernel = 'linear').fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(MAE(y_test, prediction), MSE(y_test, prediction))

   id                                       query_vector  \
0   1  [-0.067871094, 0.122558594, -0.17480469, 0.086...   

                                      product_vector  median_relevance  \
0  [0.010758463, 0.015542602, 0.008837891, 0.0595...                 1   

   relevance_variance  
0                 0.0  
0.5941600901655384 0.7918457137403566
0.6542307926782174 0.8776017033724947
0.6416758305044253 0.852386901992414
0.6898051616185545 1.1146957934685808


In [30]:
# Linear SVC on the "word2vec_without_numbers.json" dataset.
# Reads the per-row query/product vectors and the median_relevance label,
# builds four feature combinations, and for each one prints MAE and MSE of the
# predicted labels followed by the accuracy on a 40% hold-out.
with open("/content/drive/MyDrive/word2vec_without_numbers.json", 'r') as f:
    data = json.load(f)
without_numbers = pd.DataFrame(data)

print(without_numbers.head(1))

# np.float_ was removed in NumPy 2.0; convert the vector columns straight to
# float64 matrices (one row per sample) instead of casting row by row.
query = np.array(without_numbers['query_vector'].tolist(), dtype=np.float64)
product = np.array(without_numbers['product_vector'].tolist(), dtype=np.float64)
label = np.array(without_numbers['median_relevance'])

# Feature sets: [query | product] concatenation, difference, sum, element-wise product.
train = [np.concatenate((query, product), axis = 1), query - product, query + product, query * product]

for tr in train:
    # Same random_state and sample count -> every feature set is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(tr, label, test_size=0.4, random_state=42)
    model = SVC(kernel = 'linear').fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(MAE(y_test, prediction), MSE(y_test, prediction))
    conf = confusion_matrix(y_test, prediction)
    # Diagonal of the confusion matrix counts correct predictions -> accuracy.
    print(np.sum(np.diag(conf))/len(prediction))


   id                                       query_vector  \
0   1  [-0.067871094, 0.122558594, -0.17480469, 0.086...   

                                      product_vector  median_relevance  \
0  [0.010758463, 0.015542602, 0.008837891, 0.0595...                 1   

   relevance_variance  
0                 0.0  
0.5445374015748031 1.0268208661417322
0.6532972440944882
0.5957185039370079 1.1739665354330708
0.6387795275590551
0.5999015748031497 1.1683070866141732
0.6313976377952756
0.6865157480314961 1.437007874015748
0.6109744094488189


In [31]:
# Linear SVC on the "word2vec_with_numbers.json" dataset.
# Reads the per-row query/product vectors and the median_relevance label,
# builds four feature combinations, and for each one prints MAE and MSE of the
# predicted labels followed by the accuracy on a 40% hold-out.
with open("/content/drive/MyDrive/word2vec_with_numbers.json", 'r') as f:
    data = json.load(f)
with_numbers = pd.DataFrame(data)

# Preview the frame loaded in this cell (previously printed a different
# cell's frame, which also failed on a fresh kernel).
print(with_numbers.head(1))

# np.float_ was removed in NumPy 2.0; convert the vector columns straight to
# float64 matrices (one row per sample) instead of casting row by row.
query = np.array(with_numbers['query_vector'].tolist(), dtype=np.float64)
product = np.array(with_numbers['product_vector'].tolist(), dtype=np.float64)
label = np.array(with_numbers['median_relevance'])

# Feature sets: [query | product] concatenation, difference, sum, element-wise product.
train = [np.concatenate((query, product), axis = 1), query - product, query + product, query * product]

for tr in train:
    # Same random_state and sample count -> every feature set is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(tr, label, test_size=0.4, random_state=42)
    model = SVC(kernel = 'linear').fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(MAE(y_test, prediction), MSE(y_test, prediction))
    conf = confusion_matrix(y_test, prediction)
    # Diagonal of the confusion matrix counts correct predictions -> accuracy.
    print(np.sum(np.diag(conf))/len(prediction))

   id                                       query_vector  \
0   1  [-0.067871094, 0.122558594, -0.17480469, 0.086...   

                                      product_vector  median_relevance  \
0  [0.010758463, 0.015542602, 0.008837891, 0.0595...                 1   

   relevance_variance  
0                 0.0  
0.5445374015748031 1.0248523622047243
0.6528051181102362
0.5905511811023622 1.1633858267716535
0.6419783464566929
0.6016240157480315 1.1754429133858268
0.6316437007874016
0.6865157480314961 1.437007874015748
0.6109744094488189


In [32]:
# Linear SVC on the "word2vec_without_numbers_partially.json" dataset.
# Reads the per-row query/product vectors and the median_relevance label,
# builds four feature combinations, and for each one prints MAE and MSE of the
# predicted labels followed by the accuracy on a 40% hold-out.
with open("/content/drive/MyDrive/word2vec_without_numbers_partially.json", 'r') as f:
    data = json.load(f)
withoutp_numbers = pd.DataFrame(data)

# Preview the frame loaded in this cell (previously printed a different
# cell's frame, which also failed on a fresh kernel).
print(withoutp_numbers.head(1))

# np.float_ was removed in NumPy 2.0; convert the vector columns straight to
# float64 matrices (one row per sample) instead of casting row by row.
query = np.array(withoutp_numbers['query_vector'].tolist(), dtype=np.float64)
product = np.array(withoutp_numbers['product_vector'].tolist(), dtype=np.float64)
label = np.array(withoutp_numbers['median_relevance'])

# Feature sets: [query | product] concatenation, difference, sum, element-wise product.
train = [np.concatenate((query, product), axis = 1), query - product, query + product, query * product]

for tr in train:
    # Same random_state and sample count -> every feature set is scored on the same split.
    X_train, X_test, y_train, y_test = train_test_split(tr, label, test_size=0.4, random_state=42)
    model = SVC(kernel = 'linear').fit(X_train, y_train)
    prediction = model.predict(X_test)
    print(MAE(y_test, prediction), MSE(y_test, prediction))
    conf = confusion_matrix(y_test, prediction)
    # Diagonal of the confusion matrix counts correct predictions -> accuracy.
    print(np.sum(np.diag(conf))/len(prediction))

   id                                       query_vector  \
0   1  [-0.067871094, 0.122558594, -0.17480469, 0.086...   

                                      product_vector  median_relevance  \
0  [0.010758463, 0.015542602, 0.008837891, 0.0595...                 1   

   relevance_variance  
0                 0.0  
0.5440452755905512 1.0214074803149606
0.6525590551181102
0.5927657480314961 1.1651082677165354
0.6395177165354331
0.6026082677165354 1.174458661417323
0.6299212598425197
0.6865157480314961 1.437007874015748
0.6109744094488189
