In [1]:
import json
import numpy as np

In [73]:
# Load the embeddings file. Later cells index the parsed JSON as
# [0], [1] and [2] for the query vectors, product vectors and labels.
# The encoding is pinned so the read does not depend on the platform locale.
with open("embedding.json", "r", encoding="utf-8") as f:
    dataJSON = json.load(f)

In [74]:
# Turn the first three entries of the parsed JSON into NumPy arrays.
query, product, label = map(
    np.array, (dataJSON[0], dataJSON[1], dataJSON[2])
)

In [75]:
# Sanity check: one shape per line for query, product and label.
print(query.shape, product.shape, label.shape, sep="\n")

(10112, 300)
(10112, 300)
(10112,)


## SVM

### SVM NAD KONKATENIRANIM VEKTORIMA UPITA I PROIZVODA

In [6]:
# Feature matrix: each row is the query vector followed by the product vector.
train = np.concatenate([query, product], axis=1)
print(train.shape)

(10112, 600)


In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [8]:
# Hold out 33% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    label,
    test_size=0.33,
    random_state=42,
)
# RBF-kernel SVM with gamma=0.4, fitted on the training split.
model = SVC(kernel="rbf", gamma=0.4)
model.fit(X_train, y_train)

In [9]:
# Predict labels for the held-out test rows with the fitted model.
prediction = model.predict(X_test)

In [10]:
# Confusion matrix of true test labels against predicted labels.
conf = confusion_matrix(y_true=y_test, y_pred=prediction)

In [11]:
# Accuracy: correctly classified samples (matrix diagonal) over all test samples.
print(np.trace(conf) / len(prediction))

0.6239136949355709


### GRID SEARCH PARAMETRA GAMMA

In [14]:
# Sweep the RBF gamma from 0.1 to 0.9 in steps of 0.1 and print the test
# accuracy for each value.
# NOTE(review): every gamma is scored on X_test, so the best value is picked
# on the same split that is used for reporting.
gamma = [step * 0.1 for step in range(1, 10)]
for g in gamma:
    model = SVC(kernel="rbf", gamma=g)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    conf = confusion_matrix(y_true=y_test, y_pred=prediction)
    # Accuracy: matrix diagonal (correct predictions) over all test samples.
    print(np.trace(conf) / len(prediction))

0.6320047947258016
0.6320047947258016
0.6299071021875936
0.6239136949355709
0.6236140245729698
0.6215163320347618
0.6209169913095595
0.6212166616721606
0.622115672759964


### LINEARNA JEZGRA 

In [15]:
# Same train/test split as before, but with a linear kernel.
model = SVC(kernel="linear")
model.fit(X_train, y_train)
prediction = model.predict(X_test)
conf = confusion_matrix(y_true=y_test, y_pred=prediction)
# Accuracy: matrix diagonal (correct predictions) over all test samples.
print(np.trace(conf) / len(prediction))

0.6296074318249925


### RAZLIČITI NAČINI OBJEDINJAVANJA EMBEDDINGA

In [76]:
# Candidate feature matrices; accuracies are printed in this same order.
train = [
    np.concatenate([query, product], axis=1),  # concatenation
    query - product,                           # difference
    query * product,                           # element-wise product
    query + product,                           # sum
]
for tr in train:
    # NOTE: this cell holds out 30% of the rows (test_size=0.3).
    X_train, X_test, y_train, y_test = train_test_split(
        tr,
        label,
        test_size=0.3,
        random_state=42,
    )
    model = SVC(kernel="rbf", gamma=0.2)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    conf = confusion_matrix(y_true=y_test, y_pred=prediction)
    # Accuracy: matrix diagonal (correct predictions) over all test samples.
    print(np.trace(conf) / len(prediction))

0.6324983520105472
0.6275543836519446
0.6249176005273567
0.6285431773236652


### FASTTEXT EMBEDDING

In [10]:
import pandas as pd

In [11]:
# Read df_pp_fasttext.json; the encoding is pinned so the read does not
# depend on the platform locale.
with open("df_pp_fasttext.json", "r", encoding="utf-8") as f:
    data = json.load(f)
# Build the frame once the file is closed; the handle is no longer needed.
df_fasttext = pd.DataFrame(data)

In [12]:
# Show the first row to inspect the columns of the loaded frame.
print(df_fasttext.head(1))

   id                                       query_vector  \
0   1  [0.012943572, 0.015888123, -0.025562584, 0.090...   

                                      product_vector  median_relevance  \
0  [-0.0042361375, 0.008490032, 0.07483778, 0.015...                 1   

   relevance_variance  
0                 0.0  


In [13]:
# Convert the per-row vector lists into float64 arrays.
# np.float_ was removed in NumPy 2.0, so the dtype is requested explicitly
# instead of casting each row through that alias.
query = np.array(df_fasttext['query_vector'].tolist(), dtype=np.float64)
product = np.array(df_fasttext['product_vector'].tolist(), dtype=np.float64)
# Labels come from the median_relevance column.
label = np.array(df_fasttext['median_relevance'])

In [14]:
# Sanity check: one shape per line for query, product and label.
print(query.shape, product.shape, label.shape, sep="\n")

(10158, 300)
(10158, 300)
(10158,)


In [15]:
# Candidate feature matrices built from the query and product vectors.
train = [
    np.concatenate([query, product], axis=1),  # concatenation
    query - product,                           # difference
    query + product,                           # sum
    query * product,                           # element-wise product
]

In [16]:
# Fit and score one RBF SVM (gamma=0.2) per feature matrix in `train`.
for tr in train:
    # 33% of the rows are held out; the seed is fixed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(
        tr,
        label,
        test_size=0.33,
        random_state=42,
    )
    model = SVC(kernel="rbf", gamma=0.2)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    conf = confusion_matrix(y_true=y_test, y_pred=prediction)
    # Accuracy: matrix diagonal (correct predictions) over all test samples.
    print(np.trace(conf) / len(prediction))

0.6280942439606323
0.6194452728899493
0.6167611094542201
0.6069191768565464


### NUMBERS TREATED AS STOP WORDS

In [2]:
# Load embedding_without_numbers.json; the encoding is pinned so the read
# does not depend on the platform locale.
with open("embedding_without_numbers.json", "r", encoding="utf-8") as f:
    data2 = json.load(f)

In [4]:
# Turn the first three entries of the parsed JSON into NumPy arrays.
query, product, label = map(
    np.array, (data2[0], data2[1], data2[2])
)

In [5]:
# Sanity check: one shape per line for query, product and label.
print(query.shape, product.shape, label.shape, sep="\n")

(10141, 300)
(10141, 300)
(10141,)


In [70]:
# Candidate feature matrices built from the query and product vectors.
train = [
    np.concatenate([query, product], axis=1),  # concatenation
    query - product,                           # difference
    query + product,                           # sum
    query * product,                           # element-wise product
]

In [71]:
# Fit and score one RBF SVM (gamma=0.2) per feature matrix in `train`.
# NOTE(review): the recorded output shows the same accuracy for all four
# matrices. Check whether the model predicts a single class, or whether the
# cell was run against stale kernel state.
for tr in train:
    # 33% of the rows are held out; the seed is fixed for repeatability.
    X_train, X_test, y_train, y_test = train_test_split(
        tr,
        label,
        test_size=0.33,
        random_state=42,
    )
    model = SVC(kernel="rbf", gamma=0.2)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    conf = confusion_matrix(y_true=y_test, y_pred=prediction)
    # Accuracy: matrix diagonal (correct predictions) over all test samples.
    print(np.trace(conf) / len(prediction))

0.6248127060233742
0.6248127060233742
0.6248127060233742
0.6248127060233742


In [9]:
# Concatenated query and product features only.
train = np.concatenate([query, product], axis=1)
# NOTE: this cell holds out 30% of the rows (test_size=0.3).
X_train, X_test, y_train, y_test = train_test_split(
    train,
    label,
    test_size=0.3,
    random_state=42,
)
model = SVC(kernel="rbf", gamma=0.2)
model.fit(X_train, y_train)
prediction = model.predict(X_test)
conf = confusion_matrix(y_true=y_test, y_pred=prediction)
# Accuracy: matrix diagonal (correct predictions) over all test samples.
print(np.trace(conf) / len(prediction))

0.6398291160039434
