In [187]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.metrics import classification_report, log_loss, hinge_loss
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
import nltk
from tqdm import tqdm
import pandas as pd
import numpy as np
import joblib

In [160]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report

In [161]:
model= Doc2Vec.load("task-1/data/doc2vec/d2v.model")

### Get doc2vec vectors

In [162]:
dv_=model.dv

In [163]:
print(dv_[19999])

[ 1.0527064  -0.88299656  1.8908286  -0.25031358 -0.32747704  1.0352894
 -0.62976235 -0.5546069   0.8104291  -0.34984457 -0.2712656   0.32901013
 -0.66599745 -0.3243207  -0.12314082 -0.09490109 -1.3840748  -0.3651697
  0.19609755 -0.01749463  0.44469282  0.38914767  0.31424308  0.3778051
  0.10877842 -0.20135105  0.82883406 -0.1873991   0.2517018  -0.16888648
 -1.4841188  -1.0153332   0.59753853 -0.68308294 -0.15685658  0.0935528
 -0.6633116   0.17084147 -1.4388142   0.10703216  0.7573486   1.8683228
 -0.7577702  -0.15317133 -0.35258406 -0.41043687 -0.46426144  0.28915682
 -0.97379273  0.8134736   0.07009614  0.83730304  0.27350315  1.6401083
  0.09703351 -0.18142878 -0.02556931  0.5307504   0.9032727   1.2531323
 -0.9093362   0.36124003  0.10845557 -0.35556373  0.8334697  -0.5982448
 -0.04784167 -1.0015296  -1.0224038  -0.8572968   0.3455853  -0.25963253
 -0.5222779  -1.4435843   0.8370194   0.46381196  1.5352349   0.7431258
  0.08776543 -0.18143864 -0.5344791   0.12011831  1.006585  

In [164]:
y_pred=[]

In [165]:
df=pd.read_csv("task-1/data/cleaned_data.csv")
y_true=df['is_duplicate'][:40000]

In [167]:
question1_vectors=[]
question2_vectors=[]

In [168]:
for i in tqdm(range(40000)):
    question1_vectors.append(dv_[i])
    question2_vectors.append(dv_[i+40000])

100%|██████████| 40000/40000 [00:00<00:00, 281433.17it/s]


### Get cosine similarity using loaded vectors

In [169]:
for i in tqdm(range(40000)):
    cosine_sim=float(cosine_similarity([dv_[i]],[dv_[i+40000]]))
    if cosine_sim >= 0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)

100%|██████████| 40000/40000 [00:08<00:00, 4535.62it/s]


In [170]:
print(classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.74      0.67      0.71     25108
           1       0.52      0.60      0.56     14892

    accuracy                           0.65     40000
   macro avg       0.63      0.64      0.63     40000
weighted avg       0.66      0.65      0.65     40000



In [171]:
x=np.hstack((question1_vectors[:],question2_vectors[:]))

In [172]:
x_train,x_test,y_train,y_test = train_test_split(x,y_true,test_size=0.2,random_state=40)

In [173]:
parameters_rf = { 
    'n_estimators': [150,200,250,300],
    'max_depth' : [6,7,8],
    'criterion' :['gini', 'entropy']
}

### Train random forest classifier on doc2vec vectors

In [174]:
CV_rf = GridSearchCV(estimator=RandomForestClassifier(random_state=6), param_grid=parameters_rf, n_jobs=-1, cv= 4)
CV_rf.fit(x_train, y_train)

In [176]:
y_pred=CV_rf.predict(x_test)
score = classification_report(y_test, y_pred)

In [177]:
print(score)

              precision    recall  f1-score   support

           0       0.65      1.00      0.79      5168
           1       1.00      0.00      0.00      2832

    accuracy                           0.65      8000
   macro avg       0.82      0.50      0.39      8000
weighted avg       0.77      0.65      0.51      8000



In [178]:
parameters_svm = {'C': [10, 100, 1000],  
              'gamma': [0.01, 0.001, 0.0001]
              }

### Train SVM on doc2vec vectors

In [179]:
CV_svm = GridSearchCV(estimator=SVC(), n_jobs=-1,param_grid=parameters_svm)
CV_svm.fit(x_train, y_train)



In [182]:
y_pred_svm=CV_svm.predict(x_test)
score_svm = classification_report(y_test, y_pred_svm)
print(score_svm)

              precision    recall  f1-score   support

           0       0.76      0.85      0.80      5168
           1       0.65      0.51      0.57      2832

    accuracy                           0.73      8000
   macro avg       0.70      0.68      0.68      8000
weighted avg       0.72      0.73      0.72      8000



In [188]:
hinge_loss(y_test, y_pred_svm)

0.91875

In [183]:
filename = 'task-1/saved_models/doc2vec/rf_model.sav'
joblib.dump(CV_rf, filename)

['task-1/saved_models/doc2vec/rf_model.sav']

In [184]:
filename = 'task-1/saved_models/doc2vec/svm_model.sav'
joblib.dump(CV_svm, filename)

['task-1/saved_models/doc2vec/svm_model.sav']