In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, confusion_matrix
import nltk
import joblib
from scipy.sparse import hstack, vstack
from os.path import join

In [3]:
# Tokenizer and stemmer used by the text-normalisation cells further down.
from nltk.stem import PorterStemmer
from nltk.tokenize import casual_tokenize

# Fetch the 'punkt' resource into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Piotr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [4]:
# Load the question-pair training set from the project's data folder.
train_csv_path = '../data/originals/train.csv'
df = pd.read_csv(train_csv_path)

In [5]:
# Preview the freshly loaded frame (pandas truncates the display to the
# first and last rows).
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,How many keywords are there in the Racket prog...,How many keywords are there in PERL Programmin...,0
404286,404286,18840,155606,Do you believe there is life after death?,Is it true that there is life after death?,1
404287,404287,537928,537929,What is one coin?,What's this coin?,0
404288,404288,537930,537931,What is the approx annual cost of living while...,I am having little hairfall problem but I want...,0


In [6]:
# Normalise both question columns: missing values become empty strings,
# then the text is lower-cased.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].fillna('').str.lower()

In [7]:
stemmer = PorterStemmer()


def _stem_question(text):
    """Tokenize ``text`` with ``casual_tokenize`` and return its Porter stems joined by single spaces."""
    return ' '.join(stemmer.stem(token) for token in casual_tokenize(text))


# Replace each question with its stemmed, space-joined token sequence.
for question_col in ('question1', 'question2'):
    df[question_col] = df[question_col].apply(_stem_question)

In [8]:
# Inspect the frame after lower-casing and stemming.
df

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,what is the step by step guid to invest in sha...,what is the step by step guid to invest in sha...,0
1,1,3,4,what is the stori of kohinoor ( koh-i-noor ) d...,what would happen if the indian govern stole t...,0
2,2,5,6,how can i increas the speed of my internet con...,how can internet speed be increas by hack thro...,0
3,3,7,8,whi am i mental veri lone ? how can i solv it ?,find the remaind when [ math ] 23 ^ { 24 } [ /...,0
4,4,9,10,"which one dissolv in water quikli sugar , salt...",which fish would surviv in salt water ?,0
...,...,...,...,...,...,...
404285,404285,433578,379845,how mani keyword are there in the racket progr...,how mani keyword are there in perl program lan...,0
404286,404286,18840,155606,do you believ there is life after death ?,is it true that there is life after death ?,1
404287,404287,537928,537929,what is one coin ?,what' thi coin ?,0
404288,404288,537930,537931,what is the approx annual cost of live while s...,i am have littl hairfal problem but i want to ...,0


In [9]:
# Build one document per row by joining the two questions with a space,
# then turn the documents into a bag-of-words count matrix.
# NOTE(review): the vocabulary is fitted on every row, before the
# train/test split in the next cell — confirm this is intended.
paired_questions = df['question1'] + ' ' + df['question2']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(paired_questions)

In [11]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
labels = df['is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Classifier for the bag-of-words experiment.
#
# A kernel SVC is impractical at this size: scikit-learn documents its fit
# time as scaling at least quadratically with the number of samples, and the
# training split here is 80% of roughly 404k rows. LinearSVC exposes the same
# fit/predict API and is the documented alternative for large sample counts.
# The seed matches the one used for the train/test split.
#
# Other candidates imported by this notebook that can be swapped in here:
# XGBClassifier(), MultinomialNB(), LGBMClassifier().
clf = LinearSVC(random_state=42)

In [13]:
# Train the selected classifier on the training split.
# (Alternative kept for reference: cast both inputs with
#  .astype(np.float32) before fitting.)
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out split and report plain accuracy.
# (Alternative kept for reference: cast X_test, y_test and y_pred with
#  .astype(np.float32).)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Confusion matrix for the held-out split.
# ConfusionMatrixDisplay and confusion_matrix already come from the notebook's
# import cell, and y_pred is reused from the accuracy cell instead of running
# clf.predict(X_test) a second time.
cm = confusion_matrix(y_test, y_pred)

cm_display = ConfusionMatrixDisplay(cm).plot()

In [58]:
# Load the stored TF-IDF frame (columns 'tfidf' and 'is_duplicate' are read
# from it in the next cell).
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
tfidf_frame_path = '../data/TfidfDataframe.pkl'
df2 = joblib.load(tfidf_frame_path)

In [79]:
# Pull labels and features out of the TF-IDF frame as plain Python lists.
# NOTE: this rebinds X, X_train, X_test, y_train and y_test, replacing the
# values produced by the bag-of-words experiment earlier in the notebook.
y = df2['is_duplicate'].tolist()
X = df2['tfidf'].tolist()

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [84]:
# Fit a multinomial naive Bayes model on the TF-IDF training rows.
# The list of per-row entries is stacked into one sparse matrix first
# (assumes each entry is a sparse row vector — TODO confirm).
model = MultinomialNB()
X_train_stacked = vstack(X_train)
model.fit(X_train_stacked, y_train)