In [134]:
!pip install simhash



In [135]:
from simhash import Simhash, SimhashIndex

In [136]:
def simhash_similarity(q1, q2):
    
    hash1 = Simhash(q1)
    hash2 = Simhash(q2)
    
    distance = hash1.distance(hash2)
    
    hashbits = 64  
    
    prob = (hashbits - distance) / hashbits
    
    return prob

In [137]:
import pandas as pd
import numpy as np

In [138]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [139]:
data = data.drop(columns=["qid1", "qid2", "id"])
data.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [140]:
nan_counts = data.isna().sum()
print(nan_counts)

question1       1
question2       2
is_duplicate    0
dtype: int64


In [141]:
data = data.dropna()
data.reset_index(drop=True, inplace=True)

In [142]:
print(len(data))

404287


In [143]:
threshold = [0.6, 0.7, 0.8]
for i in threshold:
    cnt_p = 0
    cnt_n = 0
    cnt1 = 0
    cnt2 = 0
    for j in range(len(data) // 10):
        q1 = data["question1"][j]
        q2 = data["question2"][j]
        is_dup = data["is_duplicate"][j]
        sh = simhash_similarity(q1, q2)
        if (sh >= i and is_dup == 1):
            cnt_p += 1
        elif (sh < i and is_dup == 0):
            cnt_n += 1
        elif (sh >= i and is_dup == 0): 
            cnt1 += 1
        elif (sh < i and is_dup == 1):
            cnt2 += 1
            
    accuracy = (cnt_p + cnt_n) / (len(data) // 10)
    precision = (cnt_p) / (cnt_p + cnt1)
    recall = (cnt_p) / (cnt_p + cnt2)
    f1 = (2 * precision * recall) / (precision + recall)
    print(f"threshold = {i}:")
    print(f"    accuracy: {accuracy}")
    print(f"    precision: {precision}")
    print(f"    recall: {recall}")
    print(f"    f1: {f1}")
    print('\n')
    

threshold = 0.6:
    accuracy: 0.6035173642030276
    precision: 0.4798769230769231
    recall: 0.7773125996810207
    f1: 0.5934099383608553

threshold = 0.7:
    accuracy: 0.6449243098842387
    precision: 0.5281501340482574
    recall: 0.43201754385964913
    f1: 0.47527141133896267

threshold = 0.8:
    accuracy: 0.627238547541308
    precision: 0.49712643678160917
    recall: 0.12646198830409358
    f1: 0.20163170163170163



# Пробуем алгоритм на почищенных данных

In [144]:
import re

In [145]:
def preprocess_text(text): 
    text = re.sub(r"[^\w\s]", '', text.lower())
    return text.split()

In [146]:
! pip install nltk



In [147]:
from nltk.stem.porter import *
from nltk.corpus import stopwords
import nltk

In [148]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def delete_stop_words(text):
    return ' '.join([word.lower() for word in text.split() if word not in stop_words])

data['question1'] = data['question1'].apply(delete_stop_words)
data['question2'] = data['question2'].apply(delete_stop_words)
data_const = data.copy()
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timurabdulkadirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,question1,question2,is_duplicate
0,what step step guide invest share market india?,what step step guide invest share market?,0
1,what story kohinoor (koh-i-noor) diamond?,what would happen indian government stole kohi...,0
2,how i increase speed internet connection using...,how internet speed increased hacking dns?,0
3,why i mentally lonely? how i solve it?,find remainder [math]23^{24}[/math] divided 24...,0
4,"which one dissolve water quikly sugar, salt, m...",which fish would survive salt water?,0


In [149]:
stemmer = PorterStemmer()

def preprocess_sentence(text):
    return ' '.join([stemmer.stem(word) for word in preprocess_text(text)])

In [150]:
data['question1'] = data['question1'].apply(preprocess_sentence)
data['question2'] = data['question2'].apply(preprocess_sentence)
data.head()

Unnamed: 0,question1,question2,is_duplicate
0,what step step guid invest share market india,what step step guid invest share market,0
1,what stori kohinoor kohinoor diamond,what would happen indian govern stole kohinoor...,0
2,how i increas speed internet connect use vpn,how internet speed increas hack dn,0
3,whi i mental lone how i solv it,find remaind math2324math divid 2423,0
4,which one dissolv water quikli sugar salt meth...,which fish would surviv salt water,0


In [151]:
threshold = [0.6, 0.7, 0.8]
for i in threshold:
    cnt_p = 0
    cnt_n = 0
    cnt1 = 0
    cnt2 = 0
    for j in range(len(data) // 10):
        q1 = data["question1"][j]
        q2 = data["question2"][j]
        is_dup = data["is_duplicate"][j]
        sh = simhash_similarity(q1, q2)
        if (sh >= i and is_dup == 1):
            cnt_p += 1
        elif (sh < i and is_dup == 0):
            cnt_n += 1
        elif (sh >= i and is_dup == 0): 
            cnt1 += 1
        elif (sh < i and is_dup == 1):
            cnt2 += 1
            
    accuracy = (cnt_p + cnt_n) / (len(data) // 10)
    precision = (cnt_p) / (cnt_p + cnt1)
    recall = (cnt_p) / (cnt_p + cnt2)
    f1 = (2 * precision * recall) / (precision + recall)
    print(f"threshold = {i}:")
    print(f"    accuracy: {accuracy}")
    print(f"    precision: {precision}")
    print(f"    recall: {recall}")
    print(f"    f1: {f1}")
    print('\n')

threshold = 0.6:
    accuracy: 0.5952557633323439
    precision: 0.47457561579211943
    recall: 0.8155901116427432
    f1: 0.6000146666992594

threshold = 0.7:
    accuracy: 0.6468536657761947
    precision: 0.5271230563568564
    recall: 0.4978734715576821
    f1: 0.5120809268309353

threshold = 0.8:
    accuracy: 0.6475957257346393
    precision: 0.5782073813708261
    recall: 0.19677033492822968
    f1: 0.2936189201249442


In [152]:
data = data_const
data.head()

Unnamed: 0,question1,question2,is_duplicate
0,what step step guide invest share market india?,what step step guide invest share market?,0
1,what story kohinoor (koh-i-noor) diamond?,what would happen indian government stole kohi...,0
2,how i increase speed internet connection using...,how internet speed increased hacking dns?,0
3,why i mentally lonely? how i solve it?,find remainder [math]23^{24}[/math] divided 24...,0
4,"which one dissolve water quikly sugar, salt, m...",which fish would survive salt water?,0


In [153]:
import pymorphy2

morph = pymorphy2.MorphAnalyzer()

def preprocess_lem(text):
    return ' '.join([morph.parse(word)[0].normal_form for word in preprocess_text(text)])
    
data['question1'] = data['question1'].apply(preprocess_lem)
data['question2'] = data['question2'].apply(preprocess_lem)

In [154]:
threshold = [0.6, 0.7, 0.8]
for i in threshold:
    cnt_p = 0
    cnt_n = 0
    cnt1 = 0
    cnt2 = 0
    for j in range(len(data) // 10):
        q1 = data["question1"][j]
        q2 = data["question2"][j]
        is_dup = data["is_duplicate"][j]
        sh = simhash_similarity(q1, q2)
        if (sh >= i and is_dup == 1):
            cnt_p += 1
        elif (sh < i and is_dup == 0):
            cnt_n += 1
        elif (sh >= i and is_dup == 0): 
            cnt1 += 1
        elif (sh < i and is_dup == 1):
            cnt2 += 1
            
    accuracy = (cnt_p + cnt_n) / (len(data) // 10)
    precision = (cnt_p) / (cnt_p + cnt1)
    recall = (cnt_p) / (cnt_p + cnt2)
    f1 = (2 * precision * recall) / (precision + recall)
    print(f"threshold = {i}:")
    print(f"    accuracy: {accuracy}")
    print(f"    precision: {precision}")
    print(f"    recall: {recall}")
    print(f"    f1: {f1}")
    print('\n')

threshold = 0.6:
    accuracy: 0.5981745325022262
    precision: 0.47689099965249626
    recall: 0.8207735247208932
    f1: 0.6032676386548466

threshold = 0.7:
    accuracy: 0.6508607895517958
    precision: 0.5331015397715178
    recall: 0.4992690058479532
    f1: 0.5156308980474246

threshold = 0.8:
    accuracy: 0.6478430790541209
    precision: 0.5846378626591526
    recall: 0.18613769271664007
    f1: 0.28237310348303846
