In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
import re
import pymorphy2
from nltk.stem.porter import *

In [14]:
data = pd.read_csv("train.csv")
data = data.drop(columns=["qid1", "qid2", "id"])
data = data.dropna()
data.reset_index(drop=True, inplace=True)

In [15]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def delete_stop_words(text):
    return ' '.join([word.lower() for word in text.split() if word not in stop_words])

data['question1'] = data['question1'].apply(delete_stop_words)
data['question2'] = data['question2'].apply(delete_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timurabdulkadirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def preprocess_text(text): 
    text = re.sub(r"[^\w\s]", '', text.lower())
    return text.split()

In [17]:
def train_test_stem(data2):
    stemmer = PorterStemmer()

    def preprocess_stem(text):
        return ' '.join([stemmer.stem(word) for word in preprocess_text(text)])
    
    data2['question1'] = data2['question1'].apply(preprocess_stem)
    data2['question2'] = data2['question2'].apply(preprocess_stem)
    print(data2.head())
    train, test = train_test_split(data2, test_size=0.2, random_state=0)
    return train, test

In [18]:
train, test = train_test_stem(data)

                                           question1  \
0      what step step guid invest share market india   
1               what stori kohinoor kohinoor diamond   
2       how i increas speed internet connect use vpn   
3                    whi i mental lone how i solv it   
4  which one dissolv water quikli sugar salt meth...   

                                           question2  is_duplicate  
0            what step step guid invest share market             0  
1  what would happen indian govern stole kohinoor...             0  
2                 how internet speed increas hack dn             0  
3               find remaind math2324math divid 2423             0  
4                 which fish would surviv salt water             0  


In [19]:
def bow (df_train, df_test):
    vectorizer = CountVectorizer()
    questions = pd.concat([df_train['question1'], df_train['question2']])
    vectorizer.fit(questions)
    q1_train_bow = vectorizer.transform(df_train['question1'])
    q2_train_bow = vectorizer.transform(df_train['question2'])
    q1_test_bow = vectorizer.transform(df_test['question1'])
    q2_test_bow = vectorizer.transform(df_test['question2'])
    df_train_features = q1_train_bow + q2_train_bow # Вычитание дает результаты порядком хуже
    df_test_features = q1_test_bow + q2_test_bow
    return df_train_features, df_test_features

In [26]:
def cboost(train_features, test_features, it):
    model = CatBoostClassifier(iterations=it, learning_rate=0.1, depth=6, verbose=100)
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [9]:
def ranfor(train_features, test_features, est):
    model = RandomForestClassifier(
        n_estimators=est,       
        criterion='entropy',          
        max_depth=None,           
        min_samples_split=2,       
        min_samples_leaf=1,   
        max_features='auto',      
        bootstrap=True,          
        random_state=42            
    )
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [20]:
def log_reg(train_features, test_features, iter):
    model = LogisticRegression(max_iter=iter)
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [21]:
train_features, test_features = bow(train, test)

In [19]:
log_reg(train_features, test_features, 5000)

accuracy = 0.7509955724850973
precision = 0.5884925975773889
recall = 0.6887453729227376
f1_score = 0.6346844721849259


In [22]:
log_reg(train_features, test_features, 5000)

accuracy = 0.7746543322862302
precision = 0.7251009421265141
recall = 0.6819404449226291
f1_score = 0.7028587270266303


In [20]:
log_reg(train_features, test_features, 10000)

accuracy = 0.7509955724850973
precision = 0.5884925975773889
recall = 0.6887453729227376
f1_score = 0.6346844721849259


In [25]:
log_reg(train_features, test_features, 10000)

accuracy = 0.7746543322862302
precision = 0.7251009421265141
recall = 0.6819404449226291
f1_score = 0.7028587270266303


In [21]:
log_reg(train_features, test_features, 20000)

accuracy = 0.7509955724850973
precision = 0.5884925975773889
recall = 0.6887453729227376
f1_score = 0.6346844721849259


In [22]:
cboost(train_features, test_features, 500)

0:	learn: 0.6801542	total: 304ms	remaining: 2m 31s
100:	learn: 0.5675354	total: 20.8s	remaining: 1m 22s
200:	learn: 0.5394863	total: 40.5s	remaining: 1m
300:	learn: 0.5219411	total: 60s	remaining: 39.6s
400:	learn: 0.5090246	total: 1m 20s	remaining: 19.9s
499:	learn: 0.4985828	total: 1m 42s	remaining: 0us
accuracy = 0.7531227584159885
precision = 0.48596904441453564
recall = 0.7550710999581765
f1_score = 0.5913445791025221


In [23]:
cboost(train_features, test_features, 2000)

0:	learn: 0.6801542	total: 352ms	remaining: 11m 43s
100:	learn: 0.5675354	total: 28.8s	remaining: 9m 2s
200:	learn: 0.5394863	total: 54.4s	remaining: 8m 6s
300:	learn: 0.5219411	total: 1m 23s	remaining: 7m 53s
400:	learn: 0.5090246	total: 2m 7s	remaining: 8m 26s
500:	learn: 0.4984297	total: 2m 30s	remaining: 7m 30s
600:	learn: 0.4893325	total: 2m 51s	remaining: 6m 38s
700:	learn: 0.4818543	total: 3m 35s	remaining: 6m 39s
800:	learn: 0.4752693	total: 4m 5s	remaining: 6m 6s
900:	learn: 0.4694255	total: 4m 32s	remaining: 5m 32s
1000:	learn: 0.4640238	total: 4m 57s	remaining: 4m 56s
1100:	learn: 0.4590826	total: 5m 20s	remaining: 4m 21s
1200:	learn: 0.4544554	total: 5m 44s	remaining: 3m 49s
1300:	learn: 0.4501390	total: 6m 10s	remaining: 3m 19s
1400:	learn: 0.4460380	total: 6m 36s	remaining: 2m 49s
1500:	learn: 0.4421004	total: 7m	remaining: 2m 19s
1600:	learn: 0.4387107	total: 7m 23s	remaining: 1m 50s
1700:	learn: 0.4353398	total: 7m 47s	remaining: 1m 22s
1800:	learn: 0.4321316	total: 8m 

In [24]:
cboost(train_features, test_features, 5000)

0:	learn: 0.6801542	total: 305ms	remaining: 25m 22s
100:	learn: 0.5675354	total: 28s	remaining: 22m 36s
200:	learn: 0.5394863	total: 1m 9s	remaining: 27m 40s
300:	learn: 0.5219411	total: 1m 34s	remaining: 24m 36s
400:	learn: 0.5090246	total: 1m 59s	remaining: 22m 46s
500:	learn: 0.4984297	total: 2m 23s	remaining: 21m 30s
600:	learn: 0.4893325	total: 2m 48s	remaining: 20m 32s
700:	learn: 0.4818543	total: 3m 11s	remaining: 19m 36s
800:	learn: 0.4752693	total: 3m 35s	remaining: 18m 48s
900:	learn: 0.4694255	total: 3m 58s	remaining: 18m 5s
1000:	learn: 0.4640238	total: 4m 21s	remaining: 17m 25s
1100:	learn: 0.4590826	total: 4m 44s	remaining: 16m 48s
1200:	learn: 0.4544554	total: 5m 9s	remaining: 16m 17s
1300:	learn: 0.4501390	total: 5m 34s	remaining: 15m 50s
1400:	learn: 0.4460380	total: 5m 58s	remaining: 15m 20s
1500:	learn: 0.4421004	total: 6m 22s	remaining: 14m 51s
1600:	learn: 0.4387107	total: 6m 46s	remaining: 14m 22s
1700:	learn: 0.4353398	total: 7m 9s	remaining: 13m 52s
1800:	learn:

In [25]:
ranfor(train_features, test_features, 25)

  warn(


accuracy = 0.8176061737861436
precision = 0.7266487213997308
recall = 0.765272856130404
f1_score = 0.7454608215395236


In [11]:
ranfor(train_features, test_features, 25)

  warn(


accuracy = 0.818583195231146
precision = 0.726278600269179
recall = 0.7676304278246026
f1_score = 0.7463821988623595


In [12]:
ranfor(train_features, test_features, 50)

  warn(


accuracy = 0.8239506294986272
precision = 0.7203230148048452
recall = 0.783286378105448
f1_score = 0.7504864068990902


In [27]:
ranfor(train_features, test_features, 200)

  warn(


accuracy = 0.8274258576764204
precision = 0.7268842530282638
recall = 0.7872813411078717
f1_score = 0.7558782365290413


In [12]:
cboost(train_features, test_features, 10000)

0:	learn: 0.6801542	total: 289ms	remaining: 48m 8s
100:	learn: 0.5675354	total: 20.3s	remaining: 33m 11s
200:	learn: 0.5394863	total: 39.3s	remaining: 31m 54s
300:	learn: 0.5219411	total: 57.8s	remaining: 31m 1s
400:	learn: 0.5090246	total: 1m 16s	remaining: 30m 21s
500:	learn: 0.4984297	total: 1m 34s	remaining: 29m 48s
600:	learn: 0.4893325	total: 1m 52s	remaining: 29m 22s
700:	learn: 0.4818543	total: 2m 10s	remaining: 28m 55s
800:	learn: 0.4752693	total: 2m 29s	remaining: 28m 33s
900:	learn: 0.4694255	total: 2m 47s	remaining: 28m 8s
1000:	learn: 0.4640238	total: 3m 5s	remaining: 27m 44s
1100:	learn: 0.4590826	total: 3m 23s	remaining: 27m 22s
1200:	learn: 0.4544554	total: 3m 40s	remaining: 26m 58s
1300:	learn: 0.4501390	total: 3m 58s	remaining: 26m 37s
1400:	learn: 0.4460380	total: 4m 16s	remaining: 26m 16s
1500:	learn: 0.4421004	total: 4m 34s	remaining: 25m 56s
1600:	learn: 0.4387107	total: 4m 52s	remaining: 25m 35s
1700:	learn: 0.4353398	total: 5m 10s	remaining: 25m 16s
1800:	learn: