In [2]:
import pandas as pd
import numpy as np

In [65]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [66]:
data = data.drop(columns=["qid1", "qid2", "id"])
data.head()

Unnamed: 0,question1,question2,is_duplicate
0,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [67]:
nan_counts = data.isna().sum()
print(nan_counts)

question1       1
question2       2
is_duplicate    0
dtype: int64


In [68]:
data = data.dropna()
data.reset_index(drop=True, inplace=True)

In [69]:
! pip install nltk



In [4]:
import nltk
from nltk.corpus import stopwords

In [71]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def delete_stop_words(text):
    return ' '.join([word.lower() for word in text.split() if word not in stop_words])

data['question1'] = data['question1'].apply(delete_stop_words)
data['question2'] = data['question2'].apply(delete_stop_words)
data.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timurabdulkadirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,question1,question2,is_duplicate
0,what step step guide invest share market india?,what step step guide invest share market?,0
1,what story kohinoor (koh-i-noor) diamond?,what would happen indian government stole kohi...,0
2,how i increase speed internet connection using...,how internet speed increased hacking dns?,0
3,why i mentally lonely? how i solve it?,find remainder [math]23^{24}[/math] divided 24...,0
4,"which one dissolve water quikly sugar, salt, m...",which fish would survive salt water?,0


In [72]:
data_const = data.copy()

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [74]:
import re

def preprocess_text(text): 
    text = re.sub(r"[^\w\s]", '', text.lower())
    return text.split()

In [4]:
from nltk.stem.porter import *

In [76]:
def train_test_stem(data2):
    stemmer = PorterStemmer()

    def preprocess_stem(text):
        return ' '.join([stemmer.stem(word) for word in preprocess_text(text)])
    
    data2['question1'] = data2['question1'].apply(preprocess_stem)
    data2['question2'] = data2['question2'].apply(preprocess_stem)
    print(data2.head())
    train, test = train_test_split(data2, test_size=0.2, random_state=0)
    return train, test

In [7]:
import pymorphy2

In [78]:
def train_test_lem(data2):
    morph = pymorphy2.MorphAnalyzer()
    
    def preprocess_lem(text):
        return ' '.join([morph.parse(word)[0].normal_form for word in preprocess_text(text)])
    
    data2['question1'] = data2['question1'].apply(preprocess_lem)
    data2['question2'] = data2['question2'].apply(preprocess_lem)
    print(data2.head())
    train, test = train_test_split(data2, test_size=0.2, random_state=0)
    return train, test

In [79]:
def tfidf (df_train, df_test):
    vectorizer = TfidfVectorizer()
    questions = pd.concat([df_train['question1'], df_train['question2']])
    vectorizer.fit(questions)
    q1_train_tfidf = vectorizer.transform(df_train['question1'])
    q2_train_tfidf = vectorizer.transform(df_train['question2'])
    q1_test_tfidf = vectorizer.transform(df_test['question1'])
    q2_test_tfidf = vectorizer.transform(df_test['question2'])
    df_train_features = q1_train_tfidf + q2_train_tfidf # Вычитание дает ужасные результаты
    df_test_features = q1_test_tfidf + q2_test_tfidf
    return df_train_features, df_test_features

In [80]:
def bow (df_train, df_test):
    vectorizer = CountVectorizer()
    questions = pd.concat([df_train['question1'], df_train['question2']])
    vectorizer.fit(questions)
    q1_train_bow = vectorizer.transform(df_train['question1'])
    q2_train_bow = vectorizer.transform(df_train['question2'])
    q1_test_bow = vectorizer.transform(df_test['question1'])
    q2_test_bow = vectorizer.transform(df_test['question2'])
    df_train_features = q1_train_bow + q2_train_bow # Вычитание дает ужасные результаты
    df_test_features = q1_test_bow + q2_test_bow
    return df_train_features, df_test_features

In [81]:
def log_reg(train_features, test_features):
    model = LogisticRegression(max_iter=10000)
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [82]:
def des_tree(train_features, test_features):
    model = DecisionTreeClassifier()
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [83]:
def svm(train_features, test_features):
    model = SVC(kernel='linear', max_iter=1000, random_state=42)
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [84]:
def cboost(train_features, test_features):
    model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, verbose=100)
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [85]:
def ranfor(train_features, test_features):
    model = RandomForestClassifier(
        n_estimators=100,       
        criterion='gini',          
        max_depth=None,           
        min_samples_split=2,       
        min_samples_leaf=1,   
        max_features='auto',      
        bootstrap=True,          
        random_state=42            
    )
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")

In [86]:
train, test = train_test_stem(data)

                                           question1  \
0      what step step guid invest share market india   
1               what stori kohinoor kohinoor diamond   
2       how i increas speed internet connect use vpn   
3                    whi i mental lone how i solv it   
4  which one dissolv water quikli sugar salt meth...   

                                           question2  is_duplicate  
0            what step step guid invest share market             0  
1  what would happen indian govern stole kohinoor...             0  
2                 how internet speed increas hack dn             0  
3               find remaind math2324math divid 2423             0  
4                 which fish would surviv salt water             0  


In [87]:
train_features, test_features = tfidf(train, test)
log_reg(train_features, test_features)

accuracy = 0.751984961290163
precision = 0.564872139973082
recall = 0.7021329987452949
f1_score = 0.626067499533843


In [88]:
train_features, test_features = bow(train, test)
log_reg(train_features, test_features)

accuracy = 0.7509955724850973
precision = 0.5884925975773889
recall = 0.6887453729227376
f1_score = 0.6346844721849259


In [89]:
train_features, test_features = tfidf(train, test)
svm(train_features, test_features)



accuracy = 0.5153231591184546
precision = 0.5520188425302827
recall = 0.3880138120240291
f1_score = 0.45570956362323267


In [90]:
train_features, test_features = bow(train, test)
svm(train_features, test_features)



accuracy = 0.4808676939820426
precision = 0.8020524899057874
recall = 0.3977473719339229
f1_score = 0.5317791411042945


In [91]:
train_features, test_features = bow(train, test)
cboost(train_features, test_features)

0:	learn: 0.6801542	total: 362ms	remaining: 6m 1s
100:	learn: 0.5675354	total: 27.9s	remaining: 4m 8s
200:	learn: 0.5394863	total: 56s	remaining: 3m 42s
300:	learn: 0.5219411	total: 1m 23s	remaining: 3m 13s
400:	learn: 0.5090246	total: 1m 48s	remaining: 2m 42s
500:	learn: 0.4984297	total: 2m 11s	remaining: 2m 11s
600:	learn: 0.4893325	total: 2m 35s	remaining: 1m 42s
700:	learn: 0.4818543	total: 2m 58s	remaining: 1m 16s
800:	learn: 0.4752693	total: 3m 22s	remaining: 50.3s
900:	learn: 0.4694255	total: 3m 46s	remaining: 24.9s
999:	learn: 0.4640838	total: 4m 10s	remaining: 0us
accuracy = 0.7715748596304632
precision = 0.5505383580080754
recall = 0.7619446772841576
f1_score = 0.6392155330702817


In [92]:
train_features, test_features = tfidf(train, test)
cboost(train_features, test_features)

0:	learn: 0.6782021	total: 1.08s	remaining: 18m 3s
100:	learn: 0.5507573	total: 58.7s	remaining: 8m 42s
200:	learn: 0.5252775	total: 1m 54s	remaining: 7m 35s
300:	learn: 0.5094706	total: 2m 45s	remaining: 6m 24s
400:	learn: 0.4979294	total: 3m 36s	remaining: 5m 23s
500:	learn: 0.4888572	total: 4m 28s	remaining: 4m 27s
600:	learn: 0.4812811	total: 5m 19s	remaining: 3m 32s
700:	learn: 0.4747914	total: 6m 11s	remaining: 2m 38s
800:	learn: 0.4690510	total: 6m 59s	remaining: 1m 44s
900:	learn: 0.4639414	total: 7m 49s	remaining: 51.6s
999:	learn: 0.4590031	total: 8m 41s	remaining: 0us
accuracy = 0.7718222068317298
precision = 0.5868775235531628
recall = 0.7386296264927585
f1_score = 0.6540668241646979


In [93]:
train_features, test_features = tfidf(train, test)
des_tree(train_features, test_features)

accuracy = 0.7502535308812981
precision = 0.6533647375504711
recall = 0.662504264756056
f1_score = 0.6579027613078096


In [94]:
train_features, test_features = bow(train, test)
des_tree(train_features, test_features)

accuracy = 0.7748769447673699
precision = 0.6887617765814267
recall = 0.6957142371614043
f1_score = 0.6922205501919078


In [95]:
train_features, test_features = bow(train, test)
ranfor(train_features, test_features)

  warn(


accuracy = 0.8261520195898984
precision = 0.726278600269179
recall = 0.7847093467117461
f1_score = 0.7543641987173886


In [96]:
train_features, test_features = tfidf(train, test)
ranfor(train_features, test_features)

  warn(


accuracy = 0.8245813648618566
precision = 0.6970053835800808
recall = 0.7999922762029814
f1_score = 0.7449563059661235


In [97]:
data = data_const
train, test = train_test_lem(data)

                                           question1  \
0     what step step guide invest share market india   
1               what story kohinoor kohinoor diamond   
2  how i increase speed internet connection using...   
3               why i mentally lonely how i solve it   
4  which one dissolve water quikly sugar salt met...   

                                           question2  is_duplicate  
0           what step step guide invest share market             0  
1  what would happen indian government stole kohi...             0  
2           how internet speed increased hacking dns             0  
3           find remainder math2324math divided 2423             0  
4                which fish would survive salt water             0  


In [98]:
train_features, test_features = tfidf(train, test)
svm(train_features, test_features)



accuracy = 0.5402310222859829
precision = 0.548485868102288
recall = 0.40693494433072047
f1_score = 0.4672246266731635


In [99]:
train_features, test_features = bow(train, test)
svm(train_features, test_features)



accuracy = 0.47923520245368423
precision = 0.8210296096904441
recall = 0.3987743095277006
f1_score = 0.5368166318336817


In [100]:
train_features, test_features = bow(train, test)
cboost(train_features, test_features)

0:	learn: 0.6804729	total: 379ms	remaining: 6m 18s
100:	learn: 0.5722553	total: 25.8s	remaining: 3m 50s
200:	learn: 0.5453968	total: 51.7s	remaining: 3m 25s
300:	learn: 0.5277511	total: 1m 15s	remaining: 2m 55s
400:	learn: 0.5151371	total: 1m 38s	remaining: 2m 26s
500:	learn: 0.5047034	total: 2m 1s	remaining: 2m
600:	learn: 0.4961164	total: 2m 22s	remaining: 1m 34s
700:	learn: 0.4886334	total: 2m 45s	remaining: 1m 10s
800:	learn: 0.4818330	total: 3m 7s	remaining: 46.6s
900:	learn: 0.4759184	total: 3m 29s	remaining: 23s
999:	learn: 0.4706546	total: 3m 51s	remaining: 0us
accuracy = 0.7678028148111504
precision = 0.5334118438761777
recall = 0.7635952025432301
f1_score = 0.6280778906915474


In [101]:
train_features, test_features = tfidf(train, test)
cboost(train_features, test_features)

0:	learn: 0.6786020	total: 1.62s	remaining: 27m 2s
100:	learn: 0.5530912	total: 1m 5s	remaining: 9m 47s
200:	learn: 0.5282327	total: 2m 7s	remaining: 8m 27s
300:	learn: 0.5128987	total: 3m 5s	remaining: 7m 11s
400:	learn: 0.5015113	total: 4m 5s	remaining: 6m 6s
500:	learn: 0.4923144	total: 5m 2s	remaining: 5m 1s
600:	learn: 0.4845876	total: 6m	remaining: 3m 59s
700:	learn: 0.4781411	total: 6m 59s	remaining: 2m 58s
800:	learn: 0.4725698	total: 7m 56s	remaining: 1m 58s
900:	learn: 0.4672053	total: 8m 54s	remaining: 58.8s
999:	learn: 0.4625261	total: 9m 53s	remaining: 0us
accuracy = 0.7671720794479211
precision = 0.5734522207267833
recall = 0.7348654708520179
f1_score = 0.6442016933776837


In [102]:
train_features, test_features = tfidf(train, test)
log_reg(train_features, test_features)

accuracy = 0.7571297830765045
precision = 0.5806191117092867
recall = 0.7063446582071224
f1_score = 0.6373407202216067


In [103]:
train_features, test_features = bow(train, test)
log_reg(train_features, test_features)

accuracy = 0.7564990477132751
precision = 0.6063257065948856
recall = 0.692837094851782
f1_score = 0.6467009994796246


In [104]:
train_features, test_features = tfidf(train, test)
des_tree(train_features, test_features)

accuracy = 0.7469885478245814
precision = 0.6462651413189772
recall = 0.6588570252469813
f1_score = 0.6525003397200707


In [105]:
train_features, test_features = bow(train, test)
des_tree(train_features, test_features)

accuracy = 0.7736031066808479
precision = 0.6830417227456258
recall = 0.6955389570341944
f1_score = 0.6892336943605065


In [106]:
train_features, test_features = tfidf(train, test)
ranfor(train_features, test_features)

  warn(


accuracy = 0.8216750352469762
precision = 0.6889973082099596
recall = 0.7982302264842318
f1_score = 0.7396023332671158


In [107]:
train_features, test_features = bow(train, test)
ranfor(train_features, test_features)

  warn(


accuracy = 0.8234683024561578
precision = 0.721399730820996
recall = 0.7815119924181673
f1_score = 0.7502537005283969


In [108]:
! pip install gensim



In [109]:
df = pd.read_csv("train.csv")
df = df.drop(columns=["qid1", "qid2", "id"])
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [8]:
from gensim.models import Word2Vec

In [1]:
def preprocess_text_wtv(text):
    text = text.lower()  # Приведение к нижнему регистру
    text = re.sub(r"[^\w\s]", '', text)  # Удаление знаков препинания
    text = text.strip()  # Удаление лишних пробелов
    return text

In [112]:
df['question1'] = df['question1'].apply(preprocess_text_wtv)
df['question2'] = df['question2'].apply(preprocess_text_wtv)

In [113]:
df['question1'] = df['question1'].apply(lambda x: x.split())
df['question2'] = df['question2'].apply(lambda x: x.split())

sentences = df['question1'].tolist() + df['question2'].tolist()

In [114]:
word2vec_model = Word2Vec(sentences)

In [115]:
def get_question_vector(question, model):
    words = [word for word in question if word in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

In [116]:
df['q1_vector'] = df['question1'].apply(lambda x: get_question_vector(x, word2vec_model))
df['q2_vector'] = df['question2'].apply(lambda x: get_question_vector(x, word2vec_model))


In [117]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [118]:
X_train = np.array([np.concatenate((v1, v2)) for v1, v2 in zip(train['q1_vector'], train['q2_vector'])])
X_test = np.array([np.concatenate((v1, v2)) for v1, v2 in zip(test['q1_vector'], test['q2_vector'])])
y_train = train['is_duplicate']
y_test = test['is_duplicate']

In [119]:
model = LogisticRegression()
model.fit(X_train, y_train)

# Предсказание и оценка модели
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(f"accuracy = {accuracy}")
print(f"precision = {precision}")
print(f"recall = {recall}")
print(f"f1_score = {f1}")

accuracy = 0.7164535358282421
precision = 0.673623334171486
recall = 0.4490144810941271
f1_score = 0.5388498903795482


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
df = pd.read_csv("train.csv")
df = df.drop(columns=["qid1", "qid2", "id"])
df = df.dropna()
df.reset_index(drop=True, inplace=True)

In [6]:
df['question1'] = df['question1'].apply(preprocess_text_wtv)
df['question2'] = df['question2'].apply(preprocess_text_wtv)

In [7]:
import gensim.downloader as api

In [8]:
word2vec_model = api.load('word2vec-google-news-300')

In [9]:
def question_to_vector(question, model):
    words = [word for word in question if word in model]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean([model[word] for word in words], axis=0)

# Предобработка текстов (разделение на слова)
df['question1'] = df['question1'].apply(lambda x: x.split())
df['question2'] = df['question2'].apply(lambda x: x.split())

# Преобразование вопросов в вектора
df['q1_vector'] = df['question1'].apply(lambda x: question_to_vector(x, word2vec_model))
df['q2_vector'] = df['question2'].apply(lambda x: question_to_vector(x, word2vec_model))

# Разделение данных на обучающую и тестовую выборки
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Преобразование векторов вопросов в numpy массивы
X_train = np.array([np.concatenate((v1, v2)) for v1, v2 in zip(train['q1_vector'], train['q2_vector'])])
X_test = np.array([np.concatenate((v1, v2)) for v1, v2 in zip(test['q1_vector'], test['q2_vector'])])
y_train = train['is_duplicate']
y_test = test['is_duplicate']

# Обучение модели логистической регрессии
model = LogisticRegression()
model.fit(X_train, y_train)

# Предсказание и оценка модели
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(f"accuracy = {accuracy}")
print(f"precision = {precision}")
print(f"recall = {recall}")
print(f"f1_score = {f1}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy = 0.6931905315491356
precision = 0.6380674947784984
recall = 0.38914588361491015
f1_score = 0.483446466497314
