In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
import re
from nltk.stem.porter import *
import pickle
import joblib

In [19]:
data = pd.read_csv("train.csv")
data = data.drop(columns=["qid1", "qid2", "id"])
data = data.dropna()
data.reset_index(drop=True, inplace=True)

In [20]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def delete_stop_words(text):
    return ' '.join([word.lower() for word in text.split() if word not in stop_words])

data['question1'] = data['question1'].apply(delete_stop_words)
data['question2'] = data['question2'].apply(delete_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/timurabdulkadirov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
def preprocess_text(text): 
    text = re.sub(r"[^\w\s]", '', text.lower())
    return text.split()

In [22]:
def train_test_stem(data2):
    stemmer = PorterStemmer()

    def preprocess_stem(text):
        return ' '.join([stemmer.stem(word) for word in preprocess_text(text)])
    
    data2['question1'] = data2['question1'].apply(preprocess_stem)
    data2['question2'] = data2['question2'].apply(preprocess_stem)
    print(data2.head())
    train, test = train_test_split(data2, test_size=0.2, random_state=0)
    return train, test

In [23]:
train, test = train_test_stem(data)

                                           question1  \
0      what step step guid invest share market india   
1               what stori kohinoor kohinoor diamond   
2       how i increas speed internet connect use vpn   
3                    whi i mental lone how i solv it   
4  which one dissolv water quikli sugar salt meth...   

                                           question2  is_duplicate  
0            what step step guid invest share market             0  
1  what would happen indian govern stole kohinoor...             0  
2                 how internet speed increas hack dn             0  
3               find remaind math2324math divid 2423             0  
4                 which fish would surviv salt water             0  


In [24]:
def bow (df_train, df_test):
    vectorizer = CountVectorizer()
    questions = pd.concat([df_train['question1'], df_train['question2']])
    vectorizer.fit(questions)
    joblib.dump(vectorizer, 'vectorizer.pkl')
    q1_train_bow = vectorizer.transform(df_train['question1'])
    q2_train_bow = vectorizer.transform(df_train['question2'])
    q1_test_bow = vectorizer.transform(df_test['question1'])
    q2_test_bow = vectorizer.transform(df_test['question2'])
    df_train_features = q1_train_bow + q2_train_bow # Вычитание дает результаты порядком хуже
    df_test_features = q1_test_bow + q2_test_bow
    return df_train_features, df_test_features

In [14]:
def ranfor(train_features, test_features, est):
    model = RandomForestClassifier(
        n_estimators=est,       
        criterion='entropy',          
        max_depth=None,           
        min_samples_split=2,       
        min_samples_leaf=1,   
        max_features='auto',      
        bootstrap=True,          
        random_state=42            
    )
    model.fit(train_features, train['is_duplicate'])
    predictions = model.predict(test_features)
    accuracy = accuracy_score(predictions, test['is_duplicate'])
    print(f"accuracy = {accuracy}")
    precision = precision_score(predictions, test['is_duplicate'])
    print(f"precision = {precision}")
    recall = recall_score(predictions, test['is_duplicate'])
    print(f"recall = {recall}")
    f1_met = f1_score(predictions, test['is_duplicate'])
    print(f"f1_score = {f1_met}")
    with open('random_forest_model.pkl', 'wb') as file:
        pickle.dump(model, file)

In [25]:
train_features, test_features = bow(train, test)

In [16]:
ranfor(train_features, test_features, 100)

  warn(


accuracy = 0.8260036112691385
precision = 0.7259421265141319
recall = 0.7845739845085276
f1_score = 0.7541201349202188
