In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd

import copy
import traceback
import datetime
import joblib
import re
import os
import random
import string

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import matplotlib.pyplot as plt
%matplotlib inline

import torch
from torch import nn
from torch.nn import functional as F

from tqdm.notebook import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import pymorphy2
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
class Config:
    seed = 42
    positive_file = "../data/positive.csv"
    negative_file = "../data/negative.csv"
    russian_stop_words = "../data/russian_stop_words.txt"
    english_stop_words = "../data/english_stop_words.txt"
    test_size = 0.3
    
config = Config()

In [4]:
def init_random_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic=True
    os.environ['PYTHONHASHSEED'] = str(seed)
    
init_random_seed(config.seed)

In [5]:
df = pd.read_csv("../data/preprocessed_text_v1.csv", index_col=False)

In [6]:
df.head()

Unnamed: 0,message,ttype
0,работа полный пиддес каждый закрытие месяц сви...,0
1,коллега сидеть рубиться urban terror долбать в...,0
2,говорят обещаной год ждать,0
3,желать хороший полёт удачный посадка быть очен...,0
4,обновить какой леший surf работать простоплеер,0


In [24]:
X_train, X_test, y_train, y_test = train_test_split(df['message'].values.astype('U'), df['ttype'], random_state=config.seed, test_size=config.test_size)

In [25]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((158783,), (68051,), (158783,), (68051,))

In [26]:
X_train = X_train.tolist()
X_test = X_test.tolist()
y_train = y_train.tolist()
y_test = y_test.tolist()

In [27]:
X_train[:10]

['luna самый самый любимый рождественский песенка год',
 'скачать симс лизин диск прийтись папка картинка поудалять пофига новый накачать',
 'появиться ощущение приближаться новое год ёлка радость поставить',
 'итак получить несколько зачёт неделя спасть усердно работать приболеть',
 'мозг кипеть спин разболеться',
 'хороший мотивация мысль стареть успеть',
 'равно мой солнышко просто разнообразие должный разный называть',
 'хороший учитель найти сложно мы ментор везти',
 'оказываться такой сладкое губа мммм forever alone',
 'дыы порнососа музыка слушать пытаться сам придумать фанфик']

In [28]:
y_train[:10]

[1, 0, 1, 0, 0, 0, 1, 0, 1, 1]

In [29]:
# create pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer(ngram_range=(1, 1), min_df=5)),  # strings to token integer counts
    ('tfidf', TfidfTransformer(norm="l2", smooth_idf=True, use_idf=True)),  # integer counts to weighted TF-IDF scores
    ('classifier', RandomForestClassifier()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

# this is where we define the values for GridSearchCV to iterate over
parameters = {
    'bow__min_df': [1, 2, 3, 4, 5],
    'bow__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__smooth_idf': (True, False),
    'classifier__alpha': (1e-2, 1e-3),
}

parameters = {
        "bow__min_df": [1, 2, 3, 4, 5],
        "bow__ngram_range": [(1, 1), (1, 2)],
        "tfidf__use_idf": (True, False),
        "tfidf__smooth_idf": (True, False),
        "classifier": [LogisticRegression()],
        "classifier__penalty": ['l2','l1'],
        "classifier__C": np.logspace(0, 4, 10)
}

In [30]:
# grid = GridSearchCV(pipeline, cv=5, param_grid=parameters, verbose=1, n_jobs=20, scoring=["accuracy", "precision", "recall", "f1", "roc_auc"], refit="f1")
grid = GridSearchCV(pipeline, cv=5, param_grid=parameters, verbose=1, n_jobs=20, scoring="f1")
grid.fit(X_train,y_train)
print(f"Best Model: {grid.best_score_} using {grid.best_params_}")

Fitting 5 folds for each of 800 candidates, totalling 4000 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    9.6s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:   53.0s
[Parallel(n_jobs=20)]: Done 410 tasks      | elapsed:  3.2min
[Parallel(n_jobs=20)]: Done 760 tasks      | elapsed: 13.0min
[Parallel(n_jobs=20)]: Done 1210 tasks      | elapsed: 16.1min
[Parallel(n_jobs=20)]: Done 1760 tasks      | elapsed: 20.0min
[Parallel(n_jobs=20)]: Done 2410 tasks      | elapsed: 24.1min
[Parallel(n_jobs=20)]: Done 3160 tasks      | elapsed: 28.0min
[Parallel(n_jobs=20)]: Done 4000 out of 4000 | elapsed: 32.6min finished


Best Model: 0.7363690658322289 using {'bow__min_df': 1, 'bow__ngram_range': (1, 2), 'classifier': LogisticRegression(C=2.7825594022071245), 'classifier__C': 2.7825594022071245, 'classifier__penalty': 'l2', 'tfidf__smooth_idf': False, 'tfidf__use_idf': True}


In [31]:
print(f"Best Model: {grid.best_score_} using {grid.best_params_}")

Best Model: 0.7363690658322289 using {'bow__min_df': 1, 'bow__ngram_range': (1, 2), 'classifier': LogisticRegression(C=2.7825594022071245), 'classifier__C': 2.7825594022071245, 'classifier__penalty': 'l2', 'tfidf__smooth_idf': False, 'tfidf__use_idf': True}


In [36]:
# save best model to current working directory
joblib.dump(grid, "sklearn_baseline_solution.pkl")
# load from file and predict using the best configs found in the CV step
model = joblib.load("sklearn_baseline_solution.pkl" )
# get predictions from best model above
y_preds = model.predict(X_test)
print('accuracy score: ',accuracy_score(y_test, y_preds))
print('\n')
print('confusion matrix: \n',confusion_matrix(y_test,y_preds))
print('\n')
print(classification_report(y_test, y_preds))

accuracy score:  0.7352867702164553


confusion matrix: 
 [[24043  9346]
 [ 8668 25994]]


              precision    recall  f1-score   support

           0       0.74      0.72      0.73     33389
           1       0.74      0.75      0.74     34662

    accuracy                           0.74     68051
   macro avg       0.74      0.74      0.74     68051
weighted avg       0.74      0.74      0.74     68051



In [37]:
y_preds = model.predict(X_test)
f1_score(y_test, y_preds, average="macro")

0.7426644953001342