In [1]:
# %pip install pymorphy2
# %pip install nltk
# %pip install sklearn
# %pip install wordcloud
# %pip install stop_words

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import string
import pymorphy2
from scipy.sparse import *
import stop_words
import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

import pickle

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score

from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split

In [4]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [5]:
def display_conf_matrix(y_true, y_pred):
    conf_matrix = confusion_matrix(y_true, y_pred)
    cm_display = ConfusionMatrixDisplay(conf_matrix, display_labels=Y.unique())
    cm_display.plot()
    plt.show()

In [6]:
def print_grid_search_info(model):
    print(f'Best estimator -> {model.best_estimator_}\n\
Best Score -> {model.best_score_}\n\
Best Parameters -> {model.best_params_}\n\
Best index -> {model.best_index_}')

In [7]:
def get_gridsearch_for_model(model, parameters : dict) -> GridSearchCV:
    model_grid = GridSearchCV(
        estimator=model,
        param_grid=parameters,
        scoring=['f1_micro', 'accuracy', 'recall_macro'],
        refit='f1_micro',
        cv=3,
        verbose=3,
        error_score=0
    )
    return model_grid

In [8]:
def save_model(path:str, model):
    with open(path, mode='wb') as pickle_file:
        pickle.dump(model, pickle_file)

def load_model(path:str):
    with open('./models/lin_svc.pkl', mode='rb') as pickle_file:
        model = pickle.load(pickle_file)
    return model

In [9]:
def print_metrics(y_test, y_pred):
    print(f'f1_micro = {f1_score(y_test, y_pred, average="micro")}\nrecall_score = {recall_score(y_test, y_pred, average="macro")}\nprecision_score = {precision_score(y_test, y_pred, average="macro")}')
    

# Data preprocessing

In [10]:
test_dataset = pd.read_csv("data/test.csv")
train_dataset = pd.read_csv("data/train.csv")
train_dataset.head()

Unnamed: 0,Class Index,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


Let's prepare copies of train dataset with 4 different text cleaning techniques (one method at a time):  
- stop words removing
- punctuation removing
- trash removing
- digits removing

In [11]:
train_dataset_splitted_texts = [line.split('\n') for line in train_dataset['Description']]
pd.DataFrame(train_dataset_splitted_texts)

Unnamed: 0,0
0,"Reuters - Short-sellers, Wall Street's dwindli..."
1,Reuters - Private investment firm Carlyle Grou...
2,Reuters - Soaring crude prices plus worries\ab...
3,Reuters - Authorities have halted oil export\f...
4,"AFP - Tearaway world oil prices, toppling reco..."
...,...
119995,KARACHI (Reuters) - Pakistani President Perve...
119996,Red Sox general manager Theo Epstein acknowled...
119997,The Miami Dolphins will put their courtship of...
119998,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


In [12]:
train_removed_stopwords = [' '.join([word if word not in stop_words.get_stop_words('en') else '' for word in text]) for text in train_dataset_splitted_texts]
pd.DataFrame(train_removed_stopwords)

Unnamed: 0,0
0,"Reuters - Short-sellers, Wall Street's dwindli..."
1,Reuters - Private investment firm Carlyle Grou...
2,Reuters - Soaring crude prices plus worries\ab...
3,Reuters - Authorities have halted oil export\f...
4,"AFP - Tearaway world oil prices, toppling reco..."
...,...
119995,KARACHI (Reuters) - Pakistani President Perve...
119996,Red Sox general manager Theo Epstein acknowled...
119997,The Miami Dolphins will put their courtship of...
119998,PITTSBURGH at NY GIANTS Time: 1:30 p.m. Line: ...


In [13]:
train_removed_punctuation = [' '.join([(' '.join(text)).translate(str.maketrans('', '', str("!\"\'(),-./:;?\\`")))]) for text in train_dataset_splitted_texts]
pd.DataFrame(train_removed_punctuation)
# train_removed_punctuation

Unnamed: 0,0
0,Reuters Shortsellers Wall Streets dwindlingba...
1,Reuters Private investment firm Carlyle Group...
2,Reuters Soaring crude prices plus worriesabou...
3,Reuters Authorities have halted oil exportflo...
4,AFP Tearaway world oil prices toppling record...
...,...
119995,KARACHI Reuters Pakistani President Pervez M...
119996,Red Sox general manager Theo Epstein acknowled...
119997,The Miami Dolphins will put their courtship of...
119998,PITTSBURGH at NY GIANTS Time 130 pm Line Steel...


In [14]:
exclude_symbols = u''.join(['№', '«', 'ђ', '°', '±', '‚', 'ћ', '‰', '…', '»', 'ѓ', 'µ', '·', 'ґ', 'њ', 'ї', 'џ', 'є', '‹',
                            '‡', '†', '¶', 'ќ', '€', '“', 'ў', '§', '„', '”', '\ufeff', '’', 'љ', '›', '•', '—', '‘', 
                            '\x7f', '\xad', '¤', '\xa0', '\u200b', '–']) + string.punctuation
regex_symb = re.compile('[%s]' % re.escape(exclude_symbols))

In [15]:
train_removed_trash = [regex_symb.sub('', ' '.join(text)) for text in train_dataset_splitted_texts]
train_removed_trash = [re.sub(r' +', ' ', text) for text in train_removed_trash]
pd.DataFrame(train_removed_trash)

Unnamed: 0,0
0,Reuters Shortsellers Wall Streets dwindlingban...
1,Reuters Private investment firm Carlyle Groupw...
2,Reuters Soaring crude prices plus worriesabout...
3,Reuters Authorities have halted oil exportflow...
4,AFP Tearaway world oil prices toppling records...
...,...
119995,KARACHI Reuters Pakistani President Pervez Mu...
119996,Red Sox general manager Theo Epstein acknowled...
119997,The Miami Dolphins will put their courtship of...
119998,PITTSBURGH at NY GIANTS Time 130 pm Line Steel...


In [16]:
regex_digit = re.compile('[%s]' % re.escape(string.digits))
train_removed_digits = [regex_digit.sub('', ' '.join(text)) for text in train_dataset_splitted_texts]
pd.DataFrame(train_removed_digits)

Unnamed: 0,0
0,"Reuters - Short-sellers, Wall Street's dwindli..."
1,Reuters - Private investment firm Carlyle Grou...
2,Reuters - Soaring crude prices plus worries\ab...
3,Reuters - Authorities have halted oil export\f...
4,"AFP - Tearaway world oil prices, toppling reco..."
...,...
119995,KARACHI (Reuters) - Pakistani President Perve...
119996,Red Sox general manager Theo Epstein acknowled...
119997,The Miami Dolphins will put their courtship of...
119998,PITTSBURGH at NY GIANTS Time: : p.m. Line: Ste...


In [17]:
vectorizer = CountVectorizer(stop_words=stop_words.get_stop_words('en'), max_features=10000)

In [18]:
Y = train_dataset["Class Index"]
seed = 42

In [13]:
CVect_stopwords = vectorizer.fit_transform(train_removed_stopwords).toarray()



In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(CVect_stopwords, Y, test_size=0.25, random_state=seed)

In [None]:
parameters_logreg = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 0.5, 1.0, 10.0],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 500]
}

grid_log = get_gridsearch_for_model(LogisticRegression(), parameters_logreg)

grid_log.fit(X_train, Y_train)
print_grid_search_info(grid_log)

In [20]:
Y_pred_logreg = grid_log.predict(X_test)
print_metrics(Y_test, Y_pred_logreg)
save_model('./models/log_reg.pkl', grid_log)

accuracy_score = 0.9016666666666667
recall_score = 0.9014730717851448
precision_score = 0.9013132593705164


In [None]:
parameters_linSVC = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': np.linspace(0.001, 0.1, 5),
    'max_iter': [1000, 3000, 5000],
    'dual':[True]
}

grid_linearSVC = get_gridsearch_for_model(LinearSVC(), parameters_linSVC)

grid_linearSVC.fit(X_train, Y_train)
print_grid_search_info(grid_linearSVC)

In [22]:
Y_pred_linSVC = grid_linearSVC.predict(X_test)
print_metrics(Y_test, Y_pred_linSVC)
save_model('./models/lin_svc.pkl', grid_linearSVC)

accuracy_score = 0.9051999999999999
recall_score = 0.9050006500418074
precision_score = 0.9049231276210165


In [None]:
# parameters_RFC = {
#     'n_estimators': [10, 50, 100, 300],
#     'criterion': ['gini', 'entropy', 'log_loss'],
#     'max_features': ['sqrt', 'log2'],
# }

# grid_RFC = get_gridsearch_for_model(RandomForestClassifier(), parameters_RFC)

# grid_RFC.fit(X_train, Y_train)
# print_grid_search_info(grid_RFC)

In [None]:
# Y_pred_RFC = grid_RFC.predict(X_test)
# print_metrics(Y_test, Y_pred_RFC)
# save_model('./models/RFC.pkl', grid_RFC)

In [None]:
parameters_KNN = {
    'n_neighbors' : [5, 6, 7],
    'weights' : ['uniform', 'distance'],
    'leaf_size' : [1, 2, 3],
    'n_jobs' : [10]
}

grid_KNN = get_gridsearch_for_model(KNeighborsClassifier(), parameters_KNN)

grid_KNN.fit(X_train, Y_train)
print_grid_search_info(grid_KNN)

In [25]:
Y_pred_KNN = grid_KNN.predict(X_test)
print_metrics(Y_test, Y_pred_KNN)
save_model('./models/KNN.pkl', grid_RFC)

accuracy_score = 0.5876333333333333
recall_score = 0.5868685205825099
precision_score = 0.6719880795980494


In [None]:
# parameters_MNB = {
#     'alpha' : np.linspace(0.3, 0.4, 100)
# }

# grid_MNB = get_gridsearch_for_model(MultinomialNB(),parameters_MNB)

# grid_MNB.fit(X_train, Y_train)
# print_grid_search_info(grid_MNB)

In [None]:
# Y_pred_MNB = grid_MNB.predict(X_test)
# print_metrics(Y_test, Y_pred_MNB)
# save_model('./models/MNB.pkl', grid_RFC)

In [18]:
CVect_trash = vectorizer.fit_transform(train_removed_trash).toarray()



In [19]:
X_train, X_test, Y_train, Y_test = train_test_split(CVect_trash, Y, test_size=0.25, random_state=seed)

In [None]:
parameters_logreg = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 0.5, 1.0, 10.0],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 500]
}

grid_log = get_gridsearch_for_model(LogisticRegression(), parameters_logreg)

grid_log.fit(X_train, Y_train)
print_grid_search_info(grid_log)

In [21]:
Y_pred_logreg = grid_log.predict(X_test)
print_metrics(Y_test, Y_pred_logreg)
save_model('./models/log_reg_trash.pkl', grid_log)

accuracy_score = 0.8989
recall_score = 0.8987174214816545
precision_score = 0.8985895429774059


In [None]:
parameters_linSVC = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': np.linspace(0.001, 0.1, 5),
    'max_iter': [1000, 3000, 5000],
    'dual':[True]
}

grid_linearSVC = get_gridsearch_for_model(LinearSVC(), parameters_linSVC)

grid_linearSVC.fit(X_train, Y_train)
print_grid_search_info(grid_linearSVC)

In [None]:
Y_pred_linSVC = grid_linearSVC.predict(X_test)
print_metrics(Y_test, Y_pred_linSVC)
save_model('./models/lin_svc_trash.pkl', grid_linearSVC)

In [20]:
CVect_digits = vectorizer.fit_transform(train_removed_digits).toarray()



In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(CVect_digits, Y, test_size=0.25, random_state=seed)

In [22]:
parameters_logreg = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 0.5, 1.0, 10.0],
    'solver': ['lbfgs', 'liblinear'],
    'max_iter': [100, 500]
}

grid_log = get_gridsearch_for_model(LogisticRegression(), parameters_logreg)

grid_log.fit(X_train, Y_train)
print_grid_search_info(grid_log)

Fitting 3 folds for each of 40 candidates, totalling 120 fits
[CV 1/3] END C=0.01, max_iter=100, penalty=l1, solver=lbfgs; accuracy: (test=0.000) f1_micro: (test=0.000) recall_macro: (test=0.000) total time=   2.4s
[CV 2/3] END C=0.01, max_iter=100, penalty=l1, solver=lbfgs; accuracy: (test=0.000) f1_micro: (test=0.000) recall_macro: (test=0.000) total time=   1.6s
[CV 3/3] END C=0.01, max_iter=100, penalty=l1, solver=lbfgs; accuracy: (test=0.000) f1_micro: (test=0.000) recall_macro: (test=0.000) total time=   1.7s
[CV 1/3] END C=0.01, max_iter=100, penalty=l1, solver=liblinear; accuracy: (test=0.793) f1_micro: (test=0.793) recall_macro: (test=0.793) total time=   7.5s
[CV 2/3] END C=0.01, max_iter=100, penalty=l1, solver=liblinear; accuracy: (test=0.793) f1_micro: (test=0.793) recall_macro: (test=0.793) total time=   6.8s
[CV 3/3] END C=0.01, max_iter=100, penalty=l1, solver=liblinear; accuracy: (test=0.794) f1_micro: (test=0.794) recall_macro: (test=0.794) total time=   6.8s
[CV 1/3]

In [None]:
Y_pred_logreg = grid_log.predict(X_test)
print_metrics(Y_test, Y_pred_logreg)
save_model('./models/log_reg_digits.pkl', grid_log)

In [None]:
parameters_linSVC = {
    'penalty': ['l1', 'l2'],
    'loss': ['hinge', 'squared_hinge'],
    'C': np.linspace(0.001, 0.1, 5),
    'max_iter': [1000, 3000, 5000],
    'dual':[True]
}

grid_linearSVC = get_gridsearch_for_model(LinearSVC(), parameters_linSVC)

grid_linearSVC.fit(X_train, Y_train)
print_grid_search_info(grid_linearSVC)

In [None]:
Y_pred_linSVC = grid_linearSVC.predict(X_test)
print_metrics(Y_test, Y_pred_linSVC)
save_model('./models/lin_svc_digits.pkl', grid_linearSVC)

In [None]:
# CVect_punctuation = vectorizer.fit_transform(train_removed_punctuation).toarray()

# Comparison

In [None]:
def fill_metrix_df(metrix, name, accuracy, recall, precision):
    metrix[name] = [accuracy, recall, precision]

all_metrics_df = pd.DataFrame(index=['f1_micro', 'Recall', 'Precision'])
fill_metrix_df(all_metrics_df, 'LogisticRegression',
                f1_micro(Y_test, Y_pred_logreg), 
                recall_score(Y_test, Y_pred_logreg, average='macro'), 
                precision_score(Y_test, Y_pred_logreg, average='macro'))
fill_metrix_df(all_metrics_df, 'LinearSVC',
                accuracy_score(Y_test, Y_pred_linearSVC), 
                recall_score(Y_test, Y_pred_linearSVC, average='macro'), 
                precision_score(Y_test, Y_pred_linearSVC, average='macro'))
# fill_metrix_df(all_metrics_df, 'MultinomialNB',
#                 accuracy_score(Y_test, Y_pred_MNB), 
#                 recall_score(Y_test, Y_pred_MNB, average='macro'), 
#                 precision_score(Y_test, Y_pred_MNB, average='macro'))

LogisticRegression

In [None]:
display_conf_matrix(Y_test, Y_pred_logreg)

As we can see, Logistic regression have a few problems with prediction of two categories: "nauka" and "hitech".

LinearSVC

In [None]:
display_conf_matrix(Y_test, Y_pred_linearSVC)

LinearSVC model have confused "nauka" and "hitech" more often like LogisticRegression.

KNeigboursClassifier

In [None]:
display_conf_matrix(Y_test, Y_pred_KNN)

KNeigbors classifier confused almost everything, and now it's clear why it has the lowest scores between others.

GradientBoostingClassifier

In [None]:
display_conf_matrix(Y_test, Y_pred_GBC)

GradientBoosting classifier has more confusions between "nauka" and "hitech" than logistic regression and LinearSVC.

RandomForestClassifier

In [None]:
display_conf_matrix(Y_test, Y_pred_RFC)

As we can see, RandomForest classifier good at prediction of "auto" label, but that classifier, like others, confuse "hitech" and "nauka".

MultinomialNB

In [None]:
display_conf_matrix(Y_test, Y_pred_MNB)

Like other models, MultinonialNB confuses "nauka" and "hitech"

All metrics are displayed below

In [None]:
all_metrics_df

### Summary