### Импортируем библиотеки

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords

In [3]:
# Считываем обучающую и тестовую выборки
train = pd.read_csv('train_data.csv', encoding='utf', engine='python', index_col=0)
test = pd.read_csv('test_data.csv', encoding='utf', engine='python', index_col=0)
# Объеденим заголовки и текст в одну переменную
train_sentences = train.title.values + ' ' + train.text.values
test_sentences = test.title.values + ' ' + test.text.values

### tf_idf модель

In [2]:
# можно ещё попробовать удалить все цифры
import nltk
import string
from nltk.corpus import stopwords
import pymorphy2
 
def tokenize_best(file_text):
    morph = pymorphy2.MorphAnalyzer()
    #Разделим на токены
    tokens = nltk.word_tokenize(file_text)
 
    #Удалим пунктуацию и символы
    tokens = [i for i in tokens if ( i not in string.punctuation )]
 
    #Удаляем стоп-слова
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [i for i in tokens if ( i not in stop_words )]
 
    #удаляем кавычки
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    
    # Лемматизация
    tokens = [morph.parse(i)[0].normal_form for i in tokens]
    
    # удаление '' символов
    tokens = [i for i in tokens if ( i !='' )]
    return tokens

In [4]:
# Задаём параметры tf-idf и указываем нашу функцию токенизации!
tf_idf2 = TfidfVectorizer(ngram_range=(1, 3),
    max_df=0.8,
    min_df=10,
    analyzer='word',
    tokenizer=tokenize_best,
    token_pattern=None)  

In [5]:
# Для сохранения обученной модели
import pickle
import time

In [6]:
%%time
s_time = time.time()
tf_idf_model = tf_idf2.fit(np.concatenate([train_sentences, test_sentences]))
print((time.time() - s_time)/60.)
# 15min 25s

15.46509609222412
Wall time: 15min 27s


In [7]:
# сохраним модель, чтобы не обучать постоянно
pickle.dump(tf_idf_model, open("tfidf.final.pickle", "wb"))

In [8]:
%%time
train_features = tf_idf_model.transform(train_sentences)
test_features = tf_idf_model.transform(test_sentences)

Wall time: 15min 49s


In [9]:
# Сохраним наши признаки тоже
pickle.dump(train_features, open("train_features.final.pickle", "wb"))
pickle.dump(test_features, open("test_features.final.pickle", "wb"))

### Настройка и обучение Логистической Регрессии

In [10]:
lm = LogisticRegression(n_jobs=-1,
                        penalty='l2',
                        tol=0.000000001,
                        random_state=42,
                        C=10, 
                        max_iter=100000)

In [11]:
lm_params = {'solver':['lbfgs'],
             'C':[0.001, 0.01, 0.1, 1, 2, 5, 10, 20, 100],
             'tol' : [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.0001]   
}
lm_search = GridSearchCV(estimator=lm, 
                         param_grid=lm_params, 
                         scoring ='accuracy', 
                         cv=StratifiedKFold(10), 
                         n_jobs=-1,
                         verbose=1)

In [12]:
%%time
lm_search_fitted = lm_search.fit(X=train_features, y=pd.factorize(train.type)[0])
print(lm_search_fitted.best_score_)

Fitting 10 folds for each of 63 candidates, totalling 630 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   55.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 630 out of 630 | elapsed: 12.6min finished


0.7145667140203257
Wall time: 12min 35s


### Делаем предсказание и сохраняем результат

In [13]:
predicts = lm_search_fitted.best_estimator_.predict(test_features)
predicts_names = pd.factorize(train.type)[1][predicts]

In [14]:
sub = pd.DataFrame({'index': range(0, len(predicts)),
                    'type': predicts_names})
sub.to_csv('sub2.csv', index=False)