Импортируем библиотеки

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [None]:
import nltk
import string
from nltk.corpus import stopwords
import pymorphy2
import pickle

Подгрузим данные

In [2]:
# Считываем обучающую и тестовую выборки
train = pd.read_csv('train_data.csv', encoding='utf', engine='python', index_col=0)
test = pd.read_csv('test_data.csv', encoding='utf', engine='python', index_col=0)
# Объеденим заголовки и текст в одну переменную
train_sentences = train.title.values + ' ' + train.text.values
test_sentences = test.title.values + ' ' + test.text.values

# tf_idf модель

Инициализируем tf_idf модель

Будем использовать настраиваемый токенайзер-функцию

In [4]:
#Свой токенайзер
def tokenize_best(file_text):
    morph = pymorphy2.MorphAnalyzer()
    #Разделим на токены
    tokens = nltk.word_tokenize(file_text)
 
    #Удалим пунктуацию и символы
    tokens = [i for i in tokens if ( i not in string.punctuation )]
 
    #Удаляем стоп-слова
    stop_words = stopwords.words('russian')
    stop_words.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', '—', 'к', 'на'])
    tokens = [i for i in tokens if ( i not in stop_words )]
 
    #удаляем кавычки
    tokens = [i.replace("«", "").replace("»", "") for i in tokens]
    
    # Лемматизация
    tokens = [morph.parse(i)[0].normal_form for i in tokens]
    
    # удаление '' символов
    tokens = [i for i in tokens if ( i !='' )]
    return tokens

In [5]:
#Производим настройки tf_idf
tf_idf2 = TfidfVectorizer(ngram_range=(1, 2), 
                         token_pattern=('\w{1,}'),
                         tokenizer=tokenize_best,
                         analyzer='word',
                         max_features=20000,
                         sublinear_tf=True,
                        )

Построим эту модель.

In [12]:
%%time
tf_idf_model = tf_idf2.fit(np.concatenate([train_sentences, test_sentences]))
# 15min 25s

Wall time: 25min 41s


In [13]:
%%time
train_features = tf_idf_model.transform(train_sentences)
test_features = tf_idf_model.transform(test_sentences)

Wall time: 25min 50s


In [16]:
# Сохраним наши признаки
pickle.dump(train_features, open("train_features.final.ranj2.pickle", "wb"))
pickle.dump(test_features, open("test_features.final.ranj2.pickle", "wb"))

# Обучение модели

In [7]:
#Считываем ранее обработанные признаки
with open('test_features_final_ranj2.pickle', 'rb') as dump_in:
    token_test = pickle.load(dump_in)
with open('train_features_final_ranj2.pickle', 'rb') as dump_in:
    token_train = pickle.load(dump_in)

In [8]:
lm = LogisticRegression(random_state=42,
                        C=10, 
                        max_iter=100000)

## Сеточный поиск лучших параметров

In [28]:
lm_params = {'penalty':['l2'],
             'C':[0.001, 0.01, 0.1, 1, 2, 5, 10, 20, 100],
             'solver':['lbfgs', 'liblinear'],
             'tol' : [10, 1, 0.1, 0.01, 0.001, 0.0001, 0.0001]
    
    
}
lm_search = GridSearchCV(estimator=lm, 
                         param_grid=lm_params, 
                         scoring ='roc_auc', 
                         cv=StratifiedKFold(10), 
                         n_jobs=-1,
                         verbose=1)

In [29]:
%%time
lm_search_fitted = lm_search.fit(X=token_train, y=pd.factorize(train.score)[0])

Fitting 10 folds for each of 126 candidates, totalling 1260 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:   23.7s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:  3.0min


Wall time: 3min 4s


[Parallel(n_jobs=-1)]: Done 1260 out of 1260 | elapsed:  3.1min finished


In [30]:
#лучший результат за всё обучение
lm_search_fitted.best_score_

0.9305058655272753

In [31]:
#Параметры модели
lm_search_fitted.best_estimator_

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=42, solver='liblinear',
          tol=0.1, verbose=0, warm_start=False)

In [20]:
#Делаем предсказания
predicts = lm_search_fitted.best_estimator_.predict_proba(token_test)[:, 0]

In [22]:
#Сохраняем результаты для отправки на Kaggle
sub = pd.DataFrame({'index': range(0, len(predicts)),
                    'score':predicts})
sub.to_csv('sub111.csv', index=False)
sub.head()

Unnamed: 0,index,score
0,0,0.986108
1,1,0.876189
2,2,0.751644
3,3,0.976215
4,4,0.705782
