Идея взята у https://www.kaggle.com/sudhirnl7/logistic-regression-tfidf

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
import pymorphy2
from allennlp.data.tokenizers import Tokenizer, PretrainedTransformerTokenizer

In [2]:
transformer_model = 'DeepPavlov/rubert-base-cased'
tokenizer = PretrainedTransformerTokenizer(transformer_model)

In [3]:
raw_train = pd.read_csv('/home/mlepekhin/data/ru_train')
raw_test = pd.read_csv('/home/mlepekhin/data/ru_test')
raw_train.head()

Unnamed: 0.1,Unnamed: 0,target,text
0,1532,A8,ОАО « Нижнекамскнефтехим » ( НКНХ ) не отказыв...
1,389,A11,... в ходе написания ходатайства : сделать его...
2,207,A14,3.2 . Т опливо и его характеристики . 3.3 . М ...
3,1574,A8,Президент России Дмитрий Медведев в субботу на...
4,196,A16,Что заставляло человечество меняться к лучшему...


In [4]:
import nltk
from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords

In [11]:
train_clean_text = [' '.join(map(str, tokenizer.tokenize(text))) for text in raw_train.text.values]
train_clean_label = raw_train.target.values
print(train_clean_label[:10])

['A8' 'A11' 'A14' 'A8' 'A16' 'A12' 'A16' 'A8' 'A4' 'A14']


In [12]:
test_clean_text = [' '.join(map(str, tokenizer.tokenize(text))) for text in raw_test.text.values]
test_clean_label = raw_test.target.values
print(test_clean_label[:10])

['A7' 'A17' 'A17' 'A11' 'A8' 'A8' 'A11' 'A12' 'A17' 'A11']


In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [23]:
class BigVectorizer:
    def __init__(self, max_word_features=5000, max_char_features=10000):
        self.vect_word = TfidfVectorizer(
            max_features=max_word_features, lowercase=True, analyzer='word',
            stop_words=stopwords.words('russian'), ngram_range=(1,3),dtype=np.float32
        )
        self.vect_char = TfidfVectorizer(
            max_features=max_char_features, lowercase=True, analyzer='char',
            stop_words=stopwords.words('russian'), ngram_range=(3,6),dtype=np.float32
        )

    def fit_transform(self, X):
        vect_word = self.vect_word.fit_transform(X)
        vect_char = self.vect_char.fit_transform(X)
        return sparse.hstack([vect_word, vect_char])
       
    def transform(self, X):
        vect_word = self.vect_word.transform(X)
        vect_char = self.vect_char.transform(X)
        return sparse.hstack([vect_word, vect_char])

In [24]:
vectorizer = BigVectorizer()
train_vect = vectorizer.fit_transform(train_clean_text)

In [25]:
test_vect = vectorizer.transform(test_clean_text)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score

In [27]:
def evaluate(predictor, X_train, X_test, y_train, y_test):
    predicted_train = predictor.predict(X_train)
    predicted_test = predictor.predict(X_test)
    
    #print('recall train', recall_score(predicted_train, y_train))
    #print('recall test', recall_score(predicted_test, y_test))
    print('accuracy train', accuracy_score(predicted_train, y_train))
    print('accuracy test', accuracy_score(predicted_test, y_test))
    #print('f1 train', f1_score(predicted_train, y_train))
    #print('f1 test', f1_score(predicted_test, y_test))

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
#lr_grid_search = GridSearchCV(
#    LogisticRegression(random_state=42),# class_weight='balanced'), 
#    {'C': [0.5, 1, 1.5, 2, 2.5, 3]}
#)
lr_estimator = LogisticRegression(random_state=42, C=3)

In [30]:
lr_estimator.fit(train_vect, train_clean_label)

LogisticRegression(C=3, random_state=42)

In [31]:
evaluate(lr_estimator, train_vect, test_vect, train_clean_label, test_clean_label)

accuracy train 0.9861782999308915
accuracy test 0.7763975155279503


In [25]:
import pickle
from os.path import join as pathjoin

def save_model(predictor, vectorizer, model_dir):
    !mkdir {model_dir}
    with open(pathjoin(model_dir, 'predictor'), 'wb') as fout:
        fout.write(pickle.dumps(predictor))
    with open(pathjoin(model_dir, 'vectorizer'), 'wb') as fout:
        fout.write(pickle.dumps(vectorizer))
        
def load_model(model_dir):
    return pickle.loads(open(pathjoin(model_dir, 'predictor'), 'rb').read()),\
           pickle.loads(open(pathjoin(model_dir, 'vectorizer'), 'rb').read())

In [26]:
save_model(lr_estimator, vectorizer, 'simple_lr')
new_lr, new_vectorizer = load_model('simple_lr')

mkdir: cannot create directory ‘simple_lr’: File exists


In [None]:
PretrainedTransformerTokenizer(transformer_model, max_length=MAX_TOKENS-2)