Идея взята из ноутбука https://www.kaggle.com/sudhirnl7/logistic-regression-tfidf

In [1]:
import nltk
nltk.download('stopwords')
import numpy as np
import pandas as pd

from scipy import sparse

[nltk_data] Downloading package stopwords to /home/mike/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the training and test tables from CSV files in the working directory.
# NOTE(review): the preview below shows 'Unnamed: 0' / 'Unnamed: 0.1' columns,
# which looks like a saved index being re-read as data — confirm whether
# index_col should be set when loading.
raw_train = pd.read_csv('train_df.csv')
raw_test = pd.read_csv('test_df.csv')
# Show the first rows of the training frame as the cell output.
raw_train.head()

Unnamed: 0.1,Unnamed: 0,hubs,likes,text,title
0,599,программирование,6,"\nЗдравствуйте, по материалам опубликованного ...",Цикл статей «Изучаем VoIP-движок Mediastreamer...
1,752,гаджеты|будущее здесь,3,В школе я «изучал» французский язык. Не зря я ...,Волшебный английский
2,2016,программирование,23,Введение\nВ последние несколько лет голосовые ...,Технологии ASR и TTS для прикладного программи...
3,1001,научно-популярное,5,"\n\nУчёные из Института материаловедения, подр...",Учёные объявили о появлении замены дорогого ма...
4,2514,программирование,23,Было ли вам когда-либо интересно написать свою...,Разработка шахматной программы


In [3]:
import nltk

from nltk import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords

In [4]:
def make_label_matrix(label_list):
    """Convert '|'-separated hub strings into a multi-hot label matrix.

    Args:
        label_list: Sequence of hub strings such as
            ``'гаджеты|будущее здесь'``. Missing entries (``None``, float
            NaN, empty strings or the literal ``'nan'``) yield an all-zero
            row. Whitespace around individual labels is ignored.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(label_list), 10)`` where
        ``result[i, j] == 1`` iff sample ``i`` carries hub ``j``; the column
        order follows ``id_to_label`` below.

    Raises:
        KeyError: If a label is not one of the ten known hubs.
    """
    id_to_label = ['научно-популярное', 'гаджеты', 'программирование',
                   'космонавтика', 'информационная безопасность', 'it-компании',
                   'компьютерное железо', 'разработка веб-сайтов',
                   'diy или сделай сам', 'будущее здесь']
    label_to_id = {label: label_id for label_id, label in enumerate(id_to_label)}
    # Allocate with the final dtype directly instead of float zeros + astype copy.
    result_matr = np.zeros((len(label_list), len(id_to_label)), dtype=int)

    for row, raw_labels in enumerate(label_list):
        # None would stringify to 'None' and trigger a spurious KeyError.
        if raw_labels is None:
            continue
        for label in str(raw_labels).split('|'):
            label = label.strip()
            # Float NaN stringifies to 'nan'; '' comes from empty cells or
            # stray separators. Neither is a real hub.
            if not label or label == 'nan':
                continue
            result_matr[row, label_to_id[label]] = 1
    return result_matr

In [5]:
# Flatten every training text onto one line by replacing newlines with spaces.
train_clean_text = [text.replace('\n', ' ') for text in raw_train.text.values]
# Encode the '|'-separated hubs column as a multi-hot matrix.
train_clean_label = make_label_matrix(raw_train.hubs.values)
# Sanity check: print the first ten label rows.
print(train_clean_label[:10])

[[0 0 1 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1]
 [0 0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]]


In [6]:
# Apply the same newline flattening and label encoding to the test frame.
test_clean_text = [text.replace('\n', ' ') for text in raw_test.text.values]
test_clean_label = make_label_matrix(raw_test.hubs.values)
# Sanity check: print the first ten label rows.
print(test_clean_label[:10])

[[1 0 0 1 0 0 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 0 0 0 1]
 [1 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0]
 [1 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0]
 [0 0 1 0 0 1 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 1]]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
class BigVectorizer:
    """Concatenate word-level and character-level TF-IDF features.

    Two ``TfidfVectorizer`` instances are fitted on the same corpus: one over
    word 1-3-grams (with Russian stop words removed) and one over character
    3-6-grams. Their outputs are stacked column-wise, word features first.
    """

    def __init__(self, max_word_features=2048, max_char_features=2048):
        """Build the two underlying vectorizers.

        Args:
            max_word_features: Vocabulary cap for the word n-gram vectorizer.
            max_char_features: Vocabulary cap for the char n-gram vectorizer.
        """
        self.vect_word = TfidfVectorizer(
            max_features=max_word_features, lowercase=True, analyzer='word',
            stop_words=stopwords.words('russian'), ngram_range=(1, 3), dtype=np.float32
        )
        # No stop_words here: scikit-learn only applies them when
        # analyzer == 'word', so passing them to a char analyzer had no effect
        # and triggers a UserWarning in recent releases.
        self.vect_char = TfidfVectorizer(
            max_features=max_char_features, lowercase=True, analyzer='char',
            ngram_range=(3, 6), dtype=np.float32
        )

    def fit_transform(self, X):
        """Fit both vectorizers on ``X`` and return the stacked features.

        Args:
            X: Iterable of raw text documents.

        Returns:
            Sparse matrix with the word-feature columns followed by the
            char-feature columns.
        """
        vect_word = self.vect_word.fit_transform(X)
        vect_char = self.vect_char.fit_transform(X)
        return sparse.hstack([vect_word, vect_char])

    def transform(self, X):
        """Vectorize ``X`` with the already-fitted vocabularies.

        Args:
            X: Iterable of raw text documents.

        Returns:
            Sparse matrix with the same column layout as ``fit_transform``.
        """
        vect_word = self.vect_word.transform(X)
        vect_char = self.vect_char.transform(X)
        return sparse.hstack([vect_word, vect_char])

In [9]:
# Fit the combined word/char TF-IDF vectorizer (default feature caps) on the
# training texts only and vectorize them in the same pass.
vectorizer = BigVectorizer()
train_vect = vectorizer.fit_transform(train_clean_text)

In [10]:
# Vectorize the test texts with the vocabularies fitted on the training set
# (transform only, no refit).
test_vect = vectorizer.transform(test_clean_text)

In [14]:
import pickle

# Cache the vectorized matrices on disk. The `with` blocks guarantee the files
# are flushed and closed, where the bare open() calls leaked the handles.
with open('train_vect.pcl', 'wb') as fout:
    pickle.dump(train_vect, fout)
with open('test_vect.pcl', 'wb') as fout:
    pickle.dump(test_vect, fout)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.multiclass import OneVsRestClassifier

In [16]:
def evaluate(predictor, X_train, X_test, y_train, y_test):
    """Print the accuracy of a fitted predictor on the train and test sets.

    Args:
        predictor: Fitted estimator exposing ``predict``.
        X_train: Training features passed to ``predictor.predict``.
        X_test: Test features passed to ``predictor.predict``.
        y_train: Ground-truth labels for ``X_train``.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        None. The two scores are printed.
    """
    predicted_train = predictor.predict(X_train)
    predicted_test = predictor.predict(X_test)

    # accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the
    # printed values are unchanged, but the correct order keeps this safe if
    # an asymmetric metric (recall, precision) is swapped in.
    print('accuracy train', accuracy_score(y_train, predicted_train))
    print('accuracy test', accuracy_score(y_test, predicted_test))

In [17]:
from sklearn.model_selection import GridSearchCV

In [27]:
# OneVsRestClassifier trains one binary LogisticRegression per label column,
# so LogisticRegression's own `multi_class` option has no effect here. It is
# omitted because recent scikit-learn releases deprecate that parameter.
lr_estimator = OneVsRestClassifier(LogisticRegression(random_state=42, C=3))

In [28]:
# Train the one-vs-rest logistic regression on the TF-IDF features and the
# multi-hot hub matrix.
lr_estimator.fit(train_vect, train_clean_label)

OneVsRestClassifier(estimator=LogisticRegression(C=3, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='ovr', n_jobs=None,
                                                 penalty='l2', random_state=42,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [29]:
# Report train/test accuracy of the fitted logistic-regression model.
evaluate(lr_estimator, train_vect, test_vect, train_clean_label, test_clean_label)

accuracy train 0.7075625680087051
accuracy test 0.6800870511425462


In [None]:
from sklearn.svm import SVC

# Same one-vs-rest setup, with a default-parameter SVC as the per-label model.
# NOTE(review): this cell has no execution count or output, so it appears
# never to have been run — confirm it finishes in reasonable time.
svm_estimator = OneVsRestClassifier(SVC())
svm_estimator.fit(train_vect, train_clean_label)
evaluate(svm_estimator, train_vect, test_vect, train_clean_label, test_clean_label)

In [None]:
import pickle
from os.path import join as pathjoin

def save_model(predictor, vectorizer, model_dir):
    """Pickle a predictor and its vectorizer into ``model_dir``.

    Two files named ``predictor`` and ``vectorizer`` are written. The
    directory (including missing parents) is created when absent, and
    existing files are overwritten.

    Args:
        predictor: Fitted model object to persist.
        vectorizer: Fitted feature extractor to persist alongside it.
        model_dir: Target directory path.
    """
    # Local import: only os.path.join is imported at cell level.
    import os

    # Replaces the IPython-only `!mkdir {model_dir}`: works outside a notebook,
    # does not fail when the directory already exists, and avoids interpolating
    # the path into a shell command.
    os.makedirs(model_dir, exist_ok=True)
    with open(pathjoin(model_dir, 'predictor'), 'wb') as fout:
        pickle.dump(predictor, fout)
    with open(pathjoin(model_dir, 'vectorizer'), 'wb') as fout:
        pickle.dump(vectorizer, fout)
        
def load_model(model_dir):
    """Load the objects stored in ``model_dir`` as ``predictor``/``vectorizer``.

    Args:
        model_dir: Directory containing the pickled ``predictor`` and
            ``vectorizer`` files.

    Returns:
        Tuple ``(predictor, vectorizer)``.

    Raises:
        FileNotFoundError: If either file is missing.
    """
    # SECURITY: unpickling can execute arbitrary code; only load directories
    # produced by a trusted source.
    # `with` closes the handles, which the bare open(...).read() calls leaked.
    with open(pathjoin(model_dir, 'predictor'), 'rb') as fin:
        predictor = pickle.load(fin)
    with open(pathjoin(model_dir, 'vectorizer'), 'rb') as fin:
        vectorizer = pickle.load(fin)
    return predictor, vectorizer

In [None]:
# Persist the logistic-regression model with its vectorizer, then read both
# back from the same directory.
save_model(lr_estimator, vectorizer, 'simple_lr')
new_lr, new_vectorizer = load_model('simple_lr')

In [None]:
# Persist the SVM model together with the shared vectorizer.
save_model(svm_estimator, vectorizer, 'simple_svm')