In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import re

In [2]:
# Load the semicolon-separated, UTF-8 encoded CSV into a DataFrame.
data = pd.read_csv(
    './ogvdata.csv',
    sep=';',
    encoding='utf-8',
)

In [3]:
# Model input is the 'title' column; the target is the 'class_level' column.
train_data_full, train_label_full = data['title'], data['class_level']

In [4]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
train, test, train_labels, test_labels = train_test_split(
    train_data_full,
    train_label_full,
    test_size=0.33,
    random_state=42,
)

In [5]:
# TF-IDF features fed into a 100-tree random forest.
# random_state is fixed (same seed as the train/test split) so that refitting
# on a fresh kernel rebuilds the same forest and therefore the same metrics;
# without it every run produces slightly different scores.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Pipeline.fit returns the pipeline itself, so `model` is an alias of `text_clf`.
model = text_clf.fit(train, train_labels)

In [6]:
# Predict a label for every title in the held-out test split.
preds = text_clf.predict(test)

In [7]:

# Share of test titles whose predicted label equals the true label.
test_accuracy = accuracy_score(test_labels, preds)
print(test_accuracy)

0.9946800576741411


In [8]:
# Per-class probability estimates for a single hand-written title.
sample_title = 'пфр крым область'
text_clf.predict_proba([sample_title])

array([[0.27, 0.04, 0.69]])

In [9]:
# Predicted class label for a single hand-written title.
query_titles = ['пфр крым область']
text_clf.predict(query_titles)

array(['федеральный'], dtype=object)

In [10]:
# Cross-validate the whole pipeline on the complete dataset with cv=5.
scores = cross_val_score(
    text_clf,
    train_data_full,
    train_label_full,
    cv=5,
)

In [11]:
# Display the array of scores returned by cross_val_score (one per fold).
scores

array([0.98425068, 0.99466776, 0.98580688, 0.9958976 , 0.99565146])

In [12]:
# The vendored `sklearn.externals.joblib` was deprecated in scikit-learn 0.21
# and removed in 0.23; use the standalone package and fall back to the old
# location only on legacy installs that do not ship joblib separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Single source of truth for the artifact location used by dump and load.
MODEL_PATH = 'model_ogv.pkl'

# Dump model
joblib.dump(text_clf, MODEL_PATH)
# Load model
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
text_clf = joblib.load(MODEL_PATH)