In [3]:
import pandas as pd
import numpy as np
import string, re, random
import pickle

from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [9]:
stemmer = SnowballStemmer('russian')

In [5]:
with open(f'data/russian.txt', 'r') as file:
    stopwords = file.readlines()
    stopwords = [ word.replace('\n', '') for word in stopwords ]

In [6]:
def preproc(text):
    text = text.lower()
    # whitespace
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    text = text.split(' ')
    # remove stopwords
    text = [ word for word in text if word not in stopwords ]
    # stemmer
    text = [ stemmer.stem(word) for word in text ]
    return (' ').join(text)

In [45]:
CLF_PATH_1 = 'clf_1.pkl'
CLF_PATH_2 = 'clf_2.pkl'
TFIDF_PATH = 'tfidf.pkl'
SUBMIT_PATH = 'submit.csv'

def train(train_path):
    # set seed     
    my_seed = 12345
    np.random.seed(my_seed)
    random.seed(my_seed)
    
    # load data     
    train_df = pd.read_csv(train_path)
    df_0 = pd.read_csv('best_sample_0.csv', usecols=['text', 'label']).sample(1000, random_state=my_seed)
    df_0.text = [ preproc(text) for text in df_0.text ]
    train_df = pd.concat([df_0, train_df])
    
    # data transform     
    tfidf = TfidfVectorizer(smooth_idf=True, ngram_range=(3,6), analyzer='char_wb')
    x_train = tfidf.fit_transform(train_df.text)
    y_train = train_df.label
    
    # model design 
    # clf 1
    estimators = [
        ('sgd', SGDClassifier(loss='modified_huber', alpha=1e-5, 
                              random_state=my_seed, average=False,
                              class_weight={0: 0.25, 1: 0.4, 2: 0.38})),
        ('bayes', MultinomialNB(alpha=0.05, class_prior=[0.27, 0.41, 0.38])),
    ]
    clf_1 = VotingClassifier(estimators=estimators, voting='soft', weights=[0.519, 0.428])
    clf_1.fit(x_train, y_train)
    
    # clf 2
    pred = clf_1.predict_proba(x_train)
    clf_2 = LogisticRegression(multi_class='multinomial',
                              class_weight={0: 0.23, 1: 0.52, 2: 0.37})
    clf_2.fit(pred, y_train)

    # save clf_1, clf_2, tfidf     
    pickle.dump(clf_1, open(CLF_PATH_1, 'wb'))
    pickle.dump(clf_2, open(CLF_PATH_2, 'wb'))
    pickle.dump(tfidf, open(TFIDF_PATH, 'wb'))

def predict(test_path):
    # loadings
    test_df = pd.read_csv(test_path)
    tfidf = pickle.load(open(TFIDF_PATH, 'rb'))
    clf_1 = pickle.load(open(CLF_PATH_1, 'rb'))
    clf_2 = pickle.load(open(CLF_PATH_2, 'rb'))
    
    # data transform     
    x_test = tfidf.transform(test_df.text)
    
    # make predictions
    temp_pred = clf_1.predict_proba(x_test)
    y_pred = clf_2.predict(temp_pred)
    
    # submit predictions
    test_df['label'] = y_pred
    test_df.to_csv(SUBMIT_PATH)

In [46]:
train('data/my_train.csv')
predict('data/my_test.csv')

In [47]:
y_pred = pd.read_csv('submit.csv')['label'].values
y_test =  pd.read_csv('my_test_copy.csv')['label'].values
f1_score(y_test, y_pred, average='micro')

0.898125

In [48]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.68      0.80      0.74      1292
           1       0.98      0.99      0.99      3562
           2       0.91      0.83      0.87      3146

    accuracy                           0.90      8000
   macro avg       0.86      0.88      0.86      8000
weighted avg       0.90      0.90      0.90      8000

