In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import json

# Download data

In [2]:
with open('./data/train.json') as f:
    raw_train = json.load(f)
with open('./data/test.json') as f:
    raw_test = json.load(f)

## Build TF-IDF language-model

In [3]:
def ru_token(string):
    return [i for i in word_tokenize(string) if re.match(r'[\u0400-\u04ffа́]+$', i)]

In [4]:
params = {}
params['tokenizer'] = ru_token
params['stop_words'] = stopwords.words('russian')
params['ngram_range'] = (1, 3)
params['min_df'] = 3

In [5]:
tfidf  = TfidfVectorizer(**params)

In [6]:
tfidf.fit([i['text'] for i in raw_train + raw_test])



TfidfVectorizer(min_df=3, ngram_range=(1, 3),
                stop_words=['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с',
                            'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его',
                            'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы',
                            'по', 'только', 'ее', 'мне', ...],
                tokenizer=<function ru_token at 0x7fccd9180b80>)

## Train validation set split

In [7]:
train = {}
val = {}
tmp = defaultdict(list)
for e in raw_train:
    tmp[e['sentiment']].append(e['text'])
for l in tmp:
    train[l], val[l] = train_test_split(tmp[l], test_size=0.2, random_state=2018)

## Upsampling align for balance

In [8]:
def upsampling_align(some_dict, random_state=2018):
    rand = np.random.RandomState(random_state)
    upper = max([len(some_dict[l]) for l in some_dict])
    print('upper bound: {}'.format(upper))
    tmp = {}
    for l in some_dict:
        if len(some_dict[l]) < upper:
            repeat_time = int(upper/len(some_dict[l]))
            remainder = upper % len(some_dict[l])
            _tmp = some_dict[l].copy()
            rand.shuffle(_tmp)
            tmp[l] = some_dict[l] * repeat_time + _tmp[:remainder]
            rand.shuffle(tmp[l])
        else:
            tmp[l] = some_dict[l]
    return tmp

In [9]:
btrain = upsampling_align(train)

upper bound: 3227


## Softmax regression model training

In [10]:
m_params = {}
m_params['solver'] = 'lbfgs'
m_params['multi_class'] = 'multinomial'

In [11]:
softmax = LogisticRegression(**m_params)

In [12]:
%%time
train_x = [j for i in sorted(btrain.keys()) for j in btrain[i]]
train_y = [i for i in sorted(btrain.keys()) for j in btrain[i]]
softmax.fit(tfidf.transform(train_x), train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(multi_class='multinomial')

# Evaluate the softmax regression model

### Accuracy

In [13]:
test_x = [j for i in sorted(val.keys()) for j in val[i]]
true = [i for i in sorted(val.keys()) for j in val[i]]

In [14]:
%%time

pred = softmax.predict(tfidf.transform(test_x))

In [15]:
accuracy_score(true, pred)

0.7156684815486993

### Macro recall

In [16]:
lab = LabelEncoder()
c_true = lab.fit_transform(true)
c_pred = lab.transform(pred)
print(classification_report(c_true, c_pred, target_names=lab.classes_, digits=5))

              precision    recall  f1-score   support

    negative    0.64650   0.70732   0.67554       287
     neutral    0.75134   0.69269   0.72083       807
    positive    0.70756   0.75313   0.72964       559

    accuracy                        0.71567      1653
   macro avg    0.70180   0.71771   0.70867      1653
weighted avg    0.71833   0.71567   0.71594      1653



### Balance score

In [17]:
bval = upsampling_align(val)

upper bound: 807


In [18]:
b_test_x = [j for i in sorted(bval.keys()) for j in bval[i]]
b_true = [i for i in sorted(bval.keys()) for j in bval[i]]
b_pred = softmax.predict(tfidf.transform(b_test_x))
lab = LabelEncoder()
c_true = lab.fit_transform(b_true)
c_pred = lab.transform(b_pred)
print(classification_report(c_true, c_pred, target_names=lab.classes_, digits=5))

              precision    recall  f1-score   support

    negative    0.82653   0.70260   0.75954       807
     neutral    0.60893   0.69269   0.64812       807
    positive    0.74541   0.75465   0.75000       807

    accuracy                        0.71665      2421
   macro avg    0.72696   0.71665   0.71922      2421
weighted avg    0.72696   0.71665   0.71922      2421



## Prediction

In [19]:
sub_pred = softmax.predict(tfidf.transform([i['text'] for i in raw_test]))
sub_df = pd.DataFrame()
sub_df['id'] =  [i['id'] for i in raw_test]
sub_df['sentiment'] = sub_pred

In [20]:
sub_df.head()

Unnamed: 0,id,sentiment
0,0,positive
1,1,positive
2,2,negative
3,3,positive
4,4,negative


In [21]:
sub_df.to_csv('softmax_reg.csv', index=False)