In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json


from tqdm import tqdm_notebook

# Sentiment Analysis in Russian 
(from https://www.kaggle.com/c/sentiment-analysis-in-russian/data)

The goal is to estimate sentiment of news in russian. 

## Load data

In [2]:
!wget https://raw.githubusercontent.com/hushchyn-mikhail/hse_se_ml/s08/2020/s08-nlp/Data/train.json

--2022-04-09 09:23:22--  https://raw.githubusercontent.com/hushchyn-mikhail/hse_se_ml/s08/2020/s08-nlp/Data/train.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 59298269 (57M) [text/plain]
Saving to: ‘train.json’


2022-04-09 09:23:24 (32,3 MB/s) - ‘train.json’ saved [59298269/59298269]



In [3]:
# Load data
# with open('Data/train.json') as json_file:
with open('train.json') as json_file:
    data = json.load(json_file)

In [4]:
# Show example
num = 1 # 100 - pos

print("ID: ",          data[num]["id"], "\n")
print("Text: \n",      data[num]["text"])
print("Sentiment: ",   data[num]["sentiment"], "\n")

ID:  1957 

Text: 
 Медики рассказали о состоянии пострадавшего мужчины, на которого было совершено нападение возле отделения банка по Тимирязева. Как прокомментировали Tengrinews.kz в пресс-службе Управления здравоохранения Алматы, с места происшествия в службу скорой помощи обратились двое человек. 

«Одному из них на месте была оказана медицинская помощь. От госпитализации он отказался. Второй пациент был доставлен в больницу скорой неотложной помощи (БСНП) с сотрясением головного мозга, ушибленной раной головы. Состояние на данный момент оценивается ближе к удовлетворительному. Пока он проходит обследование в больнице», — сообщили в Управлении здравоохранения Алматы.  

Напомним, в Алматы на пересечении улиц Тимирязева и Маркова возле БЦ «Алатау Гранд» произошла стрельба, ориентировочно в обеденное время. В здании расположены отделения банков «ВТБ» и «Сбербанк». 

В настоящее время полицейские разыскивают подозреваемых в стрельбе. По факту нападения в местном управлении внутренних 

## Tokenization and data cleaning

Let's split each text into words (**tokenizations**) and remove all **stop words** and punctuation characters. **Stop words** are words that commonly used in texts and can be ignored losing the texts meaning.

<center><img src="img/tokenization.png"></center>

In [5]:
import string # for work with strings
import nltk   # Natural Language Toolkit

In [6]:
# get russian stop words
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')

# example of stop words
stop_words[:10]

[nltk_data] Downloading package stopwords to /Users/atsky/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со']

In [7]:
# punctuation characters
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
# define word tokenizer
word_tokenizer = nltk.WordPunctTokenizer()

In [9]:
def process_data(data):
    texts = []
    targets = []

    for item in data:
        
        # collect nlabels of news
        if item['sentiment'] == 'negative':
            targets.append(0)
        else:
            targets.append(1)
        
        text_lower = item['text'].lower() # convert words in a text to lower case
        tokens     = word_tokenizer.tokenize(text_lower) # splits the text into tokens (words)
        
        # remove punct and stop words from tokens
        tokens = [word for word in tokens if (word not in string.punctuation and word not in stop_words)]
        
        texts.append(tokens) # collect the text tokens
    
    return texts, targets

In [10]:
# run tokenization and data cleaning
texts, y = process_data(data)

In [11]:
# example
i = 1
print("Label: ", y[i])
print("Tokens: ", texts[i][:5])

Label:  0
Tokens:  ['медики', 'рассказали', 'состоянии', 'пострадавшего', 'мужчины']


## Words normalization

Here we will consider 2 ways of words normalizing: **stemming** and **lemmatization**.

### Stemming

<center><img src="img/stem2.svg" width="400"></center>

In [12]:
from nltk.stem.snowball import SnowballStemmer 

# define stemmer
stemmer = SnowballStemmer("russian")

In [13]:
# example of its work
i = 1
for aword in texts[i][:10]:
    aword_stem = stemmer.stem(aword)
    print("Before: %s, After: %s" % (aword, aword_stem))

Before: медики, After: медик
Before: рассказали, After: рассказа
Before: состоянии, After: состоян
Before: пострадавшего, After: пострада
Before: мужчины, After: мужчин
Before: которого, After: котор
Before: совершено, After: соверш
Before: нападение, After: нападен
Before: возле, After: возл
Before: отделения, After: отделен


### Lemmatization

Lemmatization convert a word to its normal form.

<center><img src="img/lemm.png" width="400"></center>

In [14]:
# ! pip install pymorphy2

In [15]:
import pymorphy2 # Морфологический анализатор

# define lemmatizer :)
morph = pymorphy2.MorphAnalyzer()

In [16]:
# example of its work
i = 1
for aword in texts[i][:10]:
    aword_norm = morph.parse(aword)[0].normal_form
    print("Before: %s, After: %s" % (aword, aword_norm))

Before: медики, After: медик
Before: рассказали, After: рассказать
Before: состоянии, After: состояние
Before: пострадавшего, After: пострадать
Before: мужчины, After: мужчина
Before: которого, After: который
Before: совершено, After: совершить
Before: нападение, After: нападение
Before: возле, After: возле
Before: отделения, After: отделение


Oscar goes to stemming!

Stemming oscar speach:  Thanks to the academy for this prestigious award! I would like to thank all nlp developers that are lazy to use lematization and do not want to wait for too long. Thank you, thank you very much!  

In [None]:
# apply stemming to all texts
for i in tqdm_notebook(range(len(texts))):           # tqdm_notebook creates the process bar below :)
    text_stemmed = list(map(stemmer.stem, texts[i])) # apply stemming to each word in a text
    texts[i] = ' '.join(text_stemmed)                # unite all stemmed words into a new text

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(range(len(texts))):           # tqdm_notebook creates the process bar below :)


  0%|          | 0/8263 [00:00<?, ?it/s]

In [None]:
# example
i = 1
print("Label: ",   y[i])
print("Text: \n",  texts[i])

## Split into train and test

In [None]:
#train test_split
from sklearn.model_selection import train_test_split
train_texts, test_texts, train_y, test_y = train_test_split(texts, y, test_size=0.33, random_state=42, stratify = y)

## TF-IDF

TF-IDF measures importance of word in a corpus of documents.

<center><img src="img/tfidf.jpg" width="800"></center>
Image from: http://filotechnologia.blogspot.com/2014/01/a-simple-java-class-for-tfidf-scoring.html

In [20]:
#calc tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [21]:
# Fit TF-IDF on train texts
vectorizer = TfidfVectorizer(max_features = 25) # select the top 25 words
vectorizer.fit(train_texts)

# The top 25 words
vectorizer.get_feature_names()

['2016',
 'банк',
 'год',
 'государств',
 'государствен',
 'казахста',
 'казахстанск',
 'компан',
 'котор',
 'млрд',
 'национальн',
 'нов',
 'област',
 'перв',
 'президент',
 'проект',
 'работ',
 'развит',
 'республик',
 'рк',
 'сво',
 'стран',
 'такж',
 'тенг',
 'эт']

In [22]:
# Apply TF-IDF to train and test texts
train_X = vectorizer.fit_transform(train_texts)
test_X  = vectorizer.fit_transform(test_texts)

In [23]:
# Example
train_X.todense()[:2] # show the first 2 rows

matrix([[0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 1.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.50615647, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.28990537, 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.32315937, 0.38382162, 0.63875621]])

In [24]:
# word - column in X accordance
vectorizer.vocabulary_

{'стран': 21,
 'казахстанск': 6,
 'республик': 18,
 'казахста': 5,
 'такж': 22,
 'развит': 17,
 'котор': 8,
 '2016': 0,
 'год': 3,
 'национальн': 11,
 'работ': 16,
 'рк': 19,
 'сво': 20,
 'государствен': 4,
 'проект': 15,
 'област': 13,
 'компан': 7,
 'тенг': 23,
 'млрд': 10,
 'эт': 24,
 'банк': 2,
 'перв': 14,
 'лет': 9,
 '2017': 1,
 'нов': 12}

## Fit a classifier

In [25]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_X, train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

### Evaluate on test data

In [26]:
predict = model.predict(test_X)
proba  = model.predict_proba(test_X)

from sklearn.metrics import accuracy_score, roc_auc_score
print("ACCURACY = {}".format(accuracy_score(test_y, predict)))
print("ROC-AUC =  {}".format(roc_auc_score(test_y, proba[:, 1])))

ACCURACY = 0.8221488815548221
ROC-AUC =  0.6436539410322453


**Results:** 25 words are too small to estimate news sentiment properly. We need more words. But how will we deal with high dimensionalities?

# Latent Semantic Analysis (LSA)

LSA is just similar to PCA. It reduces dimension of the input matrix X.

<center><img src="img/lsa.jpg" width="800"></center>

Let's take more words.

In [27]:
# Fit TF-IDF on train texts
vectorizer = TfidfVectorizer(max_features = 40000)
vectorizer.fit(train_texts)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=40000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [28]:
# Apply TF-IDF to train and test texts
train_X = vectorizer.transform(train_texts)
test_X  = vectorizer.transform(test_texts)

In [29]:
train_X.shape

(5536, 40000)

Now we have 40000 words. But it is too large for a classification model. Let's use LSA to reduce dimension. In sklearn LSA is TruncatedSVD

In [30]:
from sklearn.decomposition import TruncatedSVD

# fit SVD decomposition
svd = TruncatedSVD(n_components = 1000)
svd.fit(train_X)

TruncatedSVD(algorithm='randomized', n_components=1000, n_iter=5,
             random_state=None, tol=0.0)

In [31]:
# apply SVD to train and test samples
train_svd_X = svd.transform(train_X)
test_svd_X  = svd.transform(test_X)

In [32]:
train_svd_X.shape

(5536, 1000)

## Fit a classifier

In [33]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear')
model.fit(train_svd_X, train_y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

### Evaluate on test data

In [34]:
predict = model.predict(test_svd_X)
proba   = model.predict_proba(test_svd_X)

from sklearn.metrics import accuracy_score, roc_auc_score
print("ACCURACY = {}".format(accuracy_score(test_y, predict)))
print("ROC-AUC =  {}".format(roc_auc_score(test_y, proba[:, 1])))

ACCURACY = 0.8709204253758709
ROC-AUC =  0.9201691707108434


# Kaggle competition

https://www.kaggle.com/c/explicit-content-detection