# Екатерина Кострыкина БКЛ181

#### Homework 1
### Sentiment analysis of movie reviews using tonal dictionaries. Light version.

#### Imports

In [1]:
import re
import requests
session = requests.session()
from fake_useragent import UserAgent
ua = UserAgent(verify_ssl=False)
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from pymorphy2 import MorphAnalyzer
a_morph = MorphAnalyzer()
from sklearn.model_selection import train_test_split

In [36]:
def clean(review):
    """Keep only Cyrillic letters and spaces in *review*.

    Every character that is not a Russian letter (``Ё``/``ё`` included) or a
    space is replaced by a space, and each run of spaces is then collapsed
    into a single one. Leading and trailing spaces are not stripped.

    Args:
        review: the raw text (may contain markup, digits, punctuation).

    Returns:
        The filtered string.
    """
    cyrillic_only = re.sub('[^А-Яа-яЁё ]', ' ', review)
    return re.sub(' +', ' ', cyrillic_only)


def load_reviews(number, ton, timeout=30):
    """Download and clean Kinopoisk reviews of one tonality.

    Args:
        number: how many listing pages to fetch (pages ``1..number``).
        ton: value substituted into the ``status`` segment of the URL; the
            callers in this file pass ``'good'`` or ``'bad'``.
        timeout: seconds to wait for each HTTP response before giving up.

    Returns:
        A list with one cleaned string (see ``clean``) per
        ``<span class="_reachbanner_">`` element found, in page order.

    Raises:
        requests.exceptions.RequestException: if a request fails or exceeds
            ``timeout``.
    """
    data = []
    for page_number in range(1, number + 1):
        url = f'https://www.kinopoisk.ru/reviews/type/comment/status/{ton}/period/month/page/{page_number}/#list'
        # requests never times out by default, so a stalled connection would
        # hang the whole scrape; bound every call explicitly.
        req = session.get(url, headers={'User-Agent': ua.random}, timeout=timeout)
        soup = BeautifulSoup(req.text, 'html.parser')
        for review in soup.find_all('span', {'class': '_reachbanner_'}):
            # str(review) still contains the HTML markup; clean() replaces
            # every non-Cyrillic character (tags included) with spaces.
            data.append(clean(str(review)))
    return data


def get_dataset(number):
    """Build a labelled DataFrame of positive and negative reviews.

    Args:
        number: how many pages to request for each tonality.

    Returns:
        A DataFrame with a ``reviews`` column (positive reviews first, then
        negative ones) and a ``label`` column holding 0.0 for positive and
        1.0 for negative reviews.
    """
    positive = load_reviews(number, ton='good')
    negative = load_reviews(number, ton='bad')

    # Column vectors of labels: 0.0 = positive, 1.0 = negative.
    positive_labels = np.zeros((len(positive), 1))
    negative_labels = np.ones((len(negative), 1))

    frame = pd.DataFrame()
    frame['reviews'] = np.concatenate((positive, negative), axis=0)
    frame['label'] = np.concatenate((positive_labels, negative_labels), axis=0)
    return frame


def lemmatize(review):
    """Return the lemmas of the tokens in *review* as one space-joined string.

    Args:
        review: an iterable of tokens.

    Returns:
        A string in which every token is replaced by the ``normal_form`` of
        the first parse that the module-level ``a_morph`` analyzer returns.
    """
    lemmas = (a_morph.parse(token)[0].normal_form for token in review)
    return ' '.join(lemmas)


def preprocess(review):
    """Lowercase, tokenize and lemmatize a review.

    Args:
        review: the review text.

    Returns:
        The space-joined lemmas produced by ``lemmatize`` for the tokens
        that ``word_tokenize`` yields from the lowercased text.
    """
    tokens = word_tokenize(review.lower())
    return lemmatize(tokens)


def get_sets(X_train, y_train):
    """Build class-exclusive vocabularies from the training reviews.

    Args:
        X_train: pandas Series of whitespace-separated review strings.
        y_train: pandas Series of labels aligned with ``X_train``
            (0.0 = positive, 1.0 = negative).

    Returns:
        A tuple ``(pos_dict, neg_dict)``: the words that occur only in
        positive reviews and the words that occur only in negative reviews.
    """
    def vocabulary(label):
        # All distinct words of the reviews carrying the given label.
        return {
            word
            for text in X_train.loc[y_train == label]
            for word in text.split()
        }

    positive_vocab = vocabulary(0.0)
    negative_vocab = vocabulary(1.0)
    return (positive_vocab - negative_vocab, negative_vocab - positive_vocab)
    
    
def get_tonality(review, ton_sets, prep=False):
    """Predict the tonality of a review from its overlap with two word sets.

    Args:
        review: whitespace-separated review text.
        ton_sets: sequence whose item 0 is the positive word set and whose
            item 1 is the negative word set.
        prep: when true, run ``preprocess`` on *review* first (for raw,
            not yet processed input).

    Returns:
        0.0 (positive) if the review shares strictly more distinct words
        with the positive set than with the negative one, otherwise 1.0.
    """
    text = preprocess(review) if prep else review
    words = set(text.split())
    positive_hits = len(words.intersection(ton_sets[0]))
    negative_hits = len(words.intersection(ton_sets[1]))
    # A tie (including no hits at all) is classified as negative.
    return 0.0 if positive_hits > negative_hits else 1.0
    

def accuracy(y_test, y_pred):
    """Return the share of predictions that equal the true labels.

    Args:
        y_test: true labels (list, NumPy array or pandas Series).
        y_pred: predicted labels, in the same order as ``y_test``.

    Returns:
        The mean of the element-wise equality of the two inputs.

    Raises:
        ValueError: if both inputs are one-dimensional but differ in length.
    """
    # Convert first: `==` on two plain lists yields a single bool rather
    # than an element-wise comparison, which would make the mean 0.0 or 1.0.
    actual = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    if actual.ndim == 1 and predicted.ndim == 1 and actual.shape != predicted.shape:
        raise ValueError(
            f'Lengths must match: {actual.shape[0]} != {predicted.shape[0]}'
        )
    return np.mean(actual == predicted)

#### Creating dataset
Since [kinopoisk.ru](https://www.kinopoisk.ru/reviews/) shows 10 reviews per page, we pass 25 to the **get_dataset** function to download 25 pages of positive and 25 pages of negative reviews (25 * 10 = 250 reviews of each tonality).

* 250 positive and 250 negative reviews

#### Lowercasing, tokenization and lemmatization via the **preprocess** function

In [54]:
# Download 25 pages of reviews per tonality, then replace every review text
# with its lowercased, tokenized and lemmatized form.
dataset = get_dataset(25)
dataset['reviews'] = list(map(preprocess, dataset['reviews']))

In [55]:
# Bare expression: the notebook renders the preprocessed DataFrame as the cell output.
dataset

Unnamed: 0,reviews,label
0,это какой то фантастика никто бы не стать бега...,0.0
1,замечательный одновременно лёгкий и глубокий ф...,0.0
2,один раз я смотреть этот фильм в кинотеатр ког...,0.0
3,тема ребёнок который вырасти без родитель редк...,0.0
4,каждый новый часть хороший предыдущий как тако...,0.0
...,...,...
495,мегана фокс ворваться в большой кинематограф б...,1.0
496,сложность это фильм создаваться не столько сло...,1.0
497,я где то читать что фильм продолжительность ме...,1.0
498,как только появиться трейлер то я сразу решить...,1.0


#### Splitting into training and test samples

In [56]:
# Hold out 10% of the reviews for testing. The split is stratified so that
# both tonalities keep their proportions in the small test set, and seeded so
# that re-running the cell on the same dataset produces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    dataset['reviews'],
    dataset['label'],
    test_size=0.1,
    stratify=dataset['label'],
    random_state=42,
)

#### Creating tonal dictionaries from a training sample

In [57]:
# (positive-only words, negative-only words) derived from the training split.
tonality_sets = get_sets(X_train=X_train, y_train=y_train)

#### Predicting tonality and counting accuracy score

In [58]:
# Classify every held-out review, then show the share of correct predictions.
y_pred = list(map(lambda text: get_tonality(text, tonality_sets), X_test))
accuracy(y_test, y_pred)

0.68

#### Improvement options:
1. keep only the most frequent words in the sets
2. attach the negative particle "not" ("не" in Russian) to the word that follows it
3. use syntactic bigrams, built from dependency trees, instead of plain unigrams