# Екатерина Кострыкина БКЛ181

#### Homework 1
### Sentiment analysis of movie reviews using tonal dictionaries. Light version.

#### Imports

In [11]:
import re
import requests
session = requests.session()
from fake_useragent import UserAgent
ua = UserAgent(verify_ssl=False)
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from pymorphy2 import MorphAnalyzer
a_morph = MorphAnalyzer()
from natasha import Segmenter, NewsEmbedding, NewsMorphTagger, Doc, MorphVocab
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [50]:
def clean(review):
    review = re.sub('[^А-Яа-яЁё ]', ' ', review)
    review = re.sub(' +', ' ', review)
    return review


def load_reviews(number, ton):
    data = []
    for i in range(1, number+1):
        url = f'https://www.kinopoisk.ru/reviews/type/comment/status/{ton}/period/month/page/{i}/#list'
        req = session.get(url, headers={'User-Agent': ua.random})
        page = req.text
        soup = BeautifulSoup(page, 'html.parser')
        text = soup.find_all('span', {'class': '_reachbanner_'})
        for review in text:
            data.append(clean(str(review))) 
    return data


def get_dataset(number):
    pos_data = load_reviews(number, ton='good')
    neg_data = load_reviews(number, ton='bad')
    data = pd.DataFrame()
    data['reviews'] = np.concatenate((pos_data, neg_data), axis=0)
    a = np.zeros((len(pos_data), 1))
    b = np.ones((len(neg_data), 1))
    data['label'] = np.concatenate((a, b), axis=0) #0 +, 1 -
    return data


def lemmatize(review):
    review = ' '.join([a_morph.parse(i)[0].normal_form for i in review])
    return review


def preprocess(review):
    review = review.lower() # lowercase conversion
    review = word_tokenize(review) # tokenize
    review = lemmatize(review) # lemmatize
    return review


def get_sets(X_train, y_train):
    pos_set = set(' '.join([i for i in X_train.loc[y_train==0.0]]).split()) # positive reviews
    neg_set = set(' '.join([i for i in X_train.loc[y_train==1.0]]).split()) # negative reviews
    pos_dict = pos_set.difference(neg_set) # elements which belongs only to positive reviews
    neg_dict = neg_set.difference(pos_set) # elements which belongs only to negative reviews
    return (pos_dict, neg_dict)


def get_bigrams(text):
    doc = Doc(text)
    doc.segment(Segmenter())
    doc.tag_morph(NewsMorphTagger(NewsEmbedding()))

    lemma_tag = [[i.text, i.pos] for i in doc.tokens]

    bigrams = []
    for i in range(len(lemma_tag)-1):
        lemma = lemma_tag[i][0]
        next_lemma = lemma_tag[i+1][0]
        tag = lemma_tag[i][1]
        next_tag = lemma_tag[i+1][1]
        if lemma == 'не' and next_tag == 'ADJ':
            bigrams.append(' '.join([lemma, next_lemma]))
        elif tag == 'A' and next_tag == 'VERB':
            bigrams.append(' '.join([lemma, next_lemma]))
        elif tag == 'ADJ' and next_tag == 'NOUN':
            bigrams.append(' '.join([lemma, next_lemma]))

    return bigrams


def get_sets_with_bigrams(X_train, y_train):
    pos = ' '.join([i for i in X_train.loc[y_train==0.0]]) # positive reviews
    neg = ' '.join([i for i in X_train.loc[y_train==1.0]]) # negative reviews
    pos_set = set(get_bigrams(pos))
    neg_set = set(get_bigrams(neg))
    pos_set.update(pos.split())
    neg_set.update(neg.split())
    pos_dict = pos_set.difference(neg_set) # elements which belongs only to positive reviews
    neg_dict = neg_set.difference(pos_set) # elements which belongs only to negative reviews
    return (pos_dict, neg_dict)

    
def get_tonality(review, ton_sets, prep=False):
    if prep: # for new unprocessed inputs
        review = preprocess(review)
    review = set(review.split())
    check_pos = len(review.intersection(ton_sets[0])) # size of intersection with positive set
    check_neg = len(review.intersection(ton_sets[1])) # size of intersection with negative set
    if check_pos > check_neg:
        return 0.0
    else:
        return 1.0

def get_tonality_with_bigrams(review, ton_sets, prep=False):
    if prep: # for new unprocessed inputs
        review = preprocess(review)
    bigrams = get_bigrams(review)
    review = set(review.split())
    review.update(bigrams)
    check_pos = len(review.intersection(ton_sets[0])) # size of intersection with positive set
    check_neg = len(review.intersection(ton_sets[1])) # size of intersection with negative set
    if check_pos > check_neg:
        return 0.0
    else:
        return 1.0
    

def accuracy(y_test, y_pred):
    ans = np.mean(y_test==y_pred)
    return ans

#### Creating dataset
Since [kinopoisk.ru](https://www.kinopoisk.ru/reviews/) has 10 reviews per page, we pass the number 25 to the **get_dataset** function to download 25 pages of positive and 25 pages negative reviews (25 * 10 = 250)

* 250 positive and 250 negative reviews

#### Lowercase conversion, tokenize, lemmatize via **preprocess** function

In [41]:
dataset = get_dataset(30)
dataset['reviews'] = [preprocess(i) for i in dataset['reviews']]

In [42]:
dataset

Unnamed: 0,reviews,label
0,тот харди шайла лабаф и джейсон кларк играть б...,0.0
1,иногда случайно среди многий тысяча тонна коро...,0.0
2,есть такой человек который весь свой жизнь выс...,0.0
3,картина рассказывать о учитель география котор...,0.0
4,фильм рассказывать о время славянский князь гд...,0.0
...,...,...
595,сразу оговориться что интерстеллара я понравит...,1.0
596,говориться о тот что данный фильм являться про...,1.0
597,я любить фильм нолан можно сказать поклонник о...,1.0
598,я быть очень одушевить услышать о хороший чело...,1.0


#### Splitting into training and test samples

In [43]:
X_train, X_test, y_train, y_test = train_test_split(dataset['reviews'], dataset['label'], test_size=0.1)

#### Creating tonal dictionaries from a training sample

без групп биграмм

In [46]:
tonality_sets = get_sets(X_train, y_train)

с группами биграмм

In [44]:
tonality_sets_with_bigrams = get_sets_with_bigrams(X_train, y_train)

#### Predicting tonality and counting accuracy score

без групп биграмм

In [47]:
y_pred = [get_tonality(i, tonality_sets) for i in X_test]
accuracy(y_test, y_pred)

0.7833333333333333

с группами биграмм

In [52]:
y_pred = [get_tonality_with_bigrams(i, tonality_sets_with_bigrams) for i in X_test]
accuracy(y_test, y_pred)

0.8

С использованием групп биграмм качество стало немного лучше


### Группы биграмм:
1. "не" + прилагательное 
    - например, так как в хорошем отзыве могут встретиться "не" и "плохой", которые по отдельности могут относится к группе "негативных" слов, а объединив их в одно "не плохой" мы получим "хороший"
    
    
2. прилагательное + существительное 
3. наречие + глагол
    - так как прилагательные и наречия являются оценочными средствами, было бы полезно выделить к чему именно они относятся