In [2]:
# Load the Ukrainian tone dictionary
# https://github.com/lang-uk/tone-dict-uk

import pandas as pd

# Read the tab-separated dictionary and make sure the tone column is numeric.
tone_dict_df = (
    pd.read_csv('tone-dict-uk.tsv', sep='\t')
    .astype({"tone_value": float})
)

# word -> tone_value lookup table.
TONE_DICT_UK = tone_dict_df.set_index('word')['tone_value'].to_dict()

In [3]:
!pip install fasttext-langdetect

Defaulting to user installation because normal site-packages is not writeable


In [4]:
# Detect language

from ftlangdetect import detect
import string

def sanitize_text(text):
    """Return ``text`` cleaned up for language detection and tokenisation.

    In a single pass this:
      * drops the combining acute accent (code point 769),
      * turns every newline into a space,
      * removes every character listed in ``string.punctuation``.
    """
    dropped_chars = chr(769) + string.punctuation
    translation = str.maketrans('\n', ' ', dropped_chars)
    return text.translate(translation)

def get_lang(line):
    """Detect the language of ``line`` and return its language code.

    The text is first passed through ``sanitize_text`` and then handed to
    ``detect`` (with ``low_memory=True``); the ``'lang'`` entry of the
    detection result is returned, e.g. 'uk', 'en' or 'ru'.
    """
    detection = detect(sanitize_text(line), low_memory=True)
    return detection['lang']

In [5]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# NLTK resources required by the cells in this notebook.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('vader_lexicon')

# Analyzer with the stock VADER lexicon, used for English text.
SIA_EN = SentimentIntensityAnalyzer()

# Analyzer whose lexicon is additionally extended with the Ukrainian tone dictionary.
SIA_UK = SentimentIntensityAnalyzer()
SIA_UK.lexicon.update(TONE_DICT_UK)

def get_polarity_scores(text):
    """Score ``text`` with the analyzer matching its detected language.

    English text (``get_lang`` returns 'en') is scored by ``SIA_EN``;
    everything else is scored by ``SIA_UK``. Returns whatever the chosen
    analyzer's ``polarity_scores`` returns.
    """
    analyzer = SIA_EN if get_lang(text) == 'en' else SIA_UK
    return analyzer.polarity_scores(text)

def classify_polarity_scores(polarity_scores, threshold=0.05):
    """Turn a polarity-score mapping into a coarse sentiment label.

    Args:
        polarity_scores: Mapping that contains a numeric ``'compound'`` entry.
        threshold: Half-width of the neutral band around zero. Scores whose
            absolute value does not exceed it are treated as neutral.

    Returns:
        ``"POSITIVE"`` when compound > threshold, ``"NEGATIVE"`` when
        compound < -threshold, and ``"NEUTRAL"`` otherwise.

    Raises:
        KeyError: If ``polarity_scores`` has no ``'compound'`` key.
    """
    compound = polarity_scores['compound']
    if compound > threshold:
        return "POSITIVE"
    # Only clearly negative scores are NEGATIVE; a score inside the band
    # (including exactly 0 for text with no known words) stays NEUTRAL.
    if compound < -threshold:
        return "NEGATIVE"
    return "NEUTRAL"

[nltk_data] Downloading package stopwords to /home/mlgmag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/mlgmag/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/mlgmag/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [8]:
!pip install git+https://github.com/kmike/pymorphy2.git
!pip install -U pymorphy2-dicts-uk

Defaulting to user installation because normal site-packages is not writeable
Collecting git+https://github.com/kmike/pymorphy2.git
  Cloning https://github.com/kmike/pymorphy2.git to /tmp/pip-req-build-j16mmkz3
  Running command git clone --filter=blob:none --quiet https://github.com/kmike/pymorphy2.git /tmp/pip-req-build-j16mmkz3
  Resolved https://github.com/kmike/pymorphy2.git to commit 92d546f042ff14601376d3646242908d5ab786c1
  Preparing metadata (setup.py) ... [?25ldone
Defaulting to user installation because normal site-packages is not writeable


In [16]:
import pymorphy2
import string
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Morphological analyzer configured for Ukrainian; used to lemmatise tokens.
morph = pymorphy2.MorphAnalyzer(lang='uk')

# The stop-word corpus has to be available locally before it can be read.
nltk.download('stopwords')
ENGLISH_STOP_WORDS = stopwords.words('english')

def to_raw_uk(text):
    """Normalise Ukrainian ``text`` into a space-separated string of lemmas.

    The text is split into sentences, each sentence is sanitised with
    ``sanitize_text`` and tokenised, and every token is replaced by the
    ``normal_form`` of its first ``morph.parse`` result.
    """
    lemmas = [
        morph.parse(word)[0].normal_form
        for sent in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sanitize_text(sent))
    ]
    return " ".join(lemmas)



ps = PorterStemmer()

def to_raw_en(text):
    """Normalise English ``text`` into a space-separated string of stems.

    The text is split into sentences, each sentence is sanitised with
    ``sanitize_text`` and tokenised, stop words are dropped, and the
    remaining tokens are stemmed with the Porter stemmer.

    Args:
        text: English text to normalise.

    Returns:
        The stems of all non-stop-word tokens joined by single spaces
        (an empty string when nothing is left).
    """
    # Build the lookup once per call: set membership instead of scanning a list per token.
    stop_words = set(ENGLISH_STOP_WORDS)
    # NOTE(review): NLTK's English stop-word list appears to include negations
    # such as "not"/"no" - confirm that dropping them is intended before
    # the result is fed to the sentiment analyzer.

    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sanitize_text(sentence)):
            # Compare case-insensitively so capitalised stop words
            # (e.g. at the start of a sentence) are filtered out as well.
            if token.lower() in stop_words:
                continue
            stems.append(ps.stem(token))

    return " ".join(stems)


[nltk_data] Downloading package stopwords to /home/mlgmag/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
def get_polarity(text):
    """Classify the sentiment of ``text``.

    The language is detected with ``get_lang``; English text is normalised
    with ``to_raw_en`` and Ukrainian text with ``to_raw_uk``. For any other
    language the normalised text stays empty. The normalised text is scored
    with ``get_polarity_scores`` and the scores are mapped to a label by
    ``classify_polarity_scores``.

    Args:
        text: Review text to classify.

    Returns:
        The label produced by ``classify_polarity_scores``.
    """
    lang = get_lang(text)
    raw_text = ""
    if lang == 'en':
        raw_text = to_raw_en(text)
    elif lang == 'uk':
        # The Ukrainian normaliser defined in this notebook is `to_raw_uk`.
        raw_text = to_raw_uk(text)

    polarity_scores = get_polarity_scores(raw_text)
    return classify_polarity_scores(polarity_scores)

In [47]:
# JSON-lines file with the reviews; rows are read below with their 'lang' and 'review' fields.
REVIEWS_LANG_FILE = './reviews_lang.jsonlines'
reviews_lang = pd.read_json(path_or_buf=REVIEWS_LANG_FILE, lines=True)

In [44]:
def should_process(lang):
    """Return True when ``lang`` is a language code this notebook scores ('en' or 'uk')."""
    supported = ('en', 'uk')
    return lang in supported

def add_polarity(reviews_row):
    """Attach a 'polarity' entry to a review row and return the row.

    Rows whose 'lang' passes ``should_process`` get the label computed by
    ``get_polarity`` from their 'review' text; all other rows get an empty
    string.
    """
    reviews_row['polarity'] = (
        get_polarity(reviews_row['review'])
        if should_process(reviews_row['lang'])
        else ""
    )
    return reviews_row

# Row-wise labelling of every review; takes ~500 seconds.
reviews_polarity = reviews_lang.apply(add_polarity, axis='columns')

In [45]:
# Serialise one JSON object per line, keeping non-ASCII characters unescaped.
reviews_lang_json_lines = reviews_polarity.to_json(
    orient='records',
    lines=True,
    force_ascii=False,
)

In [46]:
OUTPUT_FILE_PATH = "./reviews_polarity.jsonlines"

# The JSON lines were produced with force_ascii=False, so they contain raw
# non-ASCII text; write them explicitly as UTF-8 instead of relying on the
# platform's default encoding.
with open(OUTPUT_FILE_PATH, "w", encoding="utf-8") as f:
    f.write(reviews_lang_json_lines)