In [None]:
import os
import pandas as pd
import gc

In [None]:
from multiprocessing.pool import Pool
from contextlib import closing
from tqdm import tqdm


def multiprocess_func(main_input, func, additional_inputs=None,
                      gather_func=None, to_split=True, gather_func_args=None,
                      chunk_size=100, n_processes=8):
    """Run ``func`` over pieces of ``main_input`` in a process pool and combine the results.

    Parameters
    ----------
    main_input : sequence
        Items to process. Must support ``len`` and slicing when ``to_split`` is true.
    func : callable
        Executed in the worker processes with exactly one argument per task:
        the chunk/item itself, or the tuple ``(chunk_or_item, *additional_inputs)``
        when ``additional_inputs`` is non-empty.
    additional_inputs : list, optional
        Extra values appended to every task tuple.
    gather_func : callable, optional
        Combines the list of per-task results. Defaults to flattening one level.
    to_split : bool
        True: tasks are consecutive slices of ``chunk_size`` items.
        False: every element of ``main_input`` is its own task.
    gather_func_args : list, optional
        Extra positional arguments passed to ``gather_func`` after the result list.
    chunk_size : int
        Slice length used when ``to_split`` is true.
    n_processes : int
        Number of worker processes in the pool.

    Returns
    -------
    Whatever ``gather_func`` returns.
    """
    extra_gather_args = gather_func_args if gather_func_args else []
    shared_inputs = additional_inputs if additional_inputs else []

    if gather_func:
        combine = gather_func
    else:
        def combine(task_results):
            # default: concatenate the per-task result lists
            return [item for task_result in task_results for item in task_result]

    if to_split:
        pieces = (main_input[start:start + chunk_size]
                  for start in range(0, len(main_input), chunk_size))
    else:
        pieces = iter(main_input)

    # a task is the bare piece unless there are shared inputs to ship with it
    tasks = [(piece, *shared_inputs) if shared_inputs else piece for piece in pieces]

    with closing(Pool(n_processes)) as pool:
        # imap keeps the results in task order; tqdm shows progress as they arrive
        outputs = list(tqdm(pool.imap(func, tasks), total=len(tasks)))

    return combine(outputs, *extra_gather_args)


In [None]:
# Directory with the raw review CSV files, relative to the notebook's working directory
path = '../../../data_reviews/'

In [None]:
# Line count of every file in the data directory (IPython shell escape; $path comes from the Python variable)
! wc -l $path/*

# Merging the data

In [None]:
# Rozetka reviews from rozetka_ukr.csv: semicolon-delimited, Windows-1251 encoded
df1 = pd.read_csv(
    os.path.join(path, 'rozetka_ukr.csv'),
    sep=';',
    encoding='windows-1251',
)
df1.shape

In [None]:
# The entity name is the third-from-last '/'-separated piece of the product link
df1['entity_name'] = df1['prod_link'].map(lambda link: link.split('/')[-3])

In [None]:
# Keep the shared columns under their common names and tag the source dataset
df1 = (
    df1[['comment', 'translate', 'rating', 'entity_name']]
    .rename(columns={'comment': 'review', 'translate': 'review_translate'})
    .assign(dataset_name='rozetka')
)

In [None]:
# Rozetka reviews from rozetka_ru.csv: semicolon-delimited, Windows-1251 encoded
df2 = pd.read_csv(
    os.path.join(path, 'rozetka_ru.csv'),
    sep=';',
    encoding='windows-1251',
)
df2.shape

In [None]:
# Rows without a product link cannot yield an entity name, so drop them first
df2 = df2[df2['prod_link'].notna()]

In [None]:
# The entity name is the third-from-last '/'-separated piece of the product link
df2['entity_name'] = df2['prod_link'].map(lambda link: link.split('/')[-3])

In [None]:
# Keep the shared columns under their common names and tag the source dataset
df2 = (
    df2[['comment', 'translate', 'rating', 'entity_name']]
    .rename(columns={'comment': 'review', 'translate': 'review_translate'})
    .assign(dataset_name='rozetka')
)

In [None]:
# Hotel reviews from hotels_final.csv: semicolon-delimited, Windows-1251 encoded
df3 = pd.read_csv(
    os.path.join(path, 'hotels_final.csv'),
    sep=';',
    encoding='windows-1251',
)
df3.shape

In [None]:
# Use the hotel name as the common entity identifier
df3 = df3.rename(columns={'hotel_name':'entity_name'})

In [None]:
# Keep the shared columns under their common names and tag the source dataset
df3 = (
    df3[['review', 'translate', 'overall_rating', 'entity_name']]
    .rename(columns={'overall_rating' : 'rating', 'translate': 'review_translate'})
    .assign(dataset_name='tripadvisor_hotels_ukraine')
)

In [None]:
# Restaurant reviews from restaurants_review_final.csv: semicolon-delimited, Windows-1251 encoded
df4 = pd.read_csv(
    os.path.join(path, 'restaurants_review_final.csv'),
    sep=';',
    encoding='windows-1251',
)
df4.shape

In [None]:
# Use the `name` column as the common entity identifier
df4 = df4.rename(columns={'name':'entity_name'})

In [None]:
# Rename the rating column, keep the shared columns plus the translated title,
# and tag the source dataset
df4 = (
    df4.rename(columns={'overall_rating' : 'rating'})
    [['review', 'title_translate', 'review_translate', 'rating', 'entity_name']]
    .assign(dataset_name='tripadvisor_restaurants_ukraine')
)

In [None]:
# Stack the four sources into one frame. ignore_index=True gives every row a unique
# label; without it each source keeps its own 0..n labels, so the combined index
# contains duplicates and label-based selection becomes ambiguous.
df = pd.concat([df1, df2, df3, df4], axis=0, ignore_index=True)

In [None]:
# Preview the first rows of the merged frame
df.head()

In [None]:
# The per-source frames are no longer needed once merged; release their memory.
del df1, df2, df3, df4
gc.collect();  # trailing ';' hides the returned object count

In [None]:
# Keep only reviews that carry a rating
df = df[df['rating'].notna()]

In [None]:
# Only the restaurants frame (df4) selected a title_translate column, so rows from the
# other sources are NaN after the concat; use an empty string for them instead
df['title_translate'] = df['title_translate'].fillna('')

In [None]:
# Keep only rows that have review text
df = df[df['review'].notna()]

In [None]:
# A review counts as translated when its translated text differs from the original text
df['translated'] = df['review'].ne(df['review_translate'])

In [None]:
# Remaining missing values per column
df.isna().sum()

In [None]:
# How many reviews were translated vs. left unchanged
df['translated'].value_counts()

In [None]:
# Size of the merged frame after the missing-value filters
df.shape

In [None]:
# Eyeball one randomly drawn review with rating 2, original next to translation
# (no random_state is set, so a different row is shown on every run)
df[df['rating']==2].sample(1)[['review', 'review_translate']].values

In [None]:
# Number of distinct reviewed entities
df['entity_name'].nunique()

# Basic data analysis

In [None]:
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Apply seaborn's default styling to the figures that follow
sns.set()

In [None]:
from nltk.tokenize import sent_tokenize

## Characters number 

In [None]:
# Character length of each translated review, computed once and reused for all four statistics
char_counts = df['review_translate'].apply(len)
print('Max number of characters in translated review : {}'.format(char_counts.max()))
print('Min number of characters in translated review : {}'.format(char_counts.min()))
print('Mean number of characters in translated review : {}'.format(char_counts.mean()))
print('Median number of characters in translated review : {}'.format(char_counts.median()))


In [None]:
# Distribution of translated-review length on a log10 scale
sns.distplot(np.log10(df['review_translate'].apply(len)))

In [None]:
# Lower cut-off for review length. np.percentile takes q on a 0-100 scale,
# so q=0.2 is the 0.2nd percentile, not the 20th.
np.percentile(df['review_translate'].apply(len), q=0.2)

### Filter out reviews whose character length is an outlier

In [None]:
# Keep only reviews strictly longer than the 0.2nd percentile of translated length
# (np.percentile takes q on a 0-100 scale).
translated_len = df['review_translate'].apply(len)
df = df[translated_len > np.percentile(translated_len, q=0.2)]

### Find reviews whose translation has far fewer characters than the original text

In [None]:
# Signed gap between the original and translated length, in characters
df['diff_len'] = df['review'].apply(len)-df['review_translate'].apply(len)

In [None]:
# Drop rows whose translation is the literal '#ERROR!'
# (presumably a failed-translation marker from the export — confirm)
df = df[df['review_translate']!='#ERROR!']

In [None]:
# Only the size of the length gap matters, not its direction
df['diff_len'] = df['diff_len'].abs()

In [None]:
# Distribution of the length gap; log1p keeps zero gaps finite
sns.distplot(np.log1p(df['diff_len']))

In [None]:
# Keep reviews whose original and translation differ by fewer than 200 characters
df = df[df['diff_len']<200]

In [None]:
# Largest remaining length gap among the reviews that were actually translated
df.loc[df['translated'] == True, 'diff_len'].max()

In [None]:
# The helper column has served its purpose
df = df.drop(columns=['diff_len'])

### Drop reviews that are empty after stripping whitespace

In [None]:
# Trim surrounding whitespace, then drop the rows whose translation is empty (falsy)
df['review_translate'] = df['review_translate'].str.strip()
df = df[df['review_translate'].apply(bool)]

### remove \n char

In [None]:
# Translated vs. untranslated counts after the length filters
df['translated'].value_counts()

In [None]:
import re

In [None]:
def remove_multy_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str or None
        The normalised text, or ``None`` when ``text`` is not a string
        (for example ``None`` or a float NaN coming from a missing value).
    """
    try:
        return re.sub(r'\s+', ' ', text)
    except TypeError:
        # re.sub rejects non-string input with TypeError; that is the only failure
        # this best-effort cleaner is meant to absorb. Anything else should surface.
        return None

In [None]:
# Turn line breaks into spaces. Deleting them outright would glue the last word of one
# line to the first word of the next; surplus spaces are collapsed by a later cell.
df['review_translate'] = df['review_translate'].str.replace('\n', ' ', regex=False).str.strip()

In [None]:
def spacing_between_chars_text(text):
    """Surround every symbol character in ``text`` with spaces.

    Alphanumeric characters, apostrophes and spaces are copied through unchanged;
    any other character ``c`` is emitted as ``' c '``. The result is returned
    with leading and trailing whitespace stripped.
    """
    untouched = ("'", ' ')
    pieces = (
        char if char.isalnum() or char in untouched else ' ' + char + ' '
        for char in text
    )
    return ''.join(pieces).strip()

In [None]:
# Pad symbols with spaces, one review per task (to_split=False); the identity
# gather_func keeps the output as a list with one entry per review.
df['review_translate'] = multiprocess_func(df['review_translate'].values, 
                  func=spacing_between_chars_text,
                  gather_func=lambda x: x,
                  to_split=False)

In [None]:
# Collapse the whitespace runs created by the padding step, one review per task
df['review_translate'] = multiprocess_func(df['review_translate'].values, 
                  func=remove_multy_spaces,
                  gather_func=lambda x: x,
                  to_split=False)

In [None]:
# Spot-check the first cleaned review
df['review_translate'].values[0]

## Sentence number

In [None]:
# Split every review into sentences with NLTK's sent_tokenize, one review per task;
# the result is a list of sentence lists in the same order as the rows of df.
sent_tokenized = multiprocess_func(df['review_translate'].values, 
                  func=sent_tokenize,
                  gather_func=lambda x: x,
                  to_split=False)

In [None]:
# Distribution of the number of sentences per review on a log10 scale
sns.distplot(np.log10([len(i) for i in sent_tokenized]))

In [None]:
# Store the sentence lists next to their reviews (assigned by position)
df['review_translate_sentences'] = sent_tokenized

# Delete reviews that are only partially translated

In [None]:
import fasttext
from itertools import chain

In [None]:
# Load the fastText model from lid.176.bin
# (presumably the pretrained language-identification model — confirm the file's origin)
model = fasttext.load_model('../../../lid.176.bin')

In [None]:
def detect_lang_sentences(batched_texts, model):
    """Predict a language label for every sentence of every review.

    Each batch is flattened into one sentence list so that ``model.predict`` is
    called once per batch; the flat predictions are then cut back into one list
    per review, following the original sentence counts.

    Parameters
    ----------
    batched_texts : iterable of batches
        Every batch is a sequence of reviews; every review is a sequence of sentences.
    model
        Object whose ``predict(list_of_str)`` returns ``(labels, scores)``, where
        ``labels[i]`` is a sequence whose first item is the top label for sentence ``i``.

    Returns
    -------
    list of list of str
        One list per review across all batches, in input order, holding one label
        per sentence. A label is the text after the last ``'__'`` of the model's
        label string. A review without sentences yields an empty list.
    """
    result = []
    for texts in tqdm(batched_texts):
        lengths = [len(review) for review in texts]
        sentences = list(chain(*texts))
        if not sentences:
            # Empty batch, or only sentence-less reviews: nothing to predict,
            # but every review still gets its (empty) slot in the output.
            result.extend([] for _ in lengths)
            continue

        predicted_langs, _ = model.predict(sentences)
        predicted_langs = [labels[0].split('__')[-1] for labels in predicted_langs]
        assert len(predicted_langs) == len(sentences)

        # Cut the flat predictions back into per-review lists. A running offset
        # handles zero-length reviews anywhere in the batch.
        start = 0
        for length in lengths:
            result.append(predicted_langs[start:start + length])
            start += length

    return result

In [None]:
def detect_lang(batched_texts, model):
    """Predict one language label per text, processing the input batch by batch.

    ``model.predict`` is called once per batch with the batch converted to a list.
    The returned label for a text is the part after the last ``'__'`` of the
    model's top label string. Labels of all batches are returned as one flat
    list in input order.
    """
    languages = []
    for batch in tqdm(batched_texts):
        labels, _ = model.predict(list(batch))
        languages += [top_labels[0].split('__')[-1] for top_labels in labels]
    return languages

In [None]:
# Sentence lists of the translated reviews, cut into batches of 100 reviews
batch_size=100
to_detect_lang = df.loc[df['translated']==True, 'review_translate_sentences'].values
batches = [to_detect_lang[i:i+batch_size] for i in range(0, len(to_detect_lang), batch_size)]

In [None]:
# Sanity check: total number of reviews across all batches
sum([len(i) for i in batches])

In [None]:
# Per-sentence language detection.
# NOTE(review): `result` is reassigned by the whole-review detection in a later cell
# before anything reads it, so this value is never used further down.
result = detect_lang_sentences(batches, model)

In [None]:
# Whole-review texts of the translated reviews, cut into batches of 100
batch_size=100
to_detect_lang = df.loc[df['translated']==True, 'review_translate'].values
batches = [to_detect_lang[i:i+batch_size] for i in range(0, len(to_detect_lang), batch_size)]

In [None]:
# One predicted language label per translated review
result = detect_lang(batches, model)

In [None]:
# Reviews that were not translated default to 'uk'; translated ones get
# the detected label (assigned in row order of the boolean mask)
df['language_translated'] = 'uk'
df.loc[df['translated']==True, 'language_translated'] = result

In [None]:
# Keep only the reviews whose label is 'uk'
df = df[df['language_translated']=='uk']

In [None]:
# Rebind rather than drop in place: `df` is a filtered selection at this point, and
# mutating it in place can trigger pandas' SettingWithCopyWarning. Rebinding also
# matches how the `diff_len` helper column was removed earlier.
df = df.drop(columns='language_translated')

# Tokenize texts

In [None]:
from nltk.tokenize import regexp_tokenize

In [None]:
def NLTK_special_chars_excluded_tokenizer(input_text):
    """Tokenize ``input_text`` with NLTK's ``regexp_tokenize``.

    A token is either a run of word characters, apostrophes and hyphens, or a run
    of characters that are none of those and not whitespace. The pattern matches
    the tokens themselves (``gaps=False``), so whitespace is never emitted.
    """
    token_pattern = r"[\w'-]+|[^\w\s'-]+"
    tokens = regexp_tokenize(
        input_text,
        pattern=token_pattern,
        gaps=False,
        discard_empty=True,
    )
    return tokens

In [None]:
def tokenize_sentence_tokens(sentences):
    """Tokenize every sentence of a review; returns one token list per sentence."""
    return [NLTK_special_chars_excluded_tokenizer(sentence) for sentence in sentences]

In [None]:
# Tokenize the sentences of every review, one review per task; each entry of the
# new column is a list (sentences) of token lists.
df['review_translate_sentences_tokens'] = multiprocess_func(df['review_translate_sentences'].values, 
                  func=tokenize_sentence_tokens,
                  gather_func=lambda x: x,
                  to_split=False)

# Add spaces between chars 

In [None]:
from functools import partial

In [None]:
def apply_func_sent(sentences, func):
    """Call ``func`` on every sentence and return the results as a list, in order."""
    return list(map(func, sentences))

In [None]:
def spacing_between_chars_tokens(tokens):
    """Split every token around its symbol characters and flatten the result.

    Each token is passed to ``spacing_between_chars``; the pieces of all tokens
    are concatenated in order and empty pieces are dropped.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one sentence.

    Returns
    -------
    list of str
        Non-empty pieces as plain Python strings. An empty ``tokens`` gives ``[]``.
    """
    # A plain nested comprehension replaces np.hstack, which raised ValueError for a
    # sentence without tokens and returned numpy.str_ objects instead of str.
    return [piece
            for token in tokens
            for piece in spacing_between_chars(token)
            if piece]

In [None]:
def spacing_between_chars(text):
    """Split ``text`` into pieces around every character that is not alphanumeric or an apostrophe.

    Each such character is padded with a space on both sides, the padded string
    is stripped and then split on single spaces. Because the split is on ``' '``
    (not on arbitrary whitespace), the returned list can contain empty strings.
    """
    padded = ''.join(
        char if char.isalnum() or char == "'" else ' ' + char + ' '
        for char in text
    )
    return padded.strip().split(' ')

In [None]:
# Review-level variant: applies spacing_between_chars_tokens to each sentence's token list.
# Built with partial (not a lambda) — presumably so it can be pickled for the worker pool.
spacing_between_chars_sentences = partial(apply_func_sent, func=spacing_between_chars_tokens)

In [None]:
# Split tokens around symbol characters for every sentence of every review, one review per task
df['review_translate_sentences_tokens'] = multiprocess_func(df['review_translate_sentences_tokens'].values, 
                  func=spacing_between_chars_sentences,
                  gather_func=lambda x: x,
                  to_split=False)

# Find pos tags

In [None]:
import pymorphy2

In [None]:
# Morphological analyzer configured with lang='uk'
morph = pymorphy2.MorphAnalyzer(lang='uk')

In [None]:
def pos_tagging(ent):
    """Tag every word of a chunk of reviews with the ``_POS`` of its first parse.

    Parameters
    ----------
    ent : tuple
        ``(batch, morph)``: ``batch`` is a sequence of reviews, each a sequence of
        sentences, each a sequence of words; ``morph`` is an analyzer whose
        ``parse(word)`` returns a list of parses.

    Returns
    -------
    list
        Same nesting as ``batch``, with every word replaced by
        ``morph.parse(word)[0].tag._POS``.
    """
    reviews, analyzer = ent

    def tag_sentence(words):
        # only the first parse returned by the analyzer is used
        return [analyzer.parse(word)[0].tag._POS for word in words]

    return [[tag_sentence(sentence) for sentence in review] for review in reviews]

In [None]:
# POS-tag all reviews in chunks of 100 on 12 processes. Every task is the tuple
# (chunk, morph), so the analyzer is shipped with each chunk; the default gather
# function flattens the per-chunk results back to one entry per review.
df['review_translate_sentences_pos'] = multiprocess_func(df['review_translate_sentences_tokens'].values, 
                  func=pos_tagging,
                  gather_func=None,
                  to_split=True,
                  chunk_size=100,
                  n_processes=12,
                  additional_inputs=[morph])

# Find lemmas

In [None]:
def lemmatizing(ent):
    """Replace every word of a chunk of reviews with the normal form of its first parse.

    Parameters
    ----------
    ent : tuple
        ``(batch, morph)``: ``batch`` is a sequence of reviews, each a sequence of
        sentences, each a sequence of words; ``morph`` is an analyzer whose
        ``parse(word)`` returns a list of parses.

    Returns
    -------
    list
        Same nesting as ``batch``, with every word replaced by
        ``morph.parse(word)[0].normal_form``.
    """
    reviews, analyzer = ent

    def lemmatize_sentence(words):
        # only the first parse returned by the analyzer is used
        return [analyzer.parse(word)[0].normal_form for word in words]

    return [[lemmatize_sentence(sentence) for sentence in review] for review in reviews]

In [None]:
# Lemmatize all reviews in chunks of 100 on 12 processes. Every task is the tuple
# (chunk, morph); the default gather function flattens the per-chunk results back
# to one entry per review.
df['review_translate_sentences_lemma'] = multiprocess_func(df['review_translate_sentences_tokens'].values, 
                  func=lemmatizing,
                  gather_func=None,
                  to_split=True,
                  chunk_size=100,
                  n_processes=12,
                  additional_inputs=[morph])

In [None]:
# Inspect one fully processed row
df.head(1)

# Delete plain questions

In [None]:
def is_question_sentences(ent):
    """Flag each sentence of one review as a plain question or not.

    ``ent`` is the pair ``(sentences, tags)``: the token lists of the review's
    sentences and the POS-tag lists at the same positions. Returns one boolean
    per sentence, as decided by ``is_question``.
    """
    sentences, tags = ent
    return [is_question(sentence, tags[position])
            for position, sentence in enumerate(sentences)]

In [None]:
def is_question(words, tags):
    """Heuristically decide whether a tokenized sentence is a plain question.

    Parameters
    ----------
    words : sequence of str
        Tokens of the sentence.
    tags : sequence
        POS tag of each token, aligned with ``words``.

    Returns
    -------
    bool
        ``False`` when the sentence is empty or does not end with ``'?'``.
        Otherwise the tags of the tokens ``'.'``, ``','``, ``'!'`` and ``'?'`` are
        ignored and the sentence is a question when
        * exactly one tag remains, or
        * the last remaining tag is ``VERB``/``INFN``, or
        * the last remaining tag is ``GRND`` directly preceded by ``VERB``/``INFN``.

    Notes
    -----
    The original code also had a branch for "first tag is ``PRCL`` and last tag is
    ``VERB``/``INFN``". It could never be reached, because the preceding branch
    already returns ``True`` for every sentence ending in ``VERB``/``INFN``. It is
    therefore folded into that rule; results are unchanged.
    """
    # An empty sentence has no last token to inspect (words[-1] would raise IndexError).
    if len(words) == 0 or words[-1] != '?':
        return False

    punctuation = ('.', ',', '!', '?')
    content_tags = [tag for word, tag in zip(words, tags) if word not in punctuation]

    if len(content_tags) == 1:
        return True
    if len(content_tags) < 2:
        return False

    verb_like = ('VERB', 'INFN')
    last_tag, previous_tag = content_tags[-1], content_tags[-2]
    return last_tag in verb_like or (last_tag == 'GRND' and previous_tag in verb_like)


In [None]:
# Pair every review's token lists with its POS-tag lists: one (tokens, tags) tuple per review
to_input = list(zip(df['review_translate_sentences_tokens'].values.tolist(), 
            df['review_translate_sentences_pos'].values.tolist()))

In [None]:
# One task per review; each result is a list of booleans, one per sentence
questions_mask = multiprocess_func(to_input, 
                  func=is_question_sentences,
                  gather_func=lambda x: x,
                  to_split=False,
                  n_processes=12,
                  )

In [None]:
# Per-sentence question flags stored alongside each review (assigned by position)
df['is_question'] = questions_mask

In [None]:
# Drop the reviews in which every sentence was flagged as a question.
# all() of an empty list is True, so a review without any sentences is dropped as well.
df = df[~df['is_question'].apply(all)]

In [None]:
# Write the processed frame to the working directory, without the index column
df.to_csv('processed_data.csv', index=False)