In [6]:
import numpy as np
import pandas as pd

import seaborn
import matplotlib.pyplot as plt

from tqdm import tqdm
seaborn.set()
%matplotlib inline

In [2]:
# https://github.com/kiddick/speller
import requests


class Speller(object):
    """Minimal client for the Yandex.Speller ``checkText`` JSON endpoint.

    The request is sent lazily, the first time :attr:`answer` (or anything
    built on it, such as :attr:`correct`) is accessed, and the decoded
    response is cached on the instance so the service is contacted at most
    once per object.

    Args:
        text: The text to check.
        options: Optional value sent as the ``options`` form field.
        lang: Optional value sent as the ``lang`` form field.
        format_text: Optional value sent as the ``format`` form field.
    """

    # NOTE(review): plain HTTP endpoint; consider switching to HTTPS after
    # confirming the service accepts it.
    service = 'http://speller.yandex.net/services/spellservice.json/checkText'

    # Seconds to wait for the service before giving up. ``requests`` never
    # times out unless told to, so one stalled connection would otherwise
    # block a whole batch run. Override on a subclass or instance if needed.
    timeout = 30

    def __init__(self, text, options=None, lang=None, format_text=None):
        self.text = text
        self.options = options
        self.lang = lang
        self.format_text = format_text
        # Cached decoded response; ``None`` means "not requested yet".
        self._answer = None

    def check(self):
        """POST the text to the service and return the decoded JSON body.

        Only the optional fields that were actually provided (truthy) are
        included in the form data.

        Raises:
            requests.HTTPError: The service answered with a 4xx/5xx status.
            requests.Timeout: No response arrived within :attr:`timeout`.
        """
        data = {'text': self.text}
        if self.options:
            data['options'] = self.options
        if self.lang:
            data['lang'] = self.lang
        if self.format_text:
            data['format'] = self.format_text
        response = requests.post(url=self.service, data=data, timeout=self.timeout)
        # Fail loudly on HTTP errors instead of treating an error payload
        # as if it were a list of misspellings.
        response.raise_for_status()
        return response.json()

    @property
    def answer(self):
        """Decoded service response, fetched on first access and then cached."""
        if self._answer is None:
            self._answer = self.check()
        return self._answer

    @property
    def correct(self):
        """``True`` when the service response is empty (nothing was flagged)."""
        return not self.answer

    @property
    def spellsafe(self):
        """Corrected form of the input; must be provided by subclasses."""
        raise NotImplementedError("Subclasses should implement this!")


class Word(Speller):
    """Spell-check a single word.

    Only the first entry of the service response is consulted.
    """

    @property
    def variants(self):
        """Suggestions for the word, or ``None`` when nothing was flagged.

        Reads the ``'s'`` field of the first response entry.
        """
        if self.correct:
            return None
        return self.answer[0]['s']

    @property
    def spellsafe(self):
        """First suggested replacement, or ``None`` when there is none.

        ``None`` is returned both when the word was not flagged and when it
        was flagged without any suggestion; use :attr:`correct` to tell the
        two apart. The second case previously raised ``IndexError`` on the
        empty suggestion list.
        """
        suggestions = self.variants
        if not suggestions:
            return None
        return suggestions[0]


class Text(Speller):
    """Spell-check a piece of text containing any number of words."""

    @property
    def spellsafe(self):
        """The input text with each flagged word replaced by its first suggestion.

        Flagged words without suggestions are left as they are. Replacement
        is done in a single pass and only where the flagged word stands on
        its own (not directly adjacent to another word character), so:

        * a short misspelling is not rewritten inside longer, unrelated
          words that merely contain it as a substring;
        * text inserted by one correction is never re-matched and altered
          by another correction.
        """
        import re  # local import: keeps this block self-contained

        changes = {
            el['word']: el['s'][0]
            for el in self.answer
            if el['word'] and el['s']
        }
        if not changes:
            return self.text

        # Longest alternatives first so the regex alternation prefers the
        # most specific match when one flagged word is a prefix of another.
        ordered = sorted(changes, key=len, reverse=True)
        pattern = re.compile(
            r'(?<!\w)(?:' + '|'.join(re.escape(word) for word in ordered) + r')(?!\w)'
        )
        # A callable replacement inserts the suggestion verbatim (no
        # backslash/group-reference processing).
        return pattern.sub(lambda match: changes[match.group(0)], self.text)

    @property
    def errors(self):
        """List of the words flagged by the service, in response order."""
        return [el['word'] for el in self.answer]

def spellcheck_dataframe_simple(dataframe, text_field='text', lang=None):
    """Return the spell-corrected text of every row, in row order.

    Each row's ``text_field`` value is wrapped in ``Text`` (one service
    request per row) and its ``spellsafe`` result collected. Progress is
    shown with a ``tqdm`` bar that is cleared when the loop finishes.

    Args:
        dataframe: Frame holding the texts to correct.
        text_field: Name of the column that contains the text.
        lang: Language value forwarded to ``Text``.

    Returns:
        A list with one corrected text per row of ``dataframe``.
    """
    rows = tqdm(dataframe.iterrows(), total=len(dataframe), leave=False)
    return [Text(row[text_field], lang=lang).spellsafe for _, row in rows]

# Quick smoke test of Text.spellsafe on a deliberately misspelled sentence.
# Accessing .spellsafe sends a request to the spelling service.
print(Text('42 is a cUl maagic namber').spellsafe)

42 is a cool magic number


### Train

In [12]:
# Load the train split, keeping only the 'text' and 'label' columns,
# and display two random rows as a sanity check.
filepath = '/data/classification/SentiRuEval_data/all_data/train.csv'
data = pd.read_csv(filepath)[['text', 'label']]
data.sample(2)

Unnamed: 0,text,label
13956,063 1185319 новый номер!!!!!!! МТС не работает...,negative
11742,Час была в Билайн.Написала заявление.Столько к...,negative


In [13]:
# Show the distinct values present in the 'label' column.
data.label.unique()

array(['neutral', 'positive', 'both', 'negative'], dtype=object)

In [7]:
# Spell-check the 'text' column (lang='ru') and keep the result next to the original.
spellchecked = spellcheck_dataframe_simple(data, text_field='text', lang='ru')
data['text_spellchecked'] = spellchecked

# Fraction of rows whose text differs after spell-checking.
print('changed texts ratio: ', data.text.ne(data.text_spellchecked).sum() / len(data))

# Caution: `filepath` is the path set by the loading cell, so this overwrites
# that CSV with only the columns currently held in `data`.
data.to_csv(filepath, index=False)



changed texts ratio:  0.3518912529550827


### Valid

In [8]:
# Load the valid split, keeping only the 'text' and 'label' columns,
# and display two random rows as a sanity check.
filepath = '/data/classification/SentiRuEval_data/all_data/valid.csv'
data = pd.read_csv(filepath)[['text', 'label']]
data.sample(2)

Unnamed: 0,text,label
1150,@MatveyevIgor в Москве 3G то у Билайна нормаль...,negative
114,"@Beeline_RUS домашний, всё, всё норм",neutral


In [9]:
# Spell-check the 'text' column (lang='ru') and keep the result next to the original.
spellchecked = spellcheck_dataframe_simple(data, text_field='text', lang='ru')
data['text_spellchecked'] = spellchecked

# Fraction of rows whose text differs after spell-checking.
print('changed texts ratio: ', data.text.ne(data.text_spellchecked).sum() / len(data))

# Caution: `filepath` is the path set by the loading cell, so this overwrites
# that CSV with only the columns currently held in `data`.
data.to_csv(filepath, index=False)



changed texts ratio:  0.3851063829787234


### Test

In [10]:
# Load the test split, keeping only the 'text' and 'label' columns,
# and display two random rows as a sanity check.
# NOTE(review): the recorded output of this cell starts with a truncated
# warning fragment; possibly a pandas warning raised while reading the file.
# Confirm and address it if so.
filepath = '/data/classification/SentiRuEval_data/all_data/test.csv'
data = pd.read_csv(filepath)[['text', 'label']]
data.sample(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,text,label
17265,RT @stevenudemejab: #Сбербанк - калькулятор кр...,neutral
28148,@ru_mts Интернет у меня в планшете не работает...,neutral


In [11]:
# Spell-check the 'text' column (lang='ru') and keep the result next to the original.
spellchecked = spellcheck_dataframe_simple(data, text_field='text', lang='ru')
data['text_spellchecked'] = spellchecked

# Fraction of rows whose text differs after spell-checking.
print('changed texts ratio: ', data.text.ne(data.text_spellchecked).sum() / len(data))

# Caution: `filepath` is the path set by the loading cell, so this overwrites
# that CSV with only the columns currently held in `data`.
data.to_csv(filepath, index=False)



changed texts ratio:  0.34081681394793406
