In [4]:
# https://github.com/kiddick/speller
import requests


class Speller(object):
    service = 'http://speller.yandex.net/services/spellservice.json/checkText'

    def __init__(self, text, options=None, lang=None, format_text=None):
        self.text = text
        self.options = options
        self.lang = lang
        self.format_text = format_text
        self._answer = None

    def check(self):
        data = {'text': self.text}
        if self.options:
            data['options'] = self.options
        if self.lang:
            data['lang'] = self.lang
        if self.format_text:
            data['format'] = self.format_text
        answer = requests.post(url=self.service, data=data).json()
        return answer

    @property
    def answer(self):
        if self._answer is None:
            self._answer = self.check()
        return self._answer

    @property
    def correct(self):
        return not self.answer

    @property
    def spellsafe(self):
        raise NotImplementedError("Subclasses should implement this!")


class Word(Speller):

    @property
    def variants(self):
        if self.correct:
            return
        return self.answer[0]['s']

    @property
    def spellsafe(self):
        if self.correct:
            return
        return self.variants[0]


class Text(Speller):

    @property
    def spellsafe(self):
        changes = {el['word']: el['s'][0] for el in self.answer if len(el['s']) > 0}
        result = self.text
        for wrong, fixed in changes.items():
            result = result.replace(wrong, fixed)
        return result

    @property
    def errors(self):
        return [el['word'] for el in self.answer]

print(Text('42 is a cUl maagic namber').spellsafe)

42 is a cool magic number


In [5]:
import pandas as pd
from tqdm import tqdm

In [7]:
def spellcheck_dataframe(dataframe, text_field='text', lang=None):
    fixed_texts = []

    total = len(dataframe)
    for idx, line in tqdm(dataframe.iterrows(), total=total, leave=False):
        fixed_text = Text(line[text_field], lang=lang).spellsafe
        fixed_texts.append({
            'text_spellchecked': fixed_text,
            'text_original': line.text,
            'sentiment': int(line.sentiment == 'pos')
        })

    return fixed_texts

In [10]:
def spellcheck_dataframe_simple(dataframe, text_field='text', lang=None):
    fixed_texts = []

    total = len(dataframe)
    for idx, line in tqdm(dataframe.iterrows(), total=total, leave=False):
        fixed_text = Text(line[text_field], lang=lang).spellsafe
        fixed_texts.append(fixed_text)

    return fixed_texts

# IMDB

In [28]:
data_train = pd.read_csv('/media/data/nlp/sentiment/IMDB/splits/train.csv')
data_val = pd.read_csv('/media/data/nlp/sentiment/IMDB/splits/validation.csv')
data_test = pd.read_csv('/media/data/nlp/sentiment/IMDB/splits/test.csv')

In [26]:
data_test.sample()

Unnamed: 0.1,Unnamed: 0,sentiment,text_original,text_spellchecked
8006,8006,0,i can't believe i missed this one. made in 197...,i can't believe i missed this one. made in 197...


In [16]:
from tqdm import tqdm

In [19]:
data_train_sp = pd.DataFrame(spellcheck_dataframe(data_train, lang='en'))
data_val_sp = pd.DataFrame(spellcheck_dataframe(data_val, lang='en'))
data_test_sp = pd.DataFrame(spellcheck_dataframe(data_test, lang='en'))


  0%|          | 0/21250 [00:00<?, ?it/s][A
  0%|          | 2/21250 [00:00<17:56, 19.74it/s][A
  0%|          | 5/21250 [00:00<14:47, 23.94it/s][A
Exception in thread Thread-7:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/phobos_aijun/.virtualenvs/pytorch-env/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



In [30]:
data_train_sp['sentiment'] = data_train['sentiment'] == 'pos'
data_val_sp['sentiment'] = data_val['sentiment'] == 'pos'
data_test_sp['sentiment'] = data_test['sentiment'] == 'pos'


In [22]:
data_train_sp.sample()

Unnamed: 0,sentiment,text_original,text_spellchecked
15263,0,anyone who knows me even remotely can tell you...,anyone who knows me even remotely can tell you...


In [23]:
sum(data_train_sp['text_original'] != data_train_sp['text_spellchecked'])

19739

In [32]:
data_train_sp.to_csv('/media/data/nlp/sentiment/IMDB/splits/train.csv')
data_val_sp.to_csv('/media/data/nlp/sentiment/IMDB/splits/validation.csv')
data_test_sp.to_csv('/media/data/nlp/sentiment/IMDB/splits/test.csv')

# SST

In [27]:
data_train = pd.read_csv('/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/train.csv')
data_val = pd.read_csv('/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/validation.csv')
data_test = pd.read_csv('/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/test.csv')

In [20]:
data_train_sp = pd.DataFrame(spellcheck_dataframe(data_train))
data_val_sp = pd.DataFrame(spellcheck_dataframe(data_val))
data_test_sp = pd.DataFrame(spellcheck_dataframe(data_test))

100%|██████████| 6920/6920 [03:27<00:00, 33.30it/s]
100%|██████████| 872/872 [00:24<00:00, 35.91it/s]
100%|██████████| 1821/1821 [00:56<00:00, 32.43it/s]


In [32]:
data_train_sp.to_csv('/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/train.csv')
data_val_sp.to_csv('/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/validation.csv')
data_test_sp.to_csv('/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/test.csv')

In [26]:
pd.read_csv('/media/data/nlp/sentiment/stanfordSentimentTreebank/splits/train.csv').sample(5)

Unnamed: 0.1,Unnamed: 0,sentiment,text_original,text_spellchecked
4772,4772,negative,feels less like a change in -lrb- herzog 's -r...,feels less like a change in -lrb- herzog 's -r...
5536,5536,negative,just is n't as weird as it ought to be .,just isn't as weird as it ought to be .
6714,6714,negative,a complete waste of time .,a complete waste of time .
5678,5678,negative,depressingly thin and exhaustingly contrived .,depressingly thin and exhaustingly contrived .
1140,1140,positive,a slight but sweet film .,a slight but sweet film .


# Mokoron

In [4]:
data_pos = pd.read_csv('../data/ru-mokoron/positive.csv',
                   names=['id', 'date', 'user', 'text', 'sentiment'], index_col=0,
                   sep=';', usecols=[0, 1, 2, 3, 4])
data_neg = pd.read_csv('../data/ru-mokoron/negative.csv',
                   names=['id', 'date', 'user', 'text', 'sentiment'], index_col=0,
                   sep=';', usecols=[0, 1, 2, 3, 4])

data = pd.concat([data_pos, data_neg])

In [5]:
data.sample(5)

Unnamed: 0_level_0,date,user,text,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
410790781600165888,1386775129,NKAnnet,@Marinkatwit ага смотрела:-) :-) крутые песенк...,1
423086524247580672,1389706662,enter_ru,@gimlis Как товар надлежащего качества без бра...,-1
414620033315532800,1387688093,n_merkulova,"@Julia_Kantaeva фуфуфу, холод, мороз! Не хочу ...",-1
411194442373021696,1386871369,one_litter,@red_unicorn0 интересно как ты представляешь м...,1
409830126382493697,1386546091,Andy_Igorevich,"Интересно, успею я набрать 1000 твитов до Ново...",1


In [6]:
data.shape, data_pos.shape, data_neg.shape

((226834, 4), (114911, 4), (111923, 4))

In [8]:
from tqdm import tqdm_notebook

In [9]:
fixed_texts = []

total = len(data)
for idx, line in tqdm(data.iterrows(), total=total):
    fixed_text = Text(line.text).spellsafe
    fixed_texts.append({
        'id': idx,
        'text': fixed_text,
        'original_text': line.text,
        'sentiment': line.sentiment
    })

100%|██████████| 226834/226834 [1:53:39<00:00, 33.26it/s]


In [10]:
fixed_texts_df = pd.DataFrame(fixed_texts)

In [15]:
fixed_texts_df.sample(10)

Unnamed: 0,id,original_text,sentiment,text
183018,417350736423292928,ЗБС меня мама наказала (( на улицу нельзя((,-1,ЗБС меня мама наказала (( на улицу нельзя((
216959,423694828938203136,RT @NPlakhotnik: @KLitvinenko_O @Alya__Seal @L...,-1,RT @NPlakhotnik: @KLitvinenko_O @Alya__Seal @L...
155129,413927178020343808,ЕЩЕ У МЕНЯ ГОРИТ ЛИЦО СЪЕШЬТЕ ЕГО ИЛИ ЧТО НИБД...,-1,ЕЩЕ У МЕНЯ ГОРИТ ЛИЦО СЪЕШЬТЕ ЕГО ИЛИ ЧТО НИБД...
182299,417305860172296192,@MishaKrupin Бля вот живут же ебаные евреи(,-1,@MishaKrupin Бля вот живут же ебаные евреи(
164486,415101071380328448,"когда я уже вместо того ,чтобы идти на репетиц...",-1,"когда я уже вместо того ,чтобы идти на репетиц..."
2747,409041916274438144,"RT @firenetwiffi: Дядя который снимался в "" ос...",1,"RT @firenetwiffi: Дядя который снимался в "" ос..."
183893,417516997996711936,"@MrsLovegood Пусть это будет уткой, Господи;(",-1,"@MrsLovegood Пусть это будет уткой, Господи;("
208907,422390663712231425,"так хочется многое рассказать ,но не могу, так...",-1,"так хочется многое рассказать ,но не могу, так..."
82114,410763357562871809,@russian_bruin @Pavlyxin @fcdin самое забавное...,1,@Russian_bruin @Pavlyxin @fcdin самое забавное...
33259,409756592981434368,"Спасибо блять одной пизде, что я страдаю хуйне...",1,"Спасибо блять одной пизде, что я страдаю хуйне..."


In [16]:
fixed_texts_df.to_csv('../data/ru-mokoron/spellchecked.csv')

# Airline tweets

In [3]:
ls ../../data/airline_tweets/

test.csv  train.csv  validation.csv


In [8]:
pd.read_csv('../../data/airline_tweets/train.csv').sample(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
2099,569656088817233921,negative,1.0,Can't Tell,0.6735,US Airways,,JK47theweapon,,1,@USAirways You suck in Philadelphia. Let us o...,,2015-02-22 16:32:49 -0800,Philly Area,Eastern Time (US & Canada)
5498,569995335893848064,neutral,0.639,,,American,,scottfmurphy,,0,@AmericanAir boom. http://t.co/PzGc6Jch7n,,2015-02-23 15:00:52 -0800,"New York, NY",Central Time (US & Canada)


In [11]:
for filename in  ['test.csv', 'train.csv', 'validation.csv']:
    filepath = '../../data/airline_tweets/' + filename
    data = pd.read_csv(filepath)
    spellchecked = spellcheck_dataframe_simple(data)
    data['text_spellchecked'] = spellchecked
    print('changed texts ratio: ', sum(data.text != data.text_spellchecked) / len(data))
    data.to_csv(filepath, index=False)

  0%|          | 4/10099 [00:00<04:39, 36.09it/s]

changed texts ratio:  1.0


  0%|          | 0/2164 [00:00<?, ?it/s]

changed texts ratio:  1.0


                                                   

changed texts ratio:  1.0




In [23]:
for filename in  ['test.csv', 'train.csv', 'validation.csv']:
    filepath = '../../data/airline_tweets/' + filename
    data = pd.read_csv(filepath)
#     spellchecked = spellcheck_dataframe_simple(data)
#     data['text_spellchecked'] = spellchecked
    print('changed texts ratio: ', sum(data.text != data.text_spellchecked) / len(data))
#     data.to_csv(filepath, index=False)

changed texts ratio:  0.7315157116451017
changed texts ratio:  0.739479156352114
changed texts ratio:  0.737985212569316


In [24]:
pd.read_csv('../../data/airline_tweets/train.csv').sample(2)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,text_spellchecked
7711,569658322732756992,neutral,0.7067,,0.0,Southwest,,Ashishwadhwa9,,0,@SouthwestAir -U dont have Atlanta to San fran...,,2015-02-22 16:41:42 -0800,,,@South west Air -U don't have Atlanta to San F...
5388,568282478076669952,negative,1.0,Customer Service Issue,0.6739,US Airways,,Gregg_Silver,,0,@USAirways no. Other than being on my credit c...,,2015-02-18 21:34:35 -0800,,Quito,@US Airways no. Other than being on my credit ...


# Exp

In [3]:
import requests

In [28]:
params = {'text': 'какого цвеета ты сегодя?', 'lang': 'ru,en'}
res = requests.get('http://speller.yandex.net/services/spellservice.json/checkText', params=params)

In [32]:
res.json()

[{'code': 1,
  'col': 7,
  'len': 6,
  'pos': 7,
  'row': 0,
  's': ['цвета'],
  'word': 'цвеета'},
 {'code': 1,
  'col': 17,
  'len': 6,
  'pos': 17,
  'row': 0,
  's': ['сегодня'],
  'word': 'сегодя'}]

In [33]:
t = 'какого цвеета ты сегодя?'

In [35]:
t[7:7+6]

'цвеета'