In [1]:
import numpy as np
import pandas as pd

import torch
import torchtext

from tqdm import tqdm

In [7]:
%%time
MAX_WORD_LEN = 8  # characters per word
MAX_TEXT_LEN = 256  # words per text

# Field for the review text: non-sequential, no vocabulary, identity tokenizer, lower-casing on.
text_field = torchtext.data.Field(
    sequential=False,
    use_vocab=False,
    tokenize=lambda x: x,
    lower=True,
    include_lengths=False,
    fix_length=MAX_TEXT_LEN,
    tensor_type=torch.FloatTensor,
    batch_first=True,
)

# Field for the sentiment label, likewise non-sequential and without a vocabulary.
label_field = torchtext.data.Field(use_vocab=False, sequential=False)

# Downloads the aclImdb archive when it is not available locally (see the cell output).
train, test = torchtext.datasets.imdb.IMDB.splits(text_field, label_field)

downloading aclImdb_v1.tar.gz
CPU times: user 10.2 s, sys: 2.51 s, total: 12.7 s
Wall time: 1min 1s


In [8]:
# Flatten both torchtext splits into lists of {'text', 'sentiment'} records.
data_train = [{'text': example.text, 'sentiment': example.label} for example in tqdm(train)]
data_test = [{'text': example.text, 'sentiment': example.label} for example in tqdm(test)]

100%|██████████| 25000/25000 [00:00<00:00, 1138951.83it/s]
100%|██████████| 25000/25000 [00:00<00:00, 1143460.34it/s]


In [9]:
# Convert the record lists into DataFrames, one row per record.
data_train, data_test = map(pd.DataFrame, (data_train, data_test))

In [10]:
# Show one random row as a sanity check of the columns.
data_train.sample(n=1)

Unnamed: 0,sentiment,text
13463,neg,"not really spoilers in my opinion, but i wante..."


### Clean

In [11]:
# https://github.com/kiddick/speller
import requests


class Speller(object):
    """Minimal client for the Yandex.Speller ``checkText`` web service.

    The request is sent lazily, on first access of ``answer``, and the decoded
    JSON reply is cached on the instance.

    Attributes:
        service: URL of the ``checkText`` endpoint.
        timeout: Seconds to wait for the service; without a timeout a single
            stalled connection would block a long spell-checking loop forever.
    """
    service = 'http://speller.yandex.net/services/spellservice.json/checkText'
    timeout = 30

    def __init__(self, text, options=None, lang=None, format_text=None):
        """Store the request parameters; no network traffic happens here.

        Args:
            text: Text to be checked.
            options: Optional value sent as the ``options`` form field.
            lang: Optional value sent as the ``lang`` form field.
            format_text: Optional value sent as the ``format`` form field.
        """
        self.text = text
        self.options = options
        self.lang = lang
        self.format_text = format_text
        self._answer = None

    def check(self):
        """POST the text to the service and return the decoded JSON reply.

        Optional parameters are only included in the form data when set.

        Raises:
            requests.exceptions.HTTPError: the service answered with an HTTP
                error status (raised here instead of caching an error payload
                as if it were a spell-check result).
            requests.exceptions.Timeout: no reply within ``timeout`` seconds.
        """
        data = {'text': self.text}
        if self.options:
            data['options'] = self.options
        if self.lang:
            data['lang'] = self.lang
        if self.format_text:
            data['format'] = self.format_text
        response = requests.post(url=self.service, data=data, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    @property
    def answer(self):
        """Service reply for ``text``; fetched on first access, then cached."""
        if self._answer is None:
            self._answer = self.check()
        return self._answer

    @property
    def correct(self):
        """True when the service reply is empty, i.e. nothing was flagged."""
        return not self.answer

    @property
    def spellsafe(self):
        """Corrected form of the input; must be provided by subclasses."""
        raise NotImplementedError("Subclasses should implement this!")


class Word(Speller):
    """Spell-check result for a single word."""

    @property
    def variants(self):
        """Suggestions for the first flagged entry, or None if nothing was flagged."""
        if self.correct:
            return None
        return self.answer[0]['s']

    @property
    def spellsafe(self):
        """First suggested replacement, or None when there is nothing to suggest.

        None is returned both when the word is correct and when the service
        flagged it without offering suggestions; the latter used to raise
        IndexError on the empty suggestion list.
        """
        suggestions = self.variants
        if not suggestions:
            return None
        return suggestions[0]


class Text(Speller):
    """Spell-check result for a whole text."""

    @property
    def spellsafe(self):
        """Return the text with every flagged word replaced by its first suggestion.

        Flagged entries without suggestions are left as they are. Replacement
        is done in one pass and only where the flagged word is not glued to
        other letters, so a correction for "ther" no longer rewrites "other",
        and text inserted by one correction is never re-edited by another.
        """
        import re  # local import keeps this class usable without touching the imports cell

        changes = {el['word']: el['s'][0] for el in self.answer if el['word'] and len(el['s']) > 0}
        if not changes:
            return self.text

        # Longest alternatives first, so "don't" wins over "don" at the same position.
        alternatives = '|'.join(
            re.escape(wrong) for wrong in sorted(changes, key=len, reverse=True)
        )
        # [^\W\d_] matches any letter: the match must not touch a letter on either side.
        pattern = re.compile(r'(?<![^\W\d_])(?:%s)(?![^\W\d_])' % alternatives)
        # A function replacement avoids backslash/group interpretation in the suggestion.
        return pattern.sub(lambda match: changes[match.group(0)], self.text)

    @property
    def errors(self):
        """List of the words flagged by the service, in reply order."""
        return [el['word'] for el in self.answer]

def spellcheck_dataframe_simple(dataframe, text_field='text', lang=None):
    """Spell-check one column of a DataFrame, row by row.

    Args:
        dataframe: Frame holding the texts.
        text_field: Name of the column to spell-check.
        lang: Language value passed through to ``Text``.

    Returns:
        List with the ``spellsafe`` text for each row, in row order.
    """
    rows = tqdm(dataframe.iterrows(), total=len(dataframe), leave=False)
    return [Text(row[text_field], lang=lang).spellsafe for _, row in rows]

# Smoke test: run a short misspelled sentence through the spell checker.
print(Text(text='42 is a cUl maagic namber').spellsafe)

42 is a cool magic number


In [13]:
# Spell-check the training texts and store the result next to the originals.
spellchecked = spellcheck_dataframe_simple(data_train, text_field='text', lang='en')
data_train['text_spellchecked'] = spellchecked
# Share of rows whose text was altered by the spell checker.
print('changed texts ratio: ', data_train.text.ne(data_train.text_spellchecked).mean())

                                                     

changed texts ratio:  0.92936




In [14]:
# Spell-check the test texts and store the result next to the originals.
spellchecked = spellcheck_dataframe_simple(data_test, text_field='text', lang='en')
data_test['text_spellchecked'] = spellchecked
# Share of rows whose text was altered by the spell checker.
print('changed texts ratio: ', data_test.text.ne(data_test.text_spellchecked).mean())

                                                     

changed texts ratio:  0.92664




Validation: hold out 15% of the training set

In [15]:
VAL_SIZE = 0.15  # fraction of the rows held out for validation
SEED = 42  # fixed seed so the split is the same after every kernel restart

indices = list(data_train.index)
# A dedicated RandomState makes the shuffle independent of NumPy's global RNG state.
np.random.RandomState(SEED).shuffle(indices)

val_size = int(len(indices) * VAL_SIZE)
train_size = len(indices) - val_size

# The first train_size shuffled labels go to train, the remainder to validation.
train_idxs = indices[:train_size]
valid_idxs = indices[train_size:]

print('Train: ', len(train_idxs))
print('Valid: ', len(valid_idxs))
# Expected for 25000 rows: Train 21250, Valid 3750

Train:  21250
Valid:  3750


In [16]:
# Both selections read the full frame before either name is rebound.
data_val, data_train = data_train.loc[valid_idxs], data_train.loc[train_idxs]

In [17]:
# Row counts of the validation and training frames; expected (3750, 21250).
data_val.shape[0], data_train.shape[0]

(3750, 21250)

In [20]:
import os

# Create the output directory; an already existing directory is not an error.
os.makedirs(name='/data/classification/IMDB', exist_ok=True)

In [21]:
# Write each split to its own CSV file (the index is written as the first column).
csv_targets = [
    ('/data/classification/IMDB/train.csv', data_train),
    ('/data/classification/IMDB/validation.csv', data_val),
    ('/data/classification/IMDB/test.csv', data_test),
]
for csv_path, frame in csv_targets:
    frame.to_csv(csv_path)

In [22]:
# Show one random validation row, including the spell-checked column.
data_val.sample(n=1)

Unnamed: 0,sentiment,text,text_spellchecked
23900,neg,the first home alone was a decent enough film....,the first home alone was a decent enough film....
