In [24]:
import numpy as np
import pandas as pd

import torch
import torch.utils.data

In [2]:
# Both sentiment files are read with the same options: ';' as separator,
# only the first five columns, explicit column names, and 'id' as the index.
read_options = dict(
    names=['id', 'date', 'user', 'text', 'sentiment'],
    index_col=0,
    sep=';',
    usecols=[0, 1, 2, 3, 4],
)

data_pos = pd.read_csv('/media/data/nlp/sentiment/rus-mokoron/positive.csv', **read_options)
data_neg = pd.read_csv('/media/data/nlp/sentiment/rus-mokoron/negative.csv', **read_options)

# Stack positive rows on top of negative rows; the original 'id' index is kept.
data = pd.concat([data_pos, data_neg])

In [3]:
# Peek at five randomly chosen rows of the combined frame.
data.sample(5)

Unnamed: 0_level_0,date,user,text,sentiment
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
413821004637806593,1387497590,Artem_NS,И как в этом шлюхообразном мире можно найти св...,-1
410848220143972352,1386788823,missdevine_,"среда, 18.12, волейбол. учителя vs лицеисты :)...",1
410792065178812417,1386775435,pennyroyal_teea,@cykaaaa_ я в рязани хочу учиться..в.москву бу...,1
416426390217834496,1388118762,ViktoriaMshn,@strikalo в жизни оскорблять не так просто:(,-1
411127764864217088,1386855472,AvramenkoAnuta,"У меня попугай подпевает музыке, которую я слу...",1


## Train-val-test split

In [8]:
# Total number of rows in the combined frame.
len(data)

226834

In [12]:
# Fractions of the rows held out for testing and validation.
TEST_SIZE = 0.15
VAL_SIZE = 0.15
# Fixed seed so that re-running the notebook reproduces the same split.
SPLIT_SEED = 42

indices = list(data.index)
# A private RandomState makes the shuffle reproducible without reseeding
# numpy's global generator for the rest of the notebook.
np.random.RandomState(SPLIT_SEED).shuffle(indices)

test_size = int(len(indices) * TEST_SIZE)
val_size = int(len(indices) * VAL_SIZE)
# Train takes whatever is left, so rounding never drops a row.
train_size = len(indices) - test_size - val_size

train_idxs = indices[:train_size]
valid_idxs = indices[train_size:train_size+val_size]
test_idxs = indices[train_size+val_size:]

# The three slices must partition the shuffled index list exactly.
assert len(train_idxs) + len(valid_idxs) + len(test_idxs) == len(indices)

print('Train: ', train_size)
print('Valid: ', val_size)
print('Test : ', test_size)

Train:  158784
Valid:  34025
Test :  34025


In [14]:
# Materialise one frame per split by label-based lookup of its index list.
train_data, valid_data, test_data = (
    data.loc[split_idxs]
    for split_idxs in (train_idxs, valid_idxs, test_idxs)
)

In [15]:
basepath = '/media/data/nlp/sentiment/rus-mokoron/splits/'

# Write each split to its own CSV file under basepath.
for split_filename, split_frame in (('train.csv', train_data),
                                    ('validation.csv', valid_data),
                                    ('test.csv', test_data)):
    split_frame.to_csv(basepath+split_filename)

# Dataset

In [56]:
import numpy as np
import pandas as pd

import torch
import torch.utils.data

# Character vocabulary for the one-hot encoding. The position of an entry in
# this list is its one-hot column; index 0 is the out-of-vocabulary token.
# The backslash is written as '\\' because '\|' is not a valid escape sequence.
# NOTE(review): the Cyrillic run below has no 'ш', so that letter is encoded
# as '<UNK>'; space, '’' and '-' also appear twice. Left unchanged here because
# editing the list changes the one-hot width — confirm before fixing.
ALPHABET = ['<UNK>', '\n'] + list(""" абвгдеёжзийклмнопрстуфхцчщъыьэюя0123456789-,;.!?:'’’/\\|_@#$%ˆ&* ̃‘+-=<>()[]{}""")
ALPHABET += list('abcdefghijklmnopqrstuvwxyz')

class MokoronDatasetOneHot(torch.utils.data.Dataset):
    """
    Character-level one-hot dataset over a sentiment CSV file.

    Each item is a pair ``(tensor, label)``. ``tensor`` is a
    ``maxlen x len(alphabet)`` FloatTensor with one row per character of the
    lowercased text; ``label`` is true when the row's ``sentiment`` equals 1.
    Rows past the end of the text stay all-zero (zero vector for padding), and
    characters missing from the alphabet are encoded as ``'<UNK>'``.
    """

    def __init__(self, filepath, maxlen=512, noise_level=0):
        """
        Args:
            filepath: CSV file loaded with ``pd.read_csv``; its rows are
                accessed through the ``text`` and ``sentiment`` columns.
            maxlen: number of characters encoded per text; the rest is cut off.
            noise_level: probability used by ``_noise_generator`` for dropping
                and inserting characters; 0 disables noise.
        """
        self.alphabet = ALPHABET

        self.dataframe = pd.read_csv(filepath)
        self.maxlen = maxlen
        self.noise_level = noise_level
        # For duplicated alphabet entries the last position wins.
        self.char2int = {char: i for i, char in enumerate(self.alphabet)}

    def __len__(self):
        """Return the number of rows in the loaded frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(one-hot tensor, label)`` for the row at position ``idx``."""
        line = self.dataframe.iloc[idx]
        text = line.text
        # Noise is applied to the raw string, before lowercasing and encoding.
        if self.noise_level > 0:
            text = self._noise_generator(text)
        text = self._preprocess_text_nobatch(text)
        label = (line.sentiment == 1.)
        return text, label

    def _noise_generator(self, string):
        """
        Return ``string`` with characters randomly dropped and inserted.

        Each character is kept with probability ``1 - noise_level``, and,
        independently, a random alphabet character is appended after its
        position with probability ``noise_level``.
        """
        # Only single-character entries are inserted, so the multi-character
        # '<UNK>' token never ends up in the text as literal characters.
        candidates = [entry for entry in self.alphabet if len(entry) == 1]
        noised = []
        for c in string:
            if np.random.random() > self.noise_level:
                noised.append(c)
            if candidates and np.random.random() < self.noise_level:
                noised.append(candidates[np.random.randint(len(candidates))])
        return ''.join(noised)

    def _one_hot(self, char):
        """Return the one-hot numpy vector for ``char`` (``'<UNK>'`` if unknown)."""
        zeros = np.zeros(len(self.alphabet))
        zeros[self.char2int.get(char, self.char2int['<UNK>'])] = 1.
        return zeros

    def _preprocess_text_nobatch(self, text):
        """Lowercase ``text`` and encode its first ``maxlen`` characters."""
        text = text.lower()
        one_hotted_text = np.zeros((self.maxlen, len(self.alphabet)))
        unk_index = self.char2int['<UNK>']
        # Slicing truncates long texts; short texts leave trailing zero rows.
        for i, char in enumerate(text[:self.maxlen]):
            one_hotted_text[i, self.char2int.get(char, unk_index)] = 1.

        return torch.FloatTensor(one_hotted_text)

    def onehot2text(self, one_hotted_text, show_pad=False):
        """
        Decode a 2-D one-hot tensor back into a string.

        All-zero rows are padding: they are rendered as ``'<PAD>'`` when
        ``show_pad`` is true and skipped otherwise.
        """
        max_values, indices = torch.max(one_hotted_text, 1)
        pad_symbol = '<PAD>' if show_pad else ''
        pieces = []
        for value, index in zip(max_values, indices):
            if value == 0:
                pieces.append(pad_symbol)
            else:
                # int() turns the 0-dim index tensor into a plain list index.
                pieces.append(self.alphabet[int(index)])
        return ''.join(pieces)


In [57]:
# Build the dataset from the saved train split with the default maxlen and no noise.
dataset = MokoronDatasetOneHot(filepath=basepath+'train.csv')

In [58]:
# Fetch the first sample: the one-hot encoded text and its label.
text, sentiment = dataset[0]
text, sentiment

(
     0     0     0  ...      0     0     0
     0     0     0  ...      0     0     0
     0     0     0  ...      0     0     0
        ...          ⋱          ...       
     0     0     0  ...      0     0     0
     0     0     0  ...      0     0     0
     0     0     0  ...      0     0     0
 [torch.FloatTensor of size 512x106], False)

In [59]:
# Raw text of the first row, for comparison with the decoded tensor below.
dataset.dataframe.iloc[0].text

'@SPB_citizen @gruppa_GS Сдулися оба.((( \nОдно радует: в семействе Гудковых не наблюдается конфликта отцов и детей.'

In [60]:
# Round-trip check: decoding should give back the lowercased text (padding rows are skipped by default).
dataset.onehot2text(text)

'@spb_citizen @gruppa_gs сдулися оба.((( \nодно радует: в семействе гудковых не наблюдается конфликта отцов и детей.'