Этот датасет взят из [репозитория](https://github.com/vineetdhanawat/twitter-sentiment-analysis). Конкретно вот [этот файл](https://github.com/vineetdhanawat/twitter-sentiment-analysis/blob/master/datasets/Sentiment%20Analysis%20Dataset.csv)

In [16]:
import numpy as np
import pandas as pd

import torch.utils.data as datautils

In [9]:
FILEPATH = '/media/data/nlp/data/Sentiment Analysis Dataset.csv'

In [5]:
twitter = pd.read_csv('/media/data/nlp/data/Sentiment Analysis Dataset.csv', encoding='latin1')

In [7]:
twitter.sample(20)

Unnamed: 0,ItemID,Sentiment,SentimentText
522779,522791,0,@wdrummond Me neither......damnit!!!!
266323,266335,1,@KrisAllenmusic I'm gonna post an article in o...
442429,442441,0,@PsychicVeg He is fine thx Weather bn really ...
499836,499848,1,"@tommcfly tom, figs are fruits. i doubt miss p..."
824226,824240,1,hmm...... what shall my day consist of? shall ...
160696,160708,0,@deyalo http://twitpic.com/6svrt - definitely ...
161320,161332,1,@Diabeticizme oh ya.. And I've seen videos it...
92813,92825,1,"@ChrisRGross Thabk you, but people are going t..."
573939,573953,0,2 days ago @ ejekt09: nice performances by Ech...
344573,344585,1,@lowestformofwit inFamous


In [10]:
class TwitterDataset(datautils.Dataset):
    def __init__(self, filepath):
        self.data = pd.read_csv(filepath, encoding='latin1')
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        line = self.data.iloc[idx]
        text = line.SentimentText
        label = line.Sentiment
        return text, label

In [11]:
dataset = TwitterDataset(FILEPATH)

### Разобьём на train / val / test

In [20]:
len(twitter)

1048575

In [28]:
twitter_idxs = list(range(len(twitter)))
np.random.shuffle(twitter_idxs)

TEST_SIZE = 0.3
VAL_SIZE = 0.3

test_size = int(len(twitter) * TEST_SIZE)
val_size = int(len(twitter) * VAL_SIZE)
train_size = len(twitter) - test_size - val_size

train_idxs = twitter_idxs[:train_size]
valid_idxs = twitter_idxs[train_size:train_size+val_size]
test_idxs = twitter_idxs[train_size+val_size:]

assert len(train_idxs) == train_size
assert len(valid_idxs) == val_size, (len(valid_idxs), val_size)
assert len(test_idxs) == test_size, (len(test_idxs), test_size)

print('Train: ', train_size)
print('Valid: ', val_size)
print('Test : ', test_size)

Train:  419431
Valid:  314572
Test :  314572


### Сохраним это и не будем трогать

In [33]:
train_data = twitter.iloc[train_idxs]
valid_data = twitter.iloc[valid_idxs]
test_data = twitter.iloc[test_idxs]

In [34]:
train_data.to_csv('/media/data/nlp/data/twitter_sentiment/twitter_sentiment_train.csv')
valid_data.to_csv('/media/data/nlp/data/twitter_sentiment/twitter_sentiment_valid.csv')
test_data.to_csv('/media/data/nlp/data/twitter_sentiment/twitter_sentiment_test.csv')

In [35]:
train_data = pd.read_csv('/media/data/nlp/data/twitter_sentiment/twitter_sentiment_train.csv')

In [36]:
train_data

Unnamed: 0.1,Unnamed: 0,ItemID,Sentiment,SentimentText
0,893676,893690,0,Im so bored i dunno what to do if anyone sees...
1,816030,816044,0,heading up to work
2,182383,182395,1,@ekaa17 : sa friendster and sa YM but i just ...
3,6533,6536,1,that's good defensw team we outta here baby
4,17698,17710,1,@_bigsteve location sorted sorry for the delay.
5,806056,806070,1,I cant wait for left 4 dead 2
6,349536,349548,0,@Marshmelowsquid Christaaaaaa!! Science is hur...
7,582388,582402,0,Argh. Just realised new Tweetdeck release does...
8,692588,692602,1,@cheydoodle yeees how about youu?
9,379343,379355,1,@pnikoforonda Hi Niko... is billy going here i...


# Новый dataset-класс

In [53]:
import torch

class OneHotDataset(datautils.Dataset):
    def __init__(self, dataframe, alphabet=None, noise_level=0, maxlen=512):
        """
        :param dataframe: pandas dataframe with fields "text": str and "label": int
        """
        if alphabet is None:
            raise NotImplementedError()
        else:
            self.alphabet = alphabet
        self.char2int = {s: i for s, i in zip(self.alphabet, range(len(self.alphabet)))}

        self.maxlen = maxlen
        self.dataframe = dataframe
        self.noise_level = noise_level
        if self.noise_level > 0:
            raise NotImplementedError()

    def __len__(self):
        return len(self.dataframe)
    
    def __getitem__(self, idx):
        line = self.dataframe.iloc[idx]
        text = self._preprocess_text_nobatch(line.text)
        label = line.label
        return text, label

    def _noise_generator(string):
        noised = ""
        for c in string:
            if random() > self.noise_level:
                noised += c
            if random() < self.noise_level:
                noised += choice(self.alphabet)
        return noised

    def _one_hot(self, char):
        zeros = np.zeros(len(self.alphabet))
        if char in self.char2int:
            zeros[self.char2int[char]] = 1.
        else:
            zeros[self.char2int['UNK']] = 1.

    def _preprocess_text_nobatch(self, text):
        one_hotted_text = np.zeros((self.maxlen, len(self.alphabet)))
        for i, char in enumerate(text):
            if i >= self.maxlen:
                break
            one_hotted_text[i, self.char2int.get(char, self.char2int['UNK'])] = 1.
        if i < self.maxlen:
            for j in range(i+1, self.maxlen):
                one_hotted_text[j, self.char2int['PAD']] = 1.

        return torch.FloatTensor(one_hotted_text)

    def onehot2text(self, one_hotted_text):
        text = ''
        _, idx = torch.max(one_hotted_text, 1)
        for i in idx:
            symb = self.alphabet[i]
            if symb == 'PAD':
                break
            else:
                text += symb
        return text


In [39]:
ALPHABET = [' ', 'e', 't', 'a', 'i', 'o', 's', 'n', 'r', 'h', 'l', 'd', 'c', 'm', 'u', 'f', 'g', 'y', 'b', 'w', 'p',\
            '.', 'v', ',', 'k', "'", '/', '>', '<', '-', '"', 'j', 'x', ')', '(', '!', 'z', 'q', '0', '1', '?', ':',\
            '9', '2', '*', ';', '3', '5', '8', '4', '7', '&', '6', 'é', '\x96', '`', '$', '\x85', '_', '%', '=', '#',\
            'UNK', 'PAD']

In [41]:
valid_data.columns = ['ItemID', 'label', 'text']

In [44]:
valid_data.sample()

Unnamed: 0,ItemID,label,text
90785,90797,1,@cmsevert Not a bad breakfast brother... Next ...


In [54]:
dataset = OneHotDataset(valid_data, alphabet=ALPHABET)

In [55]:
dataset[5]

(
     0     0     0  ...      0     1     0
     0     0     0  ...      0     0     0
     0     1     0  ...      0     0     0
        ...          ⋱          ...       
     0     0     0  ...      0     0     1
     0     0     0  ...      0     0     1
     0     0     0  ...      0     0     1
 [torch.FloatTensor of size 512x64], 1)