In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import re
import sys
import spacy
import pickle
import numpy as np

from glob import glob
import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

from utils import *
from data_utils import Vocabulary, tokenizer
from train_utils import train

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy
# setup
NLP = spacy.load('en_core_web_sm')  # NLP toolkit

In [2]:

story = pd.read_csv('./blogtext_short.csv')
story.head()

Unnamed: 0.1,Unnamed: 0,sample,id,sign,text
0,1,1,589736,Aries,Much funny. 2 points. As mentioned in the...
1,2,2,589736,Aries,"Harpers, Harpers, everywhere. Harpers, Har..."
2,3,3,589736,Aries,"In an earlier post, Johnathan said: 'And ..."
3,4,4,589736,Aries,"I'd post this on the RTG Blog, but I can't..."
4,5,5,589736,Aries,The answer to the first question lies with ...


In [3]:
# story['id'].value_counts()
# story.dtypes
story['id'] = story['id'].astype(str)
# story.dtypes

In [4]:
PAD = '<pad>'  # special symbol we use for padding text
UNK = '<unk>'  # special symbol we use for rare or unknown word
# parameters
max_len = 200
min_count = 10
batch_size = 50

In [5]:
class TextClassificationDataset():
    
    # def __init__(self, split='train',  vocab_path='vocab.pkl', max_len=100, min_count=10, data, tokenizer=None):   
    def __init__(self, split,  vocab_path, max_len, min_count, data, tokenizer):
        # self.path = path
        # assert split in ['train', 'test']
        self.split = split
        self.data = data

        self.vocab_path = vocab_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.min_count = min_count
        
        self.cache = {}
        self.vocab = None
        
        self.classes = []
        self.class_to_index = {}
        self.text_files = []
        
#         split_path = f'{path}/{split}'
        
        
        X_train, X_test, y_train, y_test = train_test_split(data['text'], data['id'], test_size=0.2)
        if split == 'test':
            red_data = pd.concat([X_test, y_test], axis=1)
       
        else:
            red_data = pd.concat([X_train, y_train], axis=1)
        


        for cls_idx, label in enumerate(['449628','734562','589736']):  # three labels
            text_files = [(fname, cls_idx) for fname in red_data.loc[red_data['id'] == label, 'text']]
            self.text_files += text_files
            self.classes += [label]
            self.class_to_index[label] = cls_idx
        
        self.num_classes = len(self.classes)
            
        # build vocabulary from training and validation texts
        self.build_vocab()
        
        
    def __getitem__(self, index):
        # read the tokenized text file and its label (neg=0, pos=1)
        fname, class_idx = self.text_files[index]
        
        if fname in self.cache:
            return self.cache[fname], class_idx
        
        # read text file 
        text = fname
        
        # tokenize the text file
        tokens = self.tokenizer(text.lower().strip())
        
        # padding and trimming
        if len(tokens) < self.max_len:
            num_pads = self.max_len - len(tokens)
            tokens = [PAD] * num_pads + tokens
        elif len(tokens) > self.max_len:
            tokens = tokens[:self.max_len]
            
        # numericalizing
        ids = torch.LongTensor(self.max_len)
        for i, word in enumerate(tokens):
            if word not in self.vocab.word2index:
                ids[i] = self.vocab.word2index[UNK]  # unknown words
            elif word != PAD and self.vocab.word2count[word] < self.min_count:
                ids[i] = self.vocab.word2index[UNK]  # rare words
            else:
                ids[i] = self.vocab.word2index[word]
                
        # save in cache for future use
        self.cache[fname] = ids
        
        return ids, class_idx
    
    def __len__(self):
        return len(self.text_files)
    
    def build_vocab(self):
        if not os.path.exists(self.vocab_path):
            vocab = Vocabulary(self.tokenizer)

            for line in self.data['text']:
                vocab.add_sentence(line.lower())

            # sort words by their frequencies
            words = [(0, PAD), (0, UNK)]
            words += sorted([(c, w) for w, c in vocab.word2count.items()], reverse=True)

            self.vocab = Vocabulary(self.tokenizer)
            for i, (count, word) in enumerate(words):
                self.vocab.word2index[word] = i
                self.vocab.word2count[word] = count
                self.vocab.index2word[i] = word
                self.vocab.count += 1

            pickle.dump(self.vocab, open(self.vocab_path, 'wb'))
        else:
            self.vocab = pickle.load(open(self.vocab_path, 'rb'))


In [6]:
train_ds = TextClassificationDataset(split='train', vocab_path='vocab_class.pkl', max_len=max_len, min_count=min_count, data=story, tokenizer=tokenizer)
                 
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [7]:
valid_ds = TextClassificationDataset(split='test', vocab_path='vocab_class.pkl', max_len=max_len, min_count=min_count, data=story, tokenizer=tokenizer)

valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

In [8]:
len(train_ds)

7052

In [9]:
len(valid_ds)

1764

In [10]:
train_ds.classes

['449628', '734562', '589736']

In [11]:
train_ds.class_to_index

{'449628': 0, '734562': 1, '589736': 2}

In [12]:
ids, label = train_ds[0]
print(train_ds.classes[label])
print(ids)

449628
tensor([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   18,  191,   10,    1,   10,    5,  174,   15,  217, 1030,
          13,   17,   50, 1916,    5,  174,   15, 2858,    4, 1481,   15,  121,
         619,  620,   97,   17,   76,    5,   10,    1,   10, 4432,  145,  380,
          39,   92,   77,   15, 1251,    2, 2858,  429, 5038,   61,   15, 3596,
         115,   15,   79, 1656,  

  ids = torch.LongTensor(self.max_len)


In [13]:
# convert back the sequence of integers into original text
print(' '.join([train_ds.vocab.index2word[i.item()] for i in ids]))

<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> urllink use ' <unk> ' to help you change careers it 's an exercise to help you identify the skills you most enjoy using here 's how to ' <unk> ' gather two friends or other people you trust . identify several instances when you achieved something you were proud of . write down those experiences . then examine them to <unk> what skills you used to achieve those <unk> . by sharing ideas with your two friends , 

In [14]:
# print the original text
print(train_ds.text_files[0][0])

         urlLink Use 'Trioing' to Help You Change Careers    It's an exercise to help you identify the skills you most enjoy using: Here's how to 'trio': Gather two friends or other people you trust.  Identify several instances when you achieved something you were proud of. Write down those experiences. Then examine them to pinpoint what skills you used to achieve those successes.   By sharing ideas with your two friends, you can uncover skills that you didn't realize you had. Your friends can help confirm your strengths and skills too.     


In [15]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embed_size
        self.num_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
                        
    def forward(self, x):
        # x = [seq_len, batch_size]
        embedded = self.dropout(self.embedding(x))
        # embedded = [seq_len, batch_size, embedding_dim]
        output, (hidden, cell) = self.lstm(embedded)
        # output = [seq_len, batch_size, hidden_dim * num_directions]
        # hidden = [num_layers * num_directions, batch_size, hidden_dim]
        # cell = [num_layers * num_directions, batch_size, hidden_dim]
        if self.lstm.bidirectional:
            hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
            # hidden = [batch_size, hidden_dim * num_directions]
        else:
            hidden = self.dropout(hidden[-1,:,:])
            # hidden = [batch_size, hidden_dim]
        return self.fc(hidden)

In [16]:
train_ds.num_classes

3

In [17]:
vocab_size = 2 + len([w for (w, c) in train_ds.vocab.word2count.items() if c >= min_count])
print(vocab_size)

5226


In [18]:
# Define hyperparameters
embed_size = 100
embedding_dim = 100
hidden_dim = 128
output_dim = train_ds.num_classes
n_layers = 1
bidirectional = True
dropout = 0.5
lr = 0.001
num_epochs = 5

# Define the model
model = LSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)



In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [20]:
criterion = nn.CrossEntropyLoss().to(device)
criterion = criterion.to(device)
    
optimizer = optim.Adam(model.parameters(), lr=lr, betas=(0.7, 0.99))
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [21]:
hist = train(model, train_dl, valid_dl, criterion, optimizer, device, scheduler, num_epochs)

[Epoch:  4/ 5] | Training Loss: 0.0135 | Testing Loss: 0.0143 | Training Acc: 65.01 | Testing Acc: 64.46



Training:   0%|          | 0/142 [00:00<?, ?it/s]

Validation:   0%|          | 0/36 [00:00<?, ?it/s]

[Epoch:  5/ 5] | Training Loss: 0.0129 | Testing Loss: 0.0136 | Training Acc: 66.44 | Testing Acc: 64.40

