# Sentiment Classification Project

In [None]:
# !pip install -r reqs.txt

In [5]:
### Load data

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

import numpy as np
from scipy.sparse._csr import csr_matrix

from typing import Callable, Dict

from tqdm import tqdm

NEGATIVE = 0
POSITIVE = 1

class TwitterDataModule:
    def __init__(
        self,
        path_train_pos: str,
        path_train_neg: str,
        path_predict: str,
        convert_to_features: Callable,
        convert_to_features_kwargs: Dict=None,
        tokenizer: Callable=None,
        tokenizer_kwargs: Dict=None,
        val_percentage: float=0.1,
        batch_size: int=32,
    ) -> None:
        super().__init__()
        self.path_train_pos = path_train_pos
        self.path_train_neg = path_train_neg
        self.path_predict = path_predict
        self.val_percentage = val_percentage
        self.convert_to_features = convert_to_features
        self.convert_to_features_kwargs = convert_to_features_kwargs or {}
        self.tokenizer = tokenizer
        self.tokenizer_kwargs = tokenizer_kwargs or {}
        self.batch_size = batch_size
        
    def setup(self, stage: str=None) -> None:
        """Recovers data from disk and performs train/val split"""
        if stage is None or stage == "fit":
            positive = self._load_tweets(self.path_train_pos)[:10]
            negative = self._load_tweets(self.path_train_neg)[:10]
            tweets = np.array(positive + negative)
            if self.tokenizer is not None:
                tweets = self.tokenizer(tweets, **self.tokenizer_kwargs)
            tweets = self.convert_to_features(tweets, **self.convert_to_features_kwargs) 
            if isinstance(tweets, csr_matrix): # CountVectorizer
                tweets = torch.from_numpy(tweets.todense()).float()
            # else: tweets: torch.tensor

            labels = torch.tensor([POSITIVE] * len(positive) + [NEGATIVE] * len(negative), dtype=torch.float).unsqueeze(1)

            # train, val split
            np.random.seed(1) # reproducibility
            shuffled_indices = np.random.permutation(tweets.shape[0])
            split = int((1 - self.val_percentage) * tweets.shape[0])
            train_indices = shuffled_indices[:split]
            val_indices = shuffled_indices[split:]

            self.train_data = _Dataset(tweets[train_indices], labels[train_indices])
            self.val_data =  _Dataset(tweets[val_indices], labels[val_indices])
            
        if stage is None or stage == "predict":
            test = self.predict_corpus
            tweets = self.convert_to_features(np.array(test))
            if isinstance(tweets, csr_matrix): # CountVectorizer
                tweets = torch.from_numpy(tweets.todense())
            self.test_data = tweets

        self.dims = (self.batch_size, *(tweets.shape[1:])) # save input dimensions
    
    def train_dataloader(self):
        return  DataLoader(self.train_data, self.batch_size)
    
    def val_dataloader(self):
        return DataLoader(self.val_data, self.batch_size)
    
    def predict_dataloader(self):
        return DataLoader(self.test_data, self.batch_size)
    
    def _load_tweets(self, path: str):
        tweets = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                tweets.append(line.rstrip())
        return tweets
    
class _Dataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, i):
        return self.X[i], self.y[i]

In [21]:
batch_size = 5
embedding_dim = 300
    
from string import punctuation 
from preprocessing.embeddings import create_w2v_embeddings

translator = str.maketrans('','', punctuation)
dm = TwitterDataModule(
    "twitter-datasets/train_pos_full.txt",
    "twitter-datasets/train_neg_full.txt",
    "twitter-datasets/test_data.txt",
    convert_to_features=create_w2v_embeddings,
    convert_to_features_kwargs={
        "workers": 8,
        "vector_size": embedding_dim,
        "min_count": 1,
        "window": 5,
        "sample": 1e-3,
    },
    tokenizer=lambda x: [tweet.translate(translator).split() for tweet in x],
    batch_size=batch_size,
)

# Run datamodule to check input dimensions
dm.setup(stage="fit")
print(dm.dims)

1250000it [00:00, 1538534.22it/s]
1250000it [00:00, 2080840.71it/s]


(5, 29, 300)


In [None]:
### Define model

In [22]:
import torch.nn as nn
from models.rnn import RNNClassifier

model = RNNClassifier(
    rnn=nn.LSTM(
        input_size=dm.dims[-1],
        hidden_size=64,
        num_layers=2,
        batch_first=True,
        dropout=0.1,
    ),
    classifier=nn.Sequential(
        nn.Linear(64, 16),
        nn.BatchNorm1d(16),
        nn.ReLU(),
        nn.Linear(16, 8),
        nn.BatchNorm1d(8),
        nn.ReLU(),
        nn.Linear(8, 1),
        nn.Sigmoid(),
    )
)

In [30]:
### Training Loop
EPOCHS = 20

def train_one_epoch(
    train_dataloader,
    model,
    loss_fn,
    optimizer
):
    running_loss = 0.
    last_loss = 0.

    # Here, we use enumerate(training_loader) instead of iter(training_loader) 
    # so that we can track the batch index and do some intra-epoch reporting
    for i, data in enumerate(train_dataloader):
        inputs, labels = data
        outputs = model(inputs)

        # Compute the loss and its gradients
        loss = loss_fn(outputs, labels)
        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i%100 == 99:
            last_loss = running_loss / 99 # average loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            running_loss = 0.
    return last_loss

def run_epoch(
    train_dataloader,
    val_dataloader,
    model,
    loss_fn,
    optimizer
):
    model.train(True)
    avg_loss = train_one_epoch(train_dataloader, model, loss_fn, optimizer)
    
    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()
    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        correct = 0
        tot_vsamples = 0
        for i, vdata in enumerate(val_dataloader):
            vinputs, vlabels = vdata
            voutputs = model(vinputs)
            vloss = loss_fn(voutputs, vlabels)
            running_vloss += vloss
            
            tot_vsamples += len(vlabels)
            correct += torch.sum((voutputs.round() == vlabels).float())
            
    avg_vloss = running_vloss / (i + 1)
    avg_vacc = correct / tot_vsamples
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
    print('ACC valid {}'.format(avg_vacc))
            

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch + 1))
    run_epoch(
        dm.train_dataloader(),
        dm.val_dataloader(),
        model,
        nn.BCELoss(),
        optimizer
    )

EPOCH 1:
LOSS train 0.0 valid 1.0790126323699951
ACC valid 0.5
EPOCH 2:
LOSS train 0.0 valid 0.7644025683403015
ACC valid 0.5
EPOCH 3:
LOSS train 0.0 valid 0.7912778854370117
ACC valid 0.5
EPOCH 4:
LOSS train 0.0 valid 0.8006210327148438
ACC valid 0.5
EPOCH 5:
LOSS train 0.0 valid 0.8012359142303467
ACC valid 0.5
EPOCH 6:
LOSS train 0.0 valid 0.7070640325546265
ACC valid 0.5
EPOCH 7:
LOSS train 0.0 valid 0.7152944803237915
ACC valid 0.5
EPOCH 8:
LOSS train 0.0 valid 0.7030220627784729
ACC valid 0.5
EPOCH 9:
LOSS train 0.0 valid 0.7448733448982239
ACC valid 0.5
EPOCH 10:
LOSS train 0.0 valid 0.6974402666091919
ACC valid 0.5
EPOCH 11:
LOSS train 0.0 valid 0.6943747997283936
ACC valid 0.5
EPOCH 12:
LOSS train 0.0 valid 0.694464921951294
ACC valid 0.5
EPOCH 13:
LOSS train 0.0 valid 0.6931994557380676
ACC valid 0.5
EPOCH 14:
LOSS train 0.0 valid 0.6944620013237
ACC valid 0.5
EPOCH 15:
LOSS train 0.0 valid 0.6944003105163574
ACC valid 0.5
EPOCH 16:
LOSS train 0.0 valid 0.7304163575172424
ACC