# Sentiment Classification Project

In [1]:
!pip install -r reqs.txt

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import pytorch_lightning as L
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

import torch
import torch.nn as nn

from dataset.twitter_dataset import TwitterDataModule
from recipes.sentiment_analysis import SentimentAnalysisNet

In [2]:
import pytorch_lightning as L

import torch
from torch.utils.data import Dataset, DataLoader

import numpy as np
from typing import Callable, Dict

from scipy.sparse._csr import csr_matrix

# Integer class labels for the binary sentiment target.
NEGATIVE = 0
POSITIVE = 1

class TwitterDataModule(L.LightningDataModule):
    """Lightning data module for binary tweet sentiment classification.

    Tweets are read from three text files (one tweet per line): positive
    training tweets, negative training tweets and tweets to predict on.
    Raw tweets are optionally tokenized, then turned into features by a
    user-supplied callable.

    Args:
        path_train_pos: Path of the file holding the positive training tweets.
        path_train_neg: Path of the file holding the negative training tweets.
        path_predict: Path of the file holding the tweets to predict on.
        convert_to_features: Callable mapping a collection of (tokenized)
            tweets to features. A scipy ``csr_matrix`` result is converted to
            a dense float tensor; any other result is used as returned.
        convert_to_features_kwargs: Extra keyword arguments forwarded to
            ``convert_to_features``.
        tokenizer: Optional callable applied to the tweets before
            ``convert_to_features``.
        tokenizer_kwargs: Extra keyword arguments forwarded to ``tokenizer``.
        val_percentage: Fraction of the training samples held out for validation.
        batch_size: Batch size used by every dataloader.
    """

    def __init__(
        self,
        path_train_pos: str,
        path_train_neg: str,
        path_predict: str,
        convert_to_features: Callable,
        convert_to_features_kwargs: Dict=None,
        tokenizer: Callable=None,
        tokenizer_kwargs: Dict=None,
        val_percentage: float=0.1,
        batch_size: int=32,
    ) -> None:
        super().__init__()
        self.path_train_pos = path_train_pos
        self.path_train_neg = path_train_neg
        self.path_predict = path_predict
        self.val_percentage = val_percentage
        self.convert_to_features = convert_to_features
        # Fall back to fresh dicts so a None default never has to be unpacked.
        self.convert_to_features_kwargs = convert_to_features_kwargs or {}
        self.tokenizer = tokenizer
        self.tokenizer_kwargs = tokenizer_kwargs or {}
        self.batch_size = batch_size

    @property
    def train_corpus(self):
        """Raw training tweets as an array: positive tweets followed by negative ones.

        The files are re-read from disk on every access.
        """
        positive = self._load_tweets(self.path_train_pos)
        negative = self._load_tweets(self.path_train_neg)
        return np.array(positive + negative)

    @property
    def predict_corpus(self):
        """Raw tweets to predict on, as an array (re-read from disk on every access)."""
        return np.array(self._load_tweets(self.path_predict))

    def setup(self, stage: str=None) -> None:
        """Load tweets from disk, build features and split train/val.

        Args:
            stage: ``"fit"`` builds ``train_data`` and ``val_data``;
                ``"predict"`` builds ``test_data``; ``None`` builds both.
                Any other stage builds nothing.

        ``self.dims`` is set to ``(batch_size, *feature_shape[1:])`` whenever
        features were built; when both stages run, the predict features win.
        """
        features = None

        if stage is None or stage == "fit":
            positive = self._load_tweets(self.path_train_pos)
            negative = self._load_tweets(self.path_train_neg)
            features = self._featurize(np.array(positive + negative))

            # Labels follow the corpus order: all positives, then all negatives.
            labels = torch.tensor(
                [POSITIVE] * len(positive) + [NEGATIVE] * len(negative),
                dtype=torch.float,
            ).unsqueeze(1)

            # Train/val split. A local, fixed-seed generator gives the same
            # permutation as seeding the global NumPy RNG with 1, without
            # resetting the global random state for the rest of the program.
            n_samples = features.shape[0]
            shuffled_indices = np.random.RandomState(1).permutation(n_samples)
            split = int((1 - self.val_percentage) * n_samples)
            train_indices = shuffled_indices[:split]
            val_indices = shuffled_indices[split:]

            self.train_data = _Dataset(features[train_indices], labels[train_indices])
            self.val_data = _Dataset(features[val_indices], labels[val_indices])

        if stage is None or stage == "predict":
            # Run the predict tweets through the same tokenizer / feature
            # pipeline (including kwargs and float cast) as the training data.
            # NOTE(review): if convert_to_features fits something on its input
            # (vocabulary, embeddings), it is re-fitted here on the predict
            # corpus -- confirm that this is intended.
            features = self._featurize(self.predict_corpus)
            self.test_data = features

        if features is not None:
            # Save input dimensions; skipped for stages that built no features.
            self.dims = (self.batch_size, *(features.shape[1:]))

    def train_dataloader(self):
        """Dataloader over the training split.

        Batches are served in a fixed order; the only shuffling is the
        permutation applied once in ``setup``.
        """
        return DataLoader(self.train_data, self.batch_size)

    def val_dataloader(self):
        """Dataloader over the validation split."""
        return DataLoader(self.val_data, self.batch_size)

    def predict_dataloader(self):
        """Dataloader over the prediction features (no labels)."""
        return DataLoader(self.test_data, self.batch_size)

    def _featurize(self, tweets):
        """Tokenize (if a tokenizer is set) and convert tweets to features."""
        if self.tokenizer is not None:
            tweets = self.tokenizer(tweets, **self.tokenizer_kwargs)
        features = self.convert_to_features(tweets, **self.convert_to_features_kwargs)
        if isinstance(features, csr_matrix):  # e.g. CountVectorizer output
            features = torch.from_numpy(features.toarray()).float()
        # Otherwise the features are used exactly as returned.
        return features

    def _load_tweets(self, path: str):
        """Read a UTF-8 text file and return its lines with trailing whitespace removed."""
        with open(path, 'r', encoding='utf-8') as f:
            return [line.rstrip() for line in f]
    
class _Dataset(Dataset):
    """Map-style dataset pairing each feature row with its label row."""

    def __init__(self, X, y):
        # X and y are indexed in lockstep along their first axis.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the first dimension of the features."""
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, i):
        """Return the ``(features, label)`` pair stored at index ``i``."""
        sample = self.X[i]
        target = self.y[i]
        return sample, target

In [3]:
from preprocessing.tokenize import Tokenizer
from preprocessing.embeddings import create_w2v_embeddings

batch_size = 32
embedding_dim = 300

# Data module over the full Twitter training files and the test file.
# The kwargs dict is forwarded as-is to create_w2v_embeddings
# (presumably word2vec training options -- confirm in preprocessing.embeddings).
dm = TwitterDataModule(
    path_train_pos="twitter-datasets/train_pos_full.txt",
    path_train_neg="twitter-datasets/train_neg_full.txt",
    path_predict="twitter-datasets/test_data.txt",
    convert_to_features=create_w2v_embeddings,
    convert_to_features_kwargs=dict(
        workers=8,
        vector_size=embedding_dim,
        min_count=1,
        window=5,
        sample=1e-3,
    ),
    tokenizer=Tokenizer(),
    batch_size=batch_size,
)

In [None]:
# Build the train/val datasets up front so dm.dims is available for sizing the model.
dm.setup("fit")
print(dm.dims)

In [None]:
class RNNClassifier(nn.Module):
    """Recurrent encoder followed by a classification head.

    Args:
        rnn: Recurrent module whose call returns ``(outputs, state)``;
            ``outputs`` is indexed as (batch, time, features), i.e. the
            module is assumed to be batch-first.
        classifier: Module applied to the features of the last time step.
    """

    def __init__(
        self,
        rnn: nn.Module,
        classifier: nn.Module,
    ):
        super().__init__()
        self.rnn, self.classifier = rnn, classifier

    def forward(self, x):
        """Encode ``x`` with the RNN and classify the final time step.

        ``x`` is expected as (batch_size, max_seq_len, embedding_size). The
        returned shape is whatever ``classifier`` produces for a
        (batch_size, hidden_size) input.
        """
        outputs, _state = self.rnn(x)  # (batch_size, max_seq_len, hidden_size)
        # Keep only the output at the last time step of every sequence.
        final_step = outputs[:, -1, :]
        return self.classifier(final_step)
    
# Width of the LSTM hidden state. It is shared by the LSTM and by the first
# Linear layer of the head so the two sizes cannot drift apart.
hidden_size = 64

# Two-layer batch-first LSTM over the embedded tweets, followed by a small MLP
# head ending in a sigmoid, i.e. one value in (0, 1) per sample.
model = RNNClassifier(
    rnn=nn.LSTM(
        input_size=dm.dims[-1],  # feature size reported by the data module
        hidden_size=hidden_size,
        num_layers=2,
        batch_first=True,
        dropout=0.1,
    ),
    classifier=nn.Sequential(
        nn.Linear(hidden_size, 16),
        nn.BatchNorm1d(16),
        nn.ReLU(),
        nn.Linear(16, 8),
        nn.BatchNorm1d(8),
        nn.ReLU(),
        nn.Linear(8, 1),
        nn.Sigmoid(),
    ),
)

In [None]:
# Wrap the network in the project's SentimentAnalysisNet with a learning rate of 0.01.
# NOTE(review): 0.01 is what 10e-3 evaluates to -- confirm 1e-3 was not intended.
net = SentimentAnalysisNet(model, lr=1e-2)

In [None]:
# Trainer with default settings apart from the epoch budget.
# No logger and no callbacks are attached here.
trainer = L.Trainer(max_epochs=5)

In [62]:
# Run the training loop; the train/val batches are supplied by the data module.
print("start training...")
trainer.fit(net, datamodule=dm)
print("done!")

start training...


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type           | Params
---------------------------------------------
0 | model     | RNNClassifier  | 102 K 
1 | criterion | BCELoss        | 0     
2 | accuracy  | BinaryAccuracy | 0     
---------------------------------------------
102 K     Trainable params
0         Non-trainable params
102 K     Total params
0.410     Total estimated model params size (MB)
SLURM auto-requeueing enabled. Setting signal handlers.


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


done!
