# Project Phase 2
## Mohammad Amin Rami 98101588
## Milad Heidari 98101469
## Mohammad Reza Safavi 98106701

In [104]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
import os
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import string

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

### A custom dataset class

In [105]:
class MSCTD(Dataset):
    """Sentence/sentiment pairs of one MSCTD split.

    The constructor eagerly reads
    ``<root>/MSCTD/<mode>/texts/english_<mode>.txt`` (one sentence per line)
    and ``sentiment_<mode>.txt`` (one integer label per line). Sentences are
    stored already pre-processed in ``self.texts``, labels in ``self.targets``.

    Args:
        root: Directory that contains the ``MSCTD`` folder.
        mode: Split name; used in both the directory and the file names.
        transform: Optional callable applied to a sentence in ``__getitem__``.
        target_transform: Optional callable applied to a label in
            ``__getitem__``.
    """

    # Lazily built, class-wide cache of the NLTK English stop words. Building
    # it once avoids re-reading the corpus list for every single word.
    _stop_words = None
    # Translation table that deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, root='data', mode='train', transform=None, target_transform=None):
        self.root = root
        self.mode = mode
        self.transform = transform
        self.target_transform = target_transform
        self.data_dir = os.path.join(self.root, 'MSCTD', self.mode, 'texts')
        self.texts = []
        self.targets = []
        self.read_data()

    def read_data(self):
        """Load and pre-process all sentences, then load all integer labels."""
        text_path = os.path.join(self.data_dir, f'english_{self.mode}.txt')
        # Explicit encoding so the result does not depend on the platform default.
        with open(text_path, encoding='utf-8') as file:
            self.texts.extend(MSCTD.pre_processing(sentence) for sentence in file)

        label_path = os.path.join(self.data_dir, f'sentiment_{self.mode}.txt')
        with open(label_path, encoding='utf-8') as file:
            self.targets.extend(int(sentiment.strip()) for sentiment in file)

    @classmethod
    def _english_stop_words(cls):
        """Return the English stop words as a frozenset, loading them once."""
        if cls._stop_words is None:
            cls._stop_words = frozenset(stopwords.words('english'))
        return cls._stop_words

    @staticmethod
    def pre_processing(sentence: str) -> str:
        """Normalise one raw sentence.

        Strips surrounding whitespace, lower-cases, removes ASCII punctuation,
        drops English stop words and joins the remaining words with single
        spaces.
        """
        sentence = sentence.strip().lower().translate(MSCTD._PUNCTUATION_TABLE)
        stop_words = MSCTD._english_stop_words()
        # split() without an argument also discards the empty tokens that are
        # left behind when a stand-alone punctuation mark has been removed.
        return " ".join(word for word in sentence.split() if word not in stop_words)

    def __len__(self):
        """Number of samples, taken from the number of labels."""
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(text, target)`` for ``index`` after the optional transforms."""
        text = self.texts[index]
        target = self.targets[index]
        if self.transform is not None:
            text = self.transform(text)

        if self.target_transform is not None:
            target = self.target_transform(target)

        return (text, target)


In [106]:
# Load the default ('train') split and the 'test' split from the default root.
train_data = MSCTD()
test_data = MSCTD(mode='test')

In [4]:
# Learn the TF-IDF vocabulary from the training sentences only.
vectorizer = TfidfVectorizer()
vectorizer.fit(train_data.texts)


def _tfidf_vector(text):
    """Return the dense, flattened TF-IDF vector of ``text`` as a tensor on ``device``."""
    dense = vectorizer.transform([text]).toarray()
    return torch.Tensor(dense).to(device).reshape((-1, ))


# Both splits share the same fitted vectorizer.
train_data.transform = _tfidf_vector
test_data.transform = _tfidf_vector

In [5]:
# Mini-batch loaders; only the training set is shuffled.
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_data, batch_size=batch_size)

### TF-IDF-based classification of sentences

In [6]:
class MLP(nn.Module):
    """Feed-forward classifier: input_dim -> 1024 -> 128 -> 3 logits.

    Each hidden linear layer is followed by dropout (default probability)
    and a ReLU; the final linear layer emits the raw scores.
    """

    def __init__(self, input_dim):
        super().__init__()
        widths = (input_dim, 1024, 128)
        layers = []
        # Hidden blocks are created in order so the Sequential indices stay 0..5.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.Dropout(), nn.ReLU()]
        layers.append(nn.Linear(widths[-1], 3))
        self.stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return the three class logits for ``x``."""
        logits = self.stack(x)
        return logits



In [7]:
# The MLP input width equals the number of features the fitted vectorizer produces.
input_dim = len(vectorizer.get_feature_names_out())
model = MLP(input_dim=input_dim).to(device)

In [8]:
# Switch the model to training mode; the call returns the module, hence the printed repr.
model.train()

MLP(
  (stack): Sequential(
    (0): Linear(in_features=9416, out_features=1024, bias=True)
    (1): Dropout(p=0.5, inplace=False)
    (2): ReLU()
    (3): Linear(in_features=1024, out_features=128, bias=True)
    (4): Dropout(p=0.5, inplace=False)
    (5): ReLU()
    (6): Linear(in_features=128, out_features=3, bias=True)
  )
)

In [9]:
# Optimisation settings for the TF-IDF MLP: Adam on a cross-entropy objective.
learning_rate = 1e-4
epochs = 2
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Yields ``(X, y)`` batches; ``dataloader.dataset`` must
            support ``len`` and ``dataloader.batch_size`` must be set.
        model: Module called as ``model(X)``; left in training mode.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer over the model's parameters.

    Prints the current loss roughly five times per pass.
    """
    size = len(dataloader.dataset)
    # Reporting interval in batches. Clamp to 1 so that small datasets do not
    # make the modulo below divide by zero.
    k = max(1, int(size / dataloader.batch_size / 5))

    # Feed the data to whatever device the model lives on instead of relying
    # on a notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    # Make sure dropout and similar layers are in training mode, whatever an
    # earlier evaluation pass left behind.
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        X = X.to(run_device)
        y = y.to(run_device)
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % k == 0:
            current = batch * len(X)
            print(f"loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X = X.to(device)
            y = y.to(device)
            pred = model(X.float())
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [11]:
# Train and evaluate the TF-IDF MLP once per epoch.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}\n-------------------------------")
    try:
        train_loop(train_loader, model, loss_fn, optimizer)
        test_loop(test_loader, model, loss_fn)
    except KeyboardInterrupt:
        print('Training interrupted')
        # Stop the whole run; without this the next epoch would start at once.
        break
print("Done!")

Epoch 1
-------------------------------
loss: 1.091187  [    0/20240]
loss: 1.102074  [ 4032/20240]
loss: 1.069897  [ 8064/20240]
loss: 1.082272  [12096/20240]
loss: 1.125325  [16128/20240]
loss: 1.066297  [20160/20240]
Test Error: 
 Accuracy: 42.7%, Avg loss: 1.077940 

Epoch 2
-------------------------------
loss: 1.077286  [    0/20240]
loss: 1.052224  [ 4032/20240]
loss: 1.021806  [ 8064/20240]
loss: 0.996611  [12096/20240]
loss: 1.026363  [16128/20240]
loss: 1.021116  [20160/20240]
Test Error: 
 Accuracy: 50.8%, Avg loss: 1.007037 

Done!


### Classification using GloVe vectorization 

In [107]:
class LSTMClassifier(nn.Module):
    """Single-layer LSTM followed by a linear layer producing three scores.

    ``input_size`` and ``hidden_size`` are kept as attributes so that callers
    can build matching inputs and initial states.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size)
        self.linear = nn.Linear(in_features=hidden_size, out_features=3)

    def forward(self, x, h):
        """Run the LSTM from state ``h`` and score every output step.

        Returns the linear layer applied to the LSTM output together with
        the new ``(hidden, cell)`` state.
        """
        features, state = self.lstm(x, h)
        return self.linear(features), state



In [88]:
# Map every GloVe token to its embedding, stored as a (1, dim) row on `device`.
glove_dict = {}
# The vocabulary contains non-ASCII tokens, so do not rely on the platform
# default encoding.
with open('GloVe/glove.6B.50d.txt', encoding='utf-8') as file:
    for line in file:
        word, *values = line.rstrip().split(' ')
        if not values:
            # Blank or malformed line: an empty vector cannot be reshaped to (1, -1).
            continue
        tensor = torch.tensor(
            [float(num) for num in values], dtype=torch.float32, device=device
        )
        glove_dict[word] = tensor.reshape((1, -1))


In [121]:
def sentence2tensor(sentence):
    """Stack the GloVe rows of the known words of ``sentence``.

    Words are obtained by splitting on single spaces; words missing from
    ``glove_dict`` are skipped. When no word is known, a zero tensor shaped
    like the embedding of ``'the'`` is returned instead.
    """
    embeddings = [
        glove_dict[token] for token in sentence.split(' ') if token in glove_dict
    ]
    if not embeddings:
        return torch.zeros_like(glove_dict['the'])
    return torch.cat(embeddings, dim=0)

# Switch both datasets to GloVe embeddings; this replaces any transform set earlier.
train_data.transform = sentence2tensor
test_data.transform = sentence2tensor

In [139]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass of a recurrent classifier, one sentence per batch.

    Args:
        dataloader: Yields ``(X, y)`` with ``X`` reshaped below to
            ``(seq_len, model.input_size)``, so it must hold exactly one
            sequence per batch.
        model: Module exposing ``hidden_size`` and ``input_size`` and called
            as ``model(x, (h, c))``, returning ``(per-step scores, state)``.
        loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
        optimizer: Optimizer over the model's parameters.

    The loss is computed on the scores of the last time step only. Prints the
    current loss roughly five times per pass.
    """
    size = len(dataloader.dataset)
    # Reporting interval in batches. Clamp to 1 so that small datasets do not
    # make the modulo below divide by zero.
    k = max(1, int(size / dataloader.batch_size / 5))

    # Use the device the model lives on rather than a notebook-level global.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()

    for batch, (X, y) in enumerate(dataloader):
        # Fresh zero (hidden, cell) state for every sentence.
        hidden_state = (
            torch.zeros((1, model.hidden_size), device=run_device),
            torch.zeros((1, model.hidden_size), device=run_device),
        )
        seq_len = X.shape[1]
        y = y.to(run_device)
        X = X.reshape((seq_len, model.input_size)).to(run_device)

        # One call over the whole sequence is equivalent to stepping through
        # it token by token while threading the state; keep the last step.
        outputs, _ = model(X, hidden_state)
        pred = outputs[-1:]

        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % k == 0:
            print(f"loss: {loss.item():>7f}  [{batch:>5d}/{size:>5d}]")

def test_loop(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            hidden_state = (torch.zeros((1, model.hidden_size)).to(device), torch.zeros((1, model.hidden_size)).to(device))
            y = y.to(device)
            seq_len = X.shape[1]
            X = X.reshape((seq_len, model.input_size))
            for i in range(seq_len):
                x = torch.reshape(X[i, :], (1, -1)).to(device)
                pred, hidden_state = model(x, hidden_state)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")


In [140]:
# One sentence per batch: the recurrent loops reshape each batch to
# (seq_len, input_size), which only works for a single sequence.
train_loader = DataLoader(train_data, batch_size=1, shuffle=True)
test_loader = DataLoader(test_data, batch_size=1)

In [141]:
# input_size=50 (presumably the width of the glove.6B.50d vectors), hidden_size=64.
rnn_model = LSTMClassifier(50, 64).to(device)
rnn_model.train()

LSTMClassifier(
  (lstm): LSTM(50, 64)
  (linear): Linear(in_features=64, out_features=3, bias=True)
)

In [142]:
# Optimisation settings for the LSTM classifier: Adam on a cross-entropy objective.
learning_rate = 1e-4
epochs = 10
loss_fn = nn.CrossEntropyLoss()
rnn_optimizer = torch.optim.Adam(rnn_model.parameters(), lr=learning_rate)

In [143]:
# Train and evaluate the LSTM classifier once per epoch.
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}\n-------------------------------")
    try:
        train_loop(train_loader, rnn_model, loss_fn, rnn_optimizer)
        test_loop(test_loader, rnn_model, loss_fn)
    except KeyboardInterrupt:
        # A manual interrupt ends the whole run, not just the current epoch.
        print('Training interrupted')
        break
print("Done!")

Epoch 1
-------------------------------
loss: 1.129162  [    0/20240]
loss: 0.999080  [ 4048/20240]
loss: 1.110698  [ 8096/20240]
loss: 0.761817  [12144/20240]
loss: 0.969027  [16192/20240]
Test Error: 
 Accuracy: 48.5%, Avg loss: 1.022370 

Epoch 2
-------------------------------
loss: 1.014664  [    0/20240]
loss: 1.017623  [ 4048/20240]
loss: 1.607960  [ 8096/20240]
loss: 1.056317  [12144/20240]
loss: 1.308104  [16192/20240]
Test Error: 
 Accuracy: 48.8%, Avg loss: 1.014362 

Epoch 3
-------------------------------
loss: 1.167966  [    0/20240]
loss: 1.574766  [ 4048/20240]
loss: 0.604439  [ 8096/20240]
loss: 1.181397  [12144/20240]
loss: 0.954669  [16192/20240]
Test Error: 
 Accuracy: 50.4%, Avg loss: 1.001538 

Epoch 4
-------------------------------
loss: 1.524819  [    0/20240]
loss: 0.403674  [ 4048/20240]
loss: 1.340206  [ 8096/20240]
loss: 0.786581  [12144/20240]
loss: 1.451468  [16192/20240]
Test Error: 
 Accuracy: 51.3%, Avg loss: 0.990702 

Epoch 5
------------------------

### Classification using BERT

In [144]:
from transformers import BertForSequenceClassification