# Text Classification

## Environment
You will only use Python 3.7 and PyTorch 1.9, which is already available on Colab


## Part 0. Checking GPU
In this section, you will make sure you are using the GPU of google colab

In [1]:
from platform import python_version
import torch

print("python", python_version())
print("torch", torch.__version__)

python 3.10.12
torch 2.1.0+cu118


In [2]:
#check GPU
is_cuda = torch.cuda.is_available()

# If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU is available


## Part 1. Downloading Dataset
In this section, you will download Stanford Sentiment Treebank (SST), a popular dataset for sentiment classification

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, huggingface-hub, datasets
Successfully installed datasets-2.14.

Download SST and print the first example:

In [4]:
from datasets import load_dataset
from pprint import pprint

sst_dataset = load_dataset('sst') #download sst dataset
pprint(sst_dataset['train'][0]) #printing the first (sentence,label) example in the dataset

Downloading builder script:   0%|          | 0.00/9.13k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.99k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.68k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.37M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/790k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8544 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1101 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2210 [00:00<?, ? examples/s]

{'label': 0.6944400072097778,
 'sentence': "The Rock is destined to be the 21st Century 's new `` Conan '' "
             "and that he 's going to make a splash even greater than Arnold "
             'Schwarzenegger , Jean-Claud Van Damme or Steven Segal .',
 'tokens': "The|Rock|is|destined|to|be|the|21st|Century|'s|new|``|Conan|''|and|that|he|'s|going|to|make|a|splash|even|greater|than|Arnold|Schwarzenegger|,|Jean-Claud|Van|Damme|or|Steven|Segal|.",
 'tree': '70|70|68|67|63|62|61|60|58|58|57|56|56|64|65|55|54|53|52|51|49|47|47|46|46|45|40|40|41|39|38|38|43|37|37|69|44|39|42|41|42|43|44|45|50|48|48|49|50|51|52|53|54|55|66|57|59|59|60|61|62|63|64|65|66|67|68|69|71|71|0'}


You will only use **'sentence'** and **'label'** of the data. Please ignore the other values. Note that the label is between 0 and 1. You will round it to either 0 or 1 for binary classification (1 means it is a positive review and 0 means it is a negative review)


## Part 2. Word Embedding
In this section, you will download a pretrained word embedding called Glove and use it to convert words in to a vector representation.


In [5]:
from torchtext.vocab import GloVe

glove = GloVe(name='6B',dim = 300)


.vector_cache/glove.6B.zip: 862MB [02:38, 5.43MB/s]                           
100%|█████████▉| 399999/400000 [01:05<00:00, 6080.25it/s]


In [12]:
print(glove['apple'])

tensor([-0.2084, -0.0197,  0.0640, -0.7140, -0.2118, -0.5928, -0.1532,  0.0442,
         0.6329, -0.8482, -0.2113, -0.1976,  0.1903, -0.5623,  0.2713,  0.2378,
        -0.5189, -0.2452,  0.0352,  0.0968,  0.2490,  0.7128,  0.0383, -0.1051,
        -0.4779, -0.3952, -0.2719, -0.4443,  0.0611, -0.2318, -0.3590, -0.1824,
         0.0355, -0.0877, -1.0816, -0.4252,  0.0032, -0.4599, -0.0435, -0.3903,
         0.5190,  0.2114, -0.2553,  1.1805, -0.1904, -0.1216,  0.0342, -0.0623,
         0.1442, -0.5337,  0.4742, -0.4471,  0.5805,  0.4358,  0.1321, -0.0957,
        -0.3718, -0.0138,  0.2060, -0.1010,  0.1068, -0.3372,  0.1099,  0.3480,
        -0.0998,  0.3694, -0.5292,  0.1241, -0.4613, -0.3848, -0.1011, -0.1763,
         0.3757,  0.1638, -0.2198, -0.2684,  0.8471, -0.3562, -0.0840, -0.2028,
        -0.5654,  0.1911, -0.1413, -0.7812,  0.6919, -0.0836, -0.5429,  0.1644,
         0.0376, -0.6890, -0.6871, -0.1337, -0.4779,  0.2013,  0.0851, -0.0639,
        -0.1710, -0.3243, -0.1762, -0.51

In [13]:
print(glove['notaword'])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [14]:
print(glove['Apple'])

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

Now you will convert every word in every training sentence with its corresponding word embedding vector

In [15]:
import nltk
nltk.download('punkt')
training_data = []
length = 16

for idx_, sentence in enumerate(sst_dataset['train']['sentence']):
  #tokenize word
  words = nltk.word_tokenize(sentence)

  #padding or truncating based on the length
  if len(words) > 16:
    words = words[:16]
  else:
    for i in range(0,16-len(words)):
      words.append('PAD')

  #convert words to their embeddings
  ret = glove.get_vecs_by_tokens(words, lower_case_backup = True)
  #print(ret.size())
  training_data.append(ret)

training_data = torch.stack(training_data) #convert list of tensors to tensors
print(training_data.size()) #note that now the training data is now of shape (#training data, length of the sentence, dimension of word vector representation) =  (8544, 16, 300)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


torch.Size([8544, 16, 300])


Do the same for testing data (except for padding and truncating because we will be inputting test sentence one by one during inference, so they don't have to be of equal length)

In [16]:
testing_data = []
for idx_, sentence in enumerate(sst_dataset['test']['sentence']):
  #tokenize word
  words = nltk.word_tokenize(sentence)

  #convert words to their embeddings
  ret = glove.get_vecs_by_tokens(words, lower_case_backup = True)
  testing_data.append(ret)

## Utility Functions
<br> <font color='red' > **Utility functions and code for Part 3~4. Please run this before doing Part 3 and Part 4. You do not need to change anything here**</font>

In [53]:
#utilities
from torch.utils.data import DataLoader,Dataset
from torch import nn

#function for creating dataloaders
def create_dataloader(data,label,train):
    #create DataLoader
    if (train == 0):
        print(data.size())
        print(label)
        print(torch.round(torch.Tensor(label)).to(torch.long))
        train = torch.utils.data.TensorDataset(data, torch.round(torch.Tensor(label)).to(torch.long))
        #train_data, val_data = torch.utils.data.random_split(train,[int(0.80*len(train)),len(train)-int(0.80*len(train))], generator= torch.Generator().manual_seed(42) )
        return DataLoader(train, batch_size = 16, shuffle=True, drop_last = True) #DataLoader(val_data, batch_size = 16, shuffle=True, drop_last = True)
    else:
        test = torch.utils.data.TensorDataset(torch.Tensor(input).to(torch.int32), (torch.round(torch.Tensor(data['label']))).to(torch.long))
        return DataLoader(test, batch_size = 16, shuffle=True)

def train(num_epoch, model, train_loader):
  for epoch in range(0,num_epoch):
      train_loss = 0
      model.train()
      for batch_id, (data,label) in enumerate(train_loader):
          data = data.to(device)
          label = label.to(device)
          optimizer.zero_grad()
          logits = model(data)
          loss = cel(logits,label)
          loss.backward()
          optimizer.step()
          train_loss += loss.item()
      average_loss = train_loss / len(train_loader.dataset)
      print('====> Epoch: {} Average training loss: {:.4f}'.format(
            epoch, average_loss))


def accuracy(pred,target):
    correct = 0
    for i in range(0,pred.size()[0]):
        if pred[i] == target[i]:
            correct += 1
    return correct

def test(model,test_data, test_label):
  test_label = torch.round(torch.Tensor(test_label)).to(torch.long).to(device)
  total_correct = 0
  total = 0
  model.eval()
  for idx in range(0,len(test_data)):
    data = test_data[idx].to(device).view(-1, test_data[idx].size()[0], test_data[idx].size()[1])
    label = test_label[idx].to(device).view(1)
    logits = model(data)
    pred = m(logits)
    pred = torch.argmax(pred,dim=1)
    total_correct += accuracy(pred,label)
    total += label.size()[0]

  print('Accuracy on the test data is: ' + str(total_correct/total))

## Part 3. Vanilla RNN
In this section, you implement a vanilla RNN and perform text classification with it.

In [31]:
import torch.nn as nn
import torch.nn.functional as F

In [52]:
#Vanilla RNN
class Model(nn.Module):
    def __init__(self, d):
        super(Model, self).__init__()

        self.input_dim = d
        self.hidden_dim = d
        self.output_dim = 2

        #x -> h layer
        self.U = nn.Linear(self.input_dim, self.hidden_dim) # Add your answer here

        #h -> h layer
        self.W = nn.Linear(self.hidden_dim, self.hidden_dim)# Add your answer here

        #h -> output layer
        self.V = nn.Linear(self.hidden_dim, self.output_dim) # Add your answer here

    def forward(self, x):  #x: size [BatchSize, Length, Word Vector Length]
        # Initializing hidden state for first input using method defined below
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)

        #length of sequence
        self.length = x.size()[1]

        #Iterate through sentence and input to RNN sequentially
        for t in range(0,self.length):
            xt = x[:,t,:] #shape [BatchSzie, Length of word embedding vector]
            m = nn.Tanh()
            #print(1)
            #print(self.W(hidden)+ self.U(xt))
            hidden = m( (self.W(hidden)+ self.U(xt)) )# Add your answer here

        #output logit using last sequence
        return self.V(hidden) # Add your answer here

    def init_hidden(self, batch_size):
        # This method generates the first hidden state of zeros which we'll use in the forward pass
        # We'll send the tensor holding the hidden state to the device we specified earlier as well
        hidden = torch.zeros(batch_size, self.hidden_dim).to(device)
        return hidden


Now perform text classification with your vanilla RNN. You will see your model's accuracy.

In [54]:
#dataloader
train_loader = create_dataloader(training_data, sst_dataset['train']['label'], 0)

#create model obejct
d = 300
rnn_model = Model(d).to(device)
m = nn.Softmax(1)
cel = nn.CrossEntropyLoss()

#optimizer
optimizer = torch.optim.SGD(rnn_model.parameters(), lr = 0.01)

#train
num_epoch = 15
train(num_epoch,rnn_model,train_loader)

#test
test(rnn_model,testing_data, sst_dataset['test']['label'])

torch.Size([8544, 16, 300])
[0.6944400072097778, 0.833329975605011, 0.625, 0.5, 0.7222200036048889, 0.833329975605011, 0.875, 0.7222200036048889, 0.833329975605011, 0.7361099720001221, 0.9027799963951111, 0.44444000720977783, 0.8055599927902222, 0.44444000720977783, 0.8194400072097778, 0.75, 0.6111099720001221, 0.44444000720977783, 0.8194400072097778, 0.7777799963951111, 0.8194400072097778, 0.6388900279998779, 0.5555599927902222, 0.875, 0.5555599927902222, 0.5138900279998779, 0.9444400072097778, 0.7222200036048889, 0.9305599927902222, 0.3333300054073334, 0.8194400072097778, 0.7777799963951111, 0.5694400072097778, 0.7361099720001221, 0.8611099720001221, 0.6805599927902222, 0.7361099720001221, 0.7222200036048889, 0.541670024394989, 0.6805599927902222, 0.7638900279998779, 0.833329975605011, 0.4027799963951111, 0.6527799963951111, 0.5277799963951111, 0.16666999459266663, 0.7777799963951111, 0.6111099720001221, 0.375, 0.9444400072097778, 0.75, 0.8611099720001221, 0.6388900279998779, 0.88889