# HW4 RNN
Sentiment classification on Twitter comments.

Import packages.

In [None]:
import torch
import os
import csv
import random
import numpy as np
import pandas as pd
import torch.nn.functional as F

from torch.nn.utils.rnn import pad_sequence
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

Download the dataset and unzip it.

In [None]:
# Upgrade gdown; it is only needed by the commented-out gdown alternative at the bottom of this cell.
!pip install --upgrade gdown
# Fetch the dataset archive from Google Drive and store it as DATASET.zip.
# The inner wget saves the session cookies and sed extracts the "confirm" token from its output;
# the outer wget then requests the file with that token, and the cookie file is removed afterwards.
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1cwPgbbAMNPZ9nCoyOW2WuavimYymCKKy' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1cwPgbbAMNPZ9nCoyOW2WuavimYymCKKy" -O DATASET.zip && rm -rf /tmp/cookies.txt

# Alternative download of the same file id via gdown:
# !gdown --id "1cwPgbbAMNPZ9nCoyOW2WuavimYymCKKy" --output DATASET.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.5.4-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.5.4
--2022-11-28 15:31:26--  https://docs.google.com/uc?export=download&confirm=t&id=1cwPgbbAMNPZ9nCoyOW2WuavimYymCKKy
Resolving docs.google.com (docs.google.com)... 142.251.10.101, 142.251.10.139, 142.251.10.100, ...
Connecting to docs.google.com (docs.google.com)|142.251.10.101|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://doc-10-88-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/f44s9frm7jvuvcjkkvlsd3at61h48or7/1669649475000/03249990876179673050/*/1cwPgbbAMNPZ9nCoyOW2WuavimYymCKKy?e=download&uuid=ae93704e-4597-4eb5-92a8-c0fa54a3ca3f [fol

In [None]:
!unzip DATASET.zip

Archive:  DATASET.zip
  inflating: HW4_dataset/test.csv    
  inflating: HW4_dataset/train.csv   
  inflating: HW4_dataset/train_nolabel.csv  


Check the data: load the labeled and unlabeled training CSVs and preview the unlabeled tweets.

In [None]:
import os
import pandas as pd

# Widen the column display so that tweets are shown in full rather than truncated.
pd.set_option("display.max_colwidth", 2000)

# Both CSV files are resolved relative to the current working directory.
dtr, dnl = (
    pd.read_csv(os.path.join(os.getcwd(), relative_path))
    for relative_path in ("HW4_dataset/train.csv", "HW4_dataset/train_nolabel.csv")
)

# To look at the label-1 tweets only: dtr[dtr["label"]==1]["text"]
dnl

Unnamed: 0,id,text
0,170000,I really feel like a vanilla slice but the shop sold out
1,170001,@kelly208 I guess we should be thankful that the tweeps haven't tried to kill us with their event suggestions.
2,170002,Last night was shit.&amp; now I have to cover someones shift meh what's going on tonight people? Anything? I'm in summer mode for a week or 2
3,170003,Iris by Goo Goo Dolls make me cry
4,170004,My ankle is trobbing. Tequilla helps a loottt haha lmao drunk as a skunk (WTF does that mean?) who cares haha I don't going back to party
...,...,...
629995,799995,@stoopidgerl Im sorry to hear about your loss. Its always horrible to lose a pet and member of the family.
629996,799996,@RhapsodyInBleh if you go on skype sure
629997,799997,watching daisy of love
629998,799998,"@HoptonHouseBnB @violetbakes yes I should be, but my 7yr old son doesn't share this view"


Basic setup of hyperparameters

In [None]:
# --- Training hyperparameters -------------------------------------------------
BATCH_SIZE, EPOCH_NUM, EARLY_STOP = 256, 300, 50
MAX_POSITIONS_LEN = 100
SEED = 999
MODEL_DIR = 'model.pth'  # despite the name, this is the checkpoint *file* path
lr = 1e-5

# --- Reproducibility: seed every RNG in use and force deterministic cuDNN ------
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# --- Component configurations (unpacked as keyword arguments further down) ----
w2v_config = dict(path='w2v.model', dim=64)
net_config = dict(hidden_dim=64, num_layers=8, bidirectional=True, fix_embedding=False)
header_config = dict(dropout=0.5, hidden_dim=64)

# The header width must equal the backbone width, or twice that width.
assert header_config['hidden_dim'] in (net_config['hidden_dim'], net_config['hidden_dim'] * 2)

Auxiliary functions and classes definition

In [None]:
def parsing_text(text_list):
    """Replace mention, link and money tokens with placeholder tags.

    Each token is checked against the markers in order: a token containing
    "@" becomes "[person]", otherwise one containing "http" becomes "[url]",
    otherwise one containing "$" becomes "[dollar]". Any other token is kept
    unchanged.

    Args:
        text_list: iterable of string tokens.

    Returns:
        A new list with the same number of tokens.
    """
    # Order matters: the first marker found in a token decides its tag.
    placeholders = (("@", "[person]"), ("http", "[url]"), ("$", "[dollar]"))

    def _normalise(token):
        for marker, tag in placeholders:
            if marker in token:
                return tag
        return token

    return [_normalise(token) for token in text_list]

def load_train_label(path='HW4_dataset/train.csv'):
    """Read the labeled training CSV.

    Args:
        path: CSV file providing the 'label', 'id' and 'text' columns.

    Returns:
        (ids, sentences, labels): the ids as a list, every text split on
        single spaces and passed through parsing_text, and the labels as a
        torch.FloatTensor.
    """
    frame = pd.read_csv(path)
    targets = torch.FloatTensor(frame['label'].values)
    ids = frame['id'].tolist()
    sentences = []
    for raw_text in frame['text'].tolist():
        sentences.append(parsing_text(raw_text.split(' ')))
    return ids, sentences, targets

def load_train_nolabel(path='HW4_dataset/train_nolabel.csv'):
    """Read the unlabeled training CSV.

    Args:
        path: CSV file providing a 'text' column.

    Returns:
        (None, sentences, None): only the tokenised texts carry data; the id
        and label slots are None so the tuple has the same layout as the
        result of load_train_label.
    """
    frame = pd.read_csv(path)
    sentences = []
    for raw_text in frame['text'].tolist():
        sentences.append(parsing_text(raw_text.split(' ')))
    return None, sentences, None

def load_test(path='HW4_dataset/test.csv'):
    """Read the test CSV.

    Args:
        path: CSV file providing the 'id' and 'text' columns.

    Returns:
        (ids, sentences): the ids as a list and every text split on single
        spaces and passed through parsing_text.
    """
    frame = pd.read_csv(path)
    ids = frame['id'].tolist()
    sentences = []
    for raw_text in frame['text'].tolist():
        sentences.append(parsing_text(raw_text.split(' ')))
    return ids, sentences

In [None]:
class Preprocessor:
    """Vocabulary and embedding matrix built from a Word2Vec model.

    After construction the instance exposes:
      * word2idx / idx2word: the word <-> index mappings,
      * embedding_matrix: a (vocabulary size, embedding_dim) tensor whose last
        two rows are the randomly initialised '<PAD>' and '<UNK>' vectors,
      * embedding_dim: the Word2Vec vector size.
    """

    def __init__(self, sentences, w2v_config):
        """
        Args:
            sentences: list of tokenised sentences (lists of strings).
            w2v_config: dict with the keys 'path' and 'dim', forwarded to
                build_word2vec.
        """
        self.sentences = sentences
        self.idx2word = []
        self.word2idx = {}
        self.embedding_matrix = []
        self.build_word2vec(sentences, **w2v_config)

    def build_word2vec(self, x, path, dim):
        """Load the Word2Vec model stored at `path`, or train and save one.

        Works with both gensim 3.x and gensim >= 4.0, whose keyword and
        attribute names differ.

        Args:
            x: tokenised sentences used for training when no saved model exists.
            path: file the model is loaded from / saved to.
            dim: embedding size used when a new model is trained.
        """
        print(path, dim)
        if os.path.isfile(path):
            print("loading word2vec model ...")
            w2v_model = Word2Vec.load(path)
        else:
            print("training word2vec model ...")
            shared_kwargs = dict(window=5, min_count=2, workers=12, sg=1)
            try:
                # gensim >= 4.0 keyword names.
                w2v_model = Word2Vec(x, vector_size=dim, epochs=5, **shared_kwargs)
            except TypeError:
                # gensim 3.x rejects the new keywords before any training starts.
                w2v_model = Word2Vec(x, size=dim, iter=5, **shared_kwargs)
            print("saving word2vec model ...")
            w2v_model.save(path)

        self.embedding_dim = w2v_model.vector_size
        keyed_vectors = w2v_model.wv
        # gensim >= 4.0 exposes the vocabulary as `key_to_index`; 3.x as `vocab`.
        vocab = getattr(keyed_vectors, 'key_to_index', None)
        if vocab is None:
            vocab = keyed_vectors.vocab

        vectors = []
        for word in vocab:
            self.word2idx[word] = len(self.word2idx)
            self.idx2word.append(word)
            # `model[word]` was removed in gensim 4; `model.wv[word]` works in both.
            vectors.append(keyed_vectors[word])

        # Stack into a single ndarray first: torch.tensor() on a list of ndarrays
        # is very slow and emits a warning. The reshape keeps the matrix 2-D
        # even for an empty vocabulary.
        stacked = np.array(vectors, dtype=np.float32).reshape(-1, self.embedding_dim)
        self.embedding_matrix = torch.tensor(stacked)
        self.add_embedding('<PAD>')
        self.add_embedding('<UNK>')
        print("total words: {}".format(len(self.embedding_matrix)))

    def add_embedding(self, word):
        """Append `word` to the vocabulary with a uniformly initialised vector."""
        vector = torch.empty(1, self.embedding_dim)
        torch.nn.init.uniform_(vector)
        self.word2idx[word] = len(self.word2idx)
        self.idx2word.append(word)
        self.embedding_matrix = torch.cat([self.embedding_matrix, vector], 0)

    def sentence2idx(self, sentence):
        """Map a tokenised sentence to a LongTensor of vocabulary indices.

        Tokens missing from the vocabulary are mapped to the '<UNK>' index.
        """
        unk_idx = self.word2idx["<UNK>"]
        return torch.LongTensor([self.word2idx.get(word, unk_idx) for word in sentence])
    
class TwitterDataset(torch.utils.data.Dataset):
    """Dataset of tokenised tweets, with or without labels.

    Args:
        id_list: one id per sentence.
        sentences: list of tokenised sentences (lists of strings).
        labels: per-sentence labels, or None for an unlabeled (test) set.
        preprocessor: object providing `sentence2idx(sentence)` and a
            `word2idx` mapping (e.g. a Preprocessor instance).
    """

    def __init__(self, id_list, sentences, labels, preprocessor):
        self.id_list = id_list
        self.sentences = sentences
        self.labels = labels
        self.preprocessor = preprocessor

    def __getitem__(self, idx):
        """Return (id, token indices) or, when labels exist, (id, token indices, label)."""
        token_idx = self.preprocessor.sentence2idx(self.sentences[idx])
        if self.labels is None:
            return self.id_list[idx], token_idx
        return self.id_list[idx], token_idx, self.labels[idx]

    def __len__(self):
        return len(self.sentences)

    def collate_fn(self, data):
        """Batch a list of samples produced by __getitem__.

        Returns:
            (ids, lengths, texts) or (ids, lengths, texts, labels). `texts` is
            padded to the longest sentence of the batch, batch dimension
            first; `lengths` holds the unpadded sentence lengths.
        """
        id_list = torch.LongTensor([d[0] for d in data])
        lengths = torch.LongTensor([len(d[1]) for d in data])
        # Pad with the dedicated '<PAD>' index. pad_sequence's default padding
        # value of 0 is the index of a real vocabulary word.
        pad_idx = self.preprocessor.word2idx.get('<PAD>', 0)
        texts = pad_sequence(
            [d[1] for d in data], batch_first=True, padding_value=pad_idx).contiguous()

        # `is None` rather than `== None`: labels may be a tensor, for which
        # `==` is an element-wise comparison.
        if self.labels is None:
            return id_list, lengths, texts
        labels = torch.FloatTensor([float(d[2]) for d in data])
        return id_list, lengths, texts, labels

In [None]:
# Labeled training set: ids, tokenised tweets and float labels.
train_idx, train_label_text, label = load_train_label('HW4_dataset/train.csv')
# Unlabeled tweets: load_train_nolabel returns None for the ids and the labels,
# so only ultrain_label_text carries data.
ultrain_idx, ultrain_label_text, ullabel = load_train_nolabel('HW4_dataset/train_nolabel.csv')

In [None]:
# Build the vocabulary and embedding matrix from the labeled and unlabeled tweets together.
# The Word2Vec model is loaded from w2v_config['path'] if that file exists, otherwise trained and saved there.
preprocessor = Preprocessor(train_label_text+ultrain_label_text, w2v_config)

w2v.model 64
training word2vec model ...
saving word2vec model ...




total words: 164541




In [None]:
# Hold out half of the labeled data for validation. random_state pins the split so
# that it is identical on every run, independent of how much of NumPy's global
# random stream was consumed beforehand.
# NOTE: this overwrites train_idx / train_label_text, so re-run the data loading
# cell before executing this cell a second time.
train_idx, valid_idx, train_label_text, valid_label_text, train_label, valid_label = train_test_split(
    train_idx, train_label_text, label, test_size=0.5, random_state=SEED)
train_dataset = TwitterDataset(train_idx, train_label_text, train_label, preprocessor)
valid_dataset = TwitterDataset(valid_idx, valid_label_text, valid_label, preprocessor)

test_idx, test_text = load_test('HW4_dataset/test.csv')
test_dataset = TwitterDataset(test_idx, test_text, None, preprocessor)

# Never request more worker processes than the machine has CPUs; DataLoader warns
# (and may slow down or freeze) when oversubscribed.
NUM_WORKERS = min(8, os.cpu_count() or 1)

# Only the training loader shuffles; validation and test keep the dataset order.
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = True,
                                            collate_fn = train_dataset.collate_fn,
                                            num_workers = NUM_WORKERS)
valid_loader = torch.utils.data.DataLoader(dataset = valid_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = valid_dataset.collate_fn,
                                            num_workers = NUM_WORKERS)
test_loader = torch.utils.data.DataLoader(dataset = test_dataset,
                                            batch_size = BATCH_SIZE,
                                            shuffle = False,
                                            collate_fn = test_dataset.collate_fn,
                                            num_workers = NUM_WORKERS)

  cpuset_checked))


Definition of the network: a Transformer-encoder backbone (used here in place of an RNN) followed by a classification header

In [None]:
class Backbone(torch.nn.Module):
    """Embedding lookup followed by a Transformer encoder.

    Args:
        embedding: (vocabulary size, embedding dim) float tensor holding the
            pre-trained word vectors. The embedding dim is used as the
            Transformer's d_model and must be divisible by the 8 attention heads.
        hidden_dim, num_layers, bidirectional: accepted so that `net_config`
            can be unpacked into the constructor, but not used by this
            Transformer implementation (the encoder depth is fixed at 6).
        fix_embedding: when True the embedding matrix is frozen; when False it
            is fine-tuned together with the rest of the model.
    """

    def __init__(self, embedding, hidden_dim, num_layers, bidirectional, fix_embedding=True):
        super(Backbone, self).__init__()
        # Register the matrix as a Parameter so that it follows .to(device), is
        # part of parameters()/state_dict() and honours `fix_embedding`.
        # The clone keeps training from modifying the caller's tensor in place.
        self.embedding = torch.nn.Parameter(embedding.detach().clone(),
                                            requires_grad=not fix_embedding)
        self.encoderlayer = torch.nn.TransformerEncoderLayer(d_model=embedding.size(1), nhead=8, batch_first=True)
        self.net = torch.nn.TransformerEncoder(self.encoderlayer, num_layers=6)

    def forward(self, inputs):
        """Encode a batch of token indices.

        Args:
            inputs: integer tensor of vocabulary indices, batch dimension first.

        Returns:
            The encoder output: one vector of size embedding dim per input token.
        """
        # Plain indexing (rather than calling a module) keeps forward() usable
        # with previously pickled models that stored the matrix as a raw tensor.
        inputs = self.embedding[inputs]
        return self.net(inputs)
    
class Header(torch.nn.Module):
    """Classification head: masked sum over time steps followed by a small MLP.

    Args:
        dropout: accepted so that `header_config` can be unpacked into the
            constructor; the current classifier does not use it.
        hidden_dim: size of the feature vectors produced by the backbone.
    """

    def __init__(self, dropout, hidden_dim):
        super(Header, self).__init__()
        self.classifier = torch.nn.Sequential(torch.nn.Linear(hidden_dim, hidden_dim // 2),
                            torch.nn.ReLU(),
                            torch.nn.Linear(hidden_dim // 2, 1),
                            torch.nn.Sigmoid())

    @torch.no_grad()
    def _get_length_masks(self, lengths):
        """Build a boolean mask that is True on real tokens and False on padding.

        Args:
            lengths: (batch_size,) integer tensor of unpadded sequence lengths.

        Returns:
            Boolean tensor of shape (batch_size, lengths.max(), 1).
        """
        # Size the mask from the batch itself instead of a global cap, so
        # sequences of any length are supported.
        max_len = int(lengths.max().item())
        positions = torch.arange(max_len, device=lengths.device).unsqueeze(0)
        return (positions < lengths.unsqueeze(-1)).unsqueeze(-1)

    def forward(self, inputs, lengths):
        """Score every sequence of the batch.

        Args:
            inputs: (N, L, hidden_dim) features, where L equals lengths.max().
            lengths: (N,) integer tensor of unpadded sequence lengths.

        Returns:
            (N,) tensor of sigmoid outputs in [0, 1].
        """
        pad_mask = self._get_length_masks(lengths)
        # Zero the padded positions so that they do not contribute to the sum.
        inputs = inputs * pad_mask
        inputs = inputs.sum(dim=1)
        # squeeze(-1) removes only the feature dimension; a bare squeeze() would
        # also drop the batch dimension of a single-sample batch.
        return self.classifier(inputs).squeeze(-1)

Trainer

In [None]:
def train(train_loader, backbone, header, optimizer, criterion, device, epoch):
    """Run one training epoch.

    Args:
        train_loader: iterable yielding (ids, lengths, texts, labels) batches.
        backbone: feature extractor applied to `texts`, or None to feed the
            texts to the header directly.
        header: module called as header(inputs, lengths); returns one
            probability per sample.
        optimizer: optimizer over the trainable parameters.
        criterion: loss called as criterion(predictions, labels).
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index, used for the progress line only.

    Returns:
        (mean batch loss, mean batch accuracy in percent); both are NaN when
        the loader yields no batch.
    """
    if backbone is not None:
        backbone.train()
    header.train()

    total_loss = []
    total_acc = []

    for idx_list, lengths, texts, labels in train_loader:
        lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)

        optimizer.zero_grad()
        if backbone is not None:
            inputs = backbone(inputs)
        soft_predicted = header(inputs, lengths)
        loss = criterion(soft_predicted, labels)
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            hard_predicted = (soft_predicted >= 0.5).int()
            correct = (hard_predicted == labels).sum().item()
            # Record the batch statistics; without this the epoch summary is NaN.
            total_loss.append(loss.item())
            total_acc.append(correct * 100 / len(labels))

        print('[Training in epoch {:}] loss:{:.3f} acc:{:.3f}'.format(epoch+1, np.mean(total_loss), np.mean(total_acc)), end='\r')

    if not total_loss:
        return float('nan'), float('nan')
    return np.mean(total_loss), np.mean(total_acc)

def valid(valid_loader, backbone, header, criterion, device, epoch):
    """Evaluate on the validation set without updating any weights.

    The modules are switched to eval mode for the pass and back to train mode
    before returning.

    Args:
        valid_loader: iterable yielding (ids, lengths, texts, labels) batches.
        backbone: feature extractor applied to `texts`, or None to feed the
            texts to the header directly.
        header: module called as header(inputs, lengths); returns one
            probability per sample.
        criterion: loss called as criterion(predictions, labels).
        device: device the batch tensors are moved to.
        epoch: zero-based epoch index, used for the progress line only.

    Returns:
        (mean batch loss, mean batch accuracy in percent); both are NaN when
        the loader yields no batch.
    """
    if backbone is not None:
        backbone.eval()
    header.eval()

    total_loss = []
    total_acc = []
    with torch.no_grad():
        for idx_list, lengths, texts, labels in valid_loader:
            lengths, inputs, labels = lengths.to(device), texts.to(device), labels.to(device)

            if backbone is not None:
                inputs = backbone(inputs)
            soft_predicted = header(inputs, lengths)
            loss = criterion(soft_predicted, labels)
            total_loss.append(loss.item())

            hard_predicted = (soft_predicted >= 0.5).int()
            correct = (hard_predicted == labels).sum().item()
            total_acc.append(correct * 100 / len(labels))

            print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f}'.format(epoch+1, np.mean(total_loss), np.mean(total_acc)), end='\r')

    # Hand the modules back in training mode for the next epoch.
    if backbone is not None:
        backbone.train()
    header.train()

    if not total_loss:
        return float('nan'), float('nan')
    return np.mean(total_loss), np.mean(total_acc)

            
def run_training(train_loader, valid_loader, backbone, header, epoch_num, lr, early_stop, device, model_dir):
    """Train with Adam, checkpoint on improvement and stop early on stagnation.

    Args:
        train_loader, valid_loader: loaders passed to train() / valid().
        backbone: feature extractor, or None to train the header alone.
        header: classification head.
        epoch_num: maximum number of epochs.
        lr: initial learning rate (multiplied by 0.1 every 40 epochs).
        early_stop: number of consecutive epochs without a validation-loss
            improvement after which training stops.
        device: device the modules and batches are moved to.
        model_dir: file the best {'backbone', 'header'} pair is saved to.
    """
    def check_point(backbone, header, loss, acc, model_dir, epoch):
        # NOTE: whole module objects are pickled, so loading the checkpoint
        # requires the class definitions to be importable.
        torch.save({'backbone': backbone, 'header': header}, model_dir)
        print(f"save at {epoch}: {loss}")

    # Move the modules first so that the optimizer is built from the
    # parameters it will actually update.
    if backbone is not None:
        backbone = backbone.to(device)
        backbone.train()
    header = header.to(device)
    header.train()

    trainable_paras = list(header.parameters())
    if backbone is not None:
        trainable_paras = list(backbone.parameters()) + trainable_paras

    optimizer = torch.optim.Adam(trainable_paras, lr=lr, eps=1e-08)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)
    criterion = torch.nn.BCELoss()

    # Start from +inf so that the first finite validation loss is always saved.
    min_loss = float('inf')
    es = 0  # consecutive epochs without improvement
    for epoch in range(epoch_num):
        train(train_loader, backbone, header, optimizer, criterion, device, epoch)
        lr_scheduler.step()
        loss, acc = valid(valid_loader, backbone, header, criterion, device, epoch)
        print('[Validation in epoch {:}] loss:{:.3f} acc:{:.3f} '.format(epoch+1, loss, acc))
        if loss < min_loss:
            check_point(backbone, header, loss, acc, model_dir, epoch)
            min_loss = loss
            es = 0
        else:
            # Covers an equal or a NaN loss as well: neither is an improvement.
            es += 1
            if es >= early_stop:
                break

Training

In [None]:
# The embedding matrix is moved to `device` before it is handed to the backbone;
# net_config / header_config supply the remaining constructor arguments.
backbone = Backbone(preprocessor.embedding_matrix.to(device), **net_config)
header = Header(**header_config)

# Train both modules; run_training saves a checkpoint to MODEL_DIR whenever the validation loss improves.
run_training(train_loader, valid_loader, backbone, header, EPOCH_NUM, lr, EARLY_STOP, device, MODEL_DIR)

Testing

In [None]:
def run_testing(test_loader, backbone, header, device, output_path):
    """Predict labels for the test set and write them to a CSV file.

    Args:
        test_loader: iterable yielding (ids, lengths, texts) batches.
        backbone: feature extractor applied to `texts`, or None to feed the
            texts to the header directly.
        header: module called as header(inputs, lengths); returns one
            probability per sample.
        device: device the batch tensors are moved to.
        output_path: CSV file to write; it gets the header row 'id,label'
            followed by one row per sample.

    The modules are left in eval mode.
    """
    if backbone is not None:
        backbone.eval()
    header.eval()
    # newline='' is what the csv module asks for; without it some platforms
    # insert blank lines between the rows.
    with open(output_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'label'])
        with torch.no_grad():
            for idx_list, lengths, texts in test_loader:
                lengths, inputs = lengths.to(device), texts.to(device)
                if backbone is not None:
                    inputs = backbone(inputs)
                soft_predicted = header(inputs, lengths)
                # reshape(-1) keeps a single-sample batch iterable even if the
                # header returned a 0-d tensor.
                hard_predicted = (soft_predicted >= 0.5).int().reshape(-1)
                for sample_id, prediction in zip(idx_list, hard_predicted):
                    writer.writerow([str(sample_id.item()), str(prediction.item())])

In [None]:
# Write the test-set predictions (header row: id, label) to pred.csv.
pred_file = 'pred.csv'
run_testing(test_loader, backbone, header, device, pred_file)


In [None]:
# Trigger a browser download of the prediction file when running on Google Colab.
# Elsewhere the file simply stays on disk, so the notebook still runs end to end.
try:
    from google.colab import files
except ImportError:
    print(f"Not running on Colab; predictions are available at {pred_file}")
else:
    files.download(pred_file)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Test: compare predictions for two sentences made of the same words in a different order

In [None]:
# Two sentences made of the same words, with "heart" and "hammer" swapped.
s1 = "my heart beating like a hammer"
s2 = "my hammer beating like a heart"
# Despite the *_embed names these are vocabulary-index tensors (sentence2idx returns a
# LongTensor of word indices); unsqueeze(0) adds a leading batch dimension.
s1_embed = preprocessor.sentence2idx(parsing_text(s1.split(" "))).unsqueeze(0)
s2_embed = preprocessor.sentence2idx(parsing_text(s2.split(" "))).unsqueeze(0)

In [None]:
# Read the checkpoint once and map its tensors onto the active device, so that a
# model saved on a GPU can also be loaded on a CPU-only machine.
# NOTE(review): this file name differs from MODEL_DIR; presumably it is a renamed
# checkpoint from an earlier run - confirm that it exists before running.
# NOTE: the checkpoint holds pickled module objects; only load files you trust.
checkpoint = torch.load("model_encoder_loss444.pth", map_location=device)
backbone = checkpoint["backbone"].to(device)
header = checkpoint["header"].to(device)

In [None]:
backbone.eval()
header.eval()

# Both sentences have the same number of tokens, so they can be stacked without padding.
# Inputs and lengths are both moved to the model's device.
inputs = torch.cat((s1_embed, s2_embed)).to(device)
# Take the lengths from the tensors instead of hard-coding the token count.
lengths = torch.LongTensor([s1_embed.size(1), s2_embed.size(1)]).to(device)
with torch.no_grad():
    inputs = backbone(inputs)
    soft_predicted = header(inputs, lengths)
    hard_predicted = (soft_predicted >= 0.5).int()

# Show the probabilities and the thresholded labels for the two sentences.
soft_predicted, hard_predicted