In [1]:
import random
import gc

from tqdm import tqdm
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import AutoModel, AutoTokenizer, AutoConfig, AdamW

gc.collect()

0

In [2]:
random.seed(42)
torch.manual_seed(42)
np.random.seed(42)

# Prepare data

In [3]:
!apt-get install unzip
!unzip ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip test.tsv
!unzip ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip train.tsv




unzip is already the newest version (6.0-21ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.
Archive:  ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip
  inflating: test.tsv                
Archive:  ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip
  inflating: train.tsv               


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sample_submission = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv')

train_df = pd.read_csv('train.tsv', sep='\t')
print(train_df.shape)
print(train_df.info())
train_df.head()

(156060, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
None


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test_df = pd.read_csv('test.tsv', sep='\t')
print(test_df.shape)
print(test_df.info())
test_df.head()

(66292, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    66292 non-null  int64 
 1   SentenceId  66292 non-null  int64 
 2   Phrase      66292 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB
None


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


# Text Processing

In [6]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', lower=True)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [7]:
class MovieReviewsDataset(Dataset):
    def __init__(self, df, max_len, test_only=False):
        self.max_len = max_len
        self.test_only = test_only
        self.text = df['Phrase'].tolist()
        if not self.test_only:
            self.sentiments = df['Sentiment'].values
            
        self.encode = tokenizer.batch_encode_plus(
            self.text,
            padding='max_length',
            max_length=self.max_len,
            truncation=True,
            return_attention_mask=True
        )
        
    def __getitem__(self, i):
        input_ids = torch.tensor(self.encode['input_ids'][i])
        attention_mask = torch.tensor(self.encode['attention_mask'][i])
        
        if self.test_only:
            return (input_ids, attention_mask)
        else:
            sentiments = self.sentiments[i]
            return (input_ids, attention_mask, sentiments)
    
    def __len__(self):
        return len(self.text)


In [8]:
max_len = 64
train_dataset = MovieReviewsDataset(train_df, max_len)
test_dataset = MovieReviewsDataset(test_df, max_len, test_only=True)

lengths = [int(len(train_dataset) * 0.8), int(len(train_dataset) * 0.2)]
train_dataset, valid_dataset = random_split(train_dataset, lengths=lengths, generator=torch.Generator().manual_seed(42))

train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_dataloader = DataLoader(valid_dataset, batch_size=128)
test_dataloader = DataLoader(test_dataset, batch_size=128)

# Modeling

In [9]:
class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()

        bert_base_config = AutoConfig.from_pretrained('bert-base-uncased')
        self.bert_base = AutoModel.from_pretrained('bert-base-uncased', config=bert_base_config)
        self.classifier = nn.Linear(bert_base_config.hidden_size, 5)

    def forward(self, input_ids, attention_mask):
        bert_base_output = self.bert_base(input_ids=input_ids, attention_mask=attention_mask)
        # get last hidden state
        # bert_base_last_hidden_state = bert_base_output[0]
        # or
        # roberta_base_last_hidden_state = roberta_base_output.hidden_states[-1]

        # pooler_output – Last layer hidden-state of the first token of the sequence 
        # (classification token) further processed by a Linear layer and a Tanh activation function
        pooler_output = bert_base_output[1] # [batch_size, hidden] 
        out = self.classifier(pooler_output)
        return out


gc.collect()

20

In [10]:
model = Model()
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criteron = nn.CrossEntropyLoss()
gc.collect()

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


218

In [11]:
total_loss = []
total_val_acc = []
for epoch in range(3):
    model.train()
    epoch_loss = []
    for input_ids, attention_mask, target in tqdm(train_dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)            
        target = target.to(device)
        
        optimizer.zero_grad()
        
        y_pred = model(input_ids, attention_mask)
        
        loss = criteron(y_pred, target)
        loss.backward()
        optimizer.step()
        
        epoch_loss.append(loss.item())

    input_ids = input_ids.to(torch.device('cpu'))
    attention_mask = attention_mask.to(torch.device('cpu'))            
    target = target.to(torch.device('cpu'))
    gc.collect()

    val_accs = []
    model.eval()
    for input_ids, attention_mask, target in tqdm(val_dataloader):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)        
        y_pred = model(input_ids, attention_mask)
        _, y_pred = torch.max(y_pred, -1)
        acc = torch.mean((torch.tensor(y_pred.cpu() == target.cpu(), dtype=torch.float)))
        val_accs.append(acc.cpu())

    el = sum(epoch_loss)/len(epoch_loss)
    total_loss.append(el)
    acc = np.array(val_accs).mean()
    total_val_acc.append(acc)
    print("Epoch:", epoch+1, "-- loss:", el, "-- acc:", acc)
    gc.collect()

100%|██████████| 976/976 [11:29<00:00,  1.41it/s]
100%|██████████| 244/244 [00:57<00:00,  4.23it/s]
  0%|          | 0/976 [00:00<?, ?it/s]

Epoch: 1 -- loss: 0.82638745718315 -- acc: 0.69059867


100%|██████████| 976/976 [11:30<00:00,  1.41it/s]
100%|██████████| 244/244 [00:57<00:00,  4.23it/s]
  0%|          | 0/976 [00:00<?, ?it/s]

Epoch: 2 -- loss: 0.6773299087205382 -- acc: 0.70433456


100%|██████████| 976/976 [11:31<00:00,  1.41it/s]
100%|██████████| 244/244 [00:57<00:00,  4.23it/s]


Epoch: 3 -- loss: 0.6130578249815057 -- acc: 0.69649243


# Prepare Submission

In [12]:
model.eval()
predictions = []
for text, attention_mask in tqdm(test_dataloader):
    text = text.to(device)
    attention_mask = attention_mask.to(device)
    preds = model(text, attention_mask)
    _, preds = torch.max(preds, -1)
    for pred in preds: predictions.append(pred.item())
print(len(predictions))

100%|██████████| 518/518 [02:03<00:00,  4.20it/s]

66292





In [13]:
submission = pd.DataFrame()
submission['PhraseId'] = test_df['PhraseId']
submission['Sentiment'] = predictions
submission.to_csv("submission.csv", index=False)
print("Sumbssion is ready!")

Sumbssion is ready!
