In [1]:
import random

import re

from tqdm import tqdm
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split

In [2]:
random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f87797cbcf0>

# Prepare data

In [3]:
!apt-get install unzip
!unzip ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip test.tsv
!unzip ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip train.tsv




unzip is already the newest version (6.0-21ubuntu1.1).
0 upgraded, 0 newly installed, 0 to remove and 12 not upgraded.
Archive:  ../input/sentiment-analysis-on-movie-reviews/test.tsv.zip
  inflating: test.tsv                
Archive:  ../input/sentiment-analysis-on-movie-reviews/train.tsv.zip
  inflating: train.tsv               


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sample_submission = pd.read_csv('../input/sentiment-analysis-on-movie-reviews/sampleSubmission.csv')

train = pd.read_csv('train.tsv', sep='\t')
print(train.shape)
print(train.info())
train.head()

(156060, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   PhraseId    156060 non-null  int64 
 1   SentenceId  156060 non-null  int64 
 2   Phrase      156060 non-null  object
 3   Sentiment   156060 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 4.8+ MB
None


Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test = pd.read_csv('test.tsv', sep='\t')
print(test.shape)
print(test.info())
test.head()

(66292, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66292 entries, 0 to 66291
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   PhraseId    66292 non-null  int64 
 1   SentenceId  66292 non-null  int64 
 2   Phrase      66292 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.5+ MB
None


Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


# Text Processing

In [6]:
# def clean_phrase(df):
#     phrases = []
#     for sent in tqdm(df['Phrase']):
#         text = re.sub("[^a-zA-Z]"," ",sent)
#         words = word_tokenize(text.lower())
#         lemmatizer = WordNetLemmatizer()
#         lem_word = [lemmatizer.lemmatize(word) for word in words]
#         phrases.append(lem_word)

#     return phrases

In [7]:
# train_phrase = clean_phrase(train)
# test_phrase = clean_phrase(test)
# print("Train:", train_phrase[:10])
# print("\n Test:", test_phrase[:10])

In [8]:
# max_len = 0
# for sample in train_phrase:
#     if len(sample) > max_len:
#         max_len = len(sample)

# for sample in test_phrase:
#     if len(sample) > max_len:
#         max_len = len(sample)
# max_len

In [9]:
vectorizer = CountVectorizer()
full_corpus = train['Phrase'].tolist() + test['Phrase'].tolist()
vectorizer.fit(full_corpus)

CountVectorizer()

In [10]:
class MovieReviewsDataset(Dataset):
    def __init__(self, vectorizer, df, max_len, test=False):
        self.max_len = max_len
        self.test = test
        text = df['Phrase'].tolist()
        if not self.test:
            sentiments = df['Sentiment'].tolist()

        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        tokenizer = vectorizer.build_analyzer()

        self.encode = lambda x: [self.token2idx[token] for token in tokenizer(x) if token in self.token2idx]
        self.pad = lambda x: x + (self.max_len - len(x)) * [self.token2idx['<PAD>']]
        
        texts = [self.encode(sample) [:self.max_len] for sample in text]
        if not self.test:
            texts, self.labels = zip(*[(text, label) for text, label in zip(texts, sentiments) if text])
        
        self.texts = [self.pad(text) for text in texts]
        
    def __getitem__(self, i):
        if not self.test:
            return self.texts[i], self.labels[i]
        return self.texts[i]
    
    def __len__(self):
        return len(self.texts)


In [11]:
max_len = 64
train_dataset = MovieReviewsDataset(vectorizer, train, max_len)
test_dataset = MovieReviewsDataset(vectorizer, test, max_len, test=True)
print(len(train_dataset))
print(len(test_dataset))
print(len(train_dataset.token2idx))
print(len(test_dataset.token2idx))

155907
66292
17730
17730


# Load Embeddings

In [12]:
glove_embeddings = {}
glove_file = open('../input/glove6b/glove.6B.300d.txt')
print('Started preparing glove embeddings')
for line in tqdm(glove_file):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    glove_embeddings[word] = coefs
glove_file.close()
print(f'Found {len(glove_embeddings)} word vectors.')

0it [00:00, ?it/s]

Started preparing glove embeddings


400000it [00:48, 8187.98it/s]

Found 400000 word vectors.





In [13]:
embedding_matrix = np.zeros((len(train_dataset.token2idx) + 1, 300))
for word, idx in train_dataset.token2idx.items():
    if word in glove_embeddings.keys():
        embedding_matrix[idx] = glove_embeddings[word]
print(embedding_matrix.shape)
print(len(train_dataset.token2idx))

(17731, 300)
17730


In [14]:
lengths = [int(len(train_dataset) * 0.8), int(len(train_dataset) * 0.2 + 1)]
train_dataset, valid_dataset = random_split(train_dataset, lengths=lengths, generator=torch.Generator().manual_seed(42))

In [15]:
def collate(batch):
    inputs = torch.LongTensor([item[0] for item in batch])
    target = torch.LongTensor([item[1] for item in batch])
    return inputs, target

def test_collate(batch):
    inputs = torch.LongTensor([item for item in batch])
    return inputs

train_dataloader = DataLoader(train_dataset, batch_size=128, collate_fn=collate, shuffle=True)
val_dataloader = DataLoader(valid_dataset, batch_size=128, collate_fn=collate)
test_dataloader = DataLoader(test_dataset, batch_size=128, collate_fn=test_collate)

In [16]:
for text, s in train_dataloader:
    print(text)
    print(text.shape)
    break
print(len(train_dataloader))
print(len(test_dataloader))

tensor([[16963,  4842,  1230,  ..., 17729, 17729, 17729],
        [15745,   705,   373,  ..., 17729, 17729, 17729],
        [14653, 15714,   909,  ..., 17729, 17729, 17729],
        ...,
        [ 2534,   726, 14478,  ..., 17729, 17729, 17729],
        [ 8408,     4,  9948,  ..., 17729, 17729, 17729],
        [ 4285,  2489,  9749,  ..., 17729, 17729, 17729]])
torch.Size([128, 64])
975
518


In [17]:
class Model(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, embedding_matrix):
        super(Model, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.embeddings.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embeddings.weight.requires_grad = False

        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_size,
            num_layers=2,
            bidirectional=True,
            batch_first=True
        )
        
        self.fc1 = nn.Linear(2*hidden_size, 5)

    def forward(self, x):
        batch_size = x.size(0)
        out = self.embeddings(x)
        out, _ = self.lstm(out)
        out = self.fc1(out[:, -1, :])
        return out

In [18]:
model = Model(
    vocab_size=embedding_matrix.shape[0],
    embedding_dim=embedding_matrix.shape[1],
    hidden_size=128,
    embedding_matrix=embedding_matrix,
)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criteron = nn.CrossEntropyLoss()

In [19]:
n_epochs = 10
total_loss = []
total_val_acc = []
for i in range(n_epochs):
    loss_per_epoch = []
    model.train()
    for text, label in tqdm(train_dataloader):
        optimizer.zero_grad()

        text = text.to(device)
        label = label.to(device)

        y_pred = model(text)
        loss = criteron(y_pred, label)
        loss.backward()
        optimizer.step()
        loss_per_epoch.append(loss.item())
    
    val_accs = []
    model.eval()
    for text, label in tqdm(val_dataloader):
        text, label = text.to(device), label.to(device)
        y_pred = model(text)
        _, y_pred = torch.max(y_pred, -1)
        acc = torch.mean((torch.tensor(y_pred.cpu() == label.cpu(), dtype=torch.float)))
        val_accs.append(acc.cpu())
    
    val_acc = np.array(val_accs).mean()
    loss = sum(loss_per_epoch)/len(loss_per_epoch)
    print("EPOCH: ", i+1, "-- loss:", loss, "-- val acc:", val_acc)
    total_loss.append(loss)
    total_val_acc.append(val_acc)

100%|██████████| 975/975 [00:17<00:00, 54.31it/s]
100%|██████████| 244/244 [00:01<00:00, 148.17it/s]
  1%|          | 6/975 [00:00<00:19, 50.75it/s]

EPOCH:  1 -- loss: 1.288803624373216 -- val acc: 0.512112


100%|██████████| 975/975 [00:17<00:00, 54.67it/s]
100%|██████████| 244/244 [00:01<00:00, 149.87it/s]
  1%|          | 5/975 [00:00<00:19, 49.58it/s]

EPOCH:  2 -- loss: 1.285836686232151 -- val acc: 0.512112


100%|██████████| 975/975 [00:17<00:00, 54.36it/s]
100%|██████████| 244/244 [00:01<00:00, 149.82it/s]
  1%|          | 5/975 [00:00<00:19, 49.33it/s]

EPOCH:  3 -- loss: 1.2853986379427789 -- val acc: 0.512112


100%|██████████| 975/975 [00:18<00:00, 54.10it/s]
100%|██████████| 244/244 [00:01<00:00, 148.44it/s]
  1%|          | 6/975 [00:00<00:19, 50.79it/s]

EPOCH:  4 -- loss: 1.0601789264189891 -- val acc: 0.6275401


100%|██████████| 975/975 [00:17<00:00, 54.32it/s]
100%|██████████| 244/244 [00:01<00:00, 148.63it/s]
  1%|          | 6/975 [00:00<00:19, 50.91it/s]

EPOCH:  5 -- loss: 0.8580988224958762 -- val acc: 0.64409363


100%|██████████| 975/975 [00:17<00:00, 54.18it/s]
100%|██████████| 244/244 [00:01<00:00, 148.95it/s]
  1%|          | 5/975 [00:00<00:19, 49.09it/s]

EPOCH:  6 -- loss: 0.7948868875625806 -- val acc: 0.6644114


100%|██████████| 975/975 [00:18<00:00, 53.48it/s]
100%|██████████| 244/244 [00:01<00:00, 146.87it/s]
  1%|          | 6/975 [00:00<00:17, 54.50it/s]

EPOCH:  7 -- loss: 0.7469737071257371 -- val acc: 0.663675


100%|██████████| 975/975 [00:17<00:00, 54.44it/s]
100%|██████████| 244/244 [00:01<00:00, 150.70it/s]
  1%|          | 5/975 [00:00<00:19, 49.65it/s]

EPOCH:  8 -- loss: 0.7090354856466635 -- val acc: 0.67110574


100%|██████████| 975/975 [00:17<00:00, 54.43it/s]
100%|██████████| 244/244 [00:01<00:00, 149.51it/s]
  1%|          | 5/975 [00:00<00:19, 49.28it/s]

EPOCH:  9 -- loss: 0.6806810829883967 -- val acc: 0.6681806


100%|██████████| 975/975 [00:17<00:00, 54.32it/s]
100%|██████████| 244/244 [00:01<00:00, 151.51it/s]

EPOCH:  10 -- loss: 0.65491398196954 -- val acc: 0.6710121





# Prepare submission

In [20]:
model.eval()
predictions = []
for text in tqdm(test_dataloader):
    text = text.to(device)
    preds = model(text)
    _, preds = torch.max(preds, -1)
    for pred in preds: predictions.append(pred.item())
print(len(predictions))

100%|██████████| 518/518 [00:04<00:00, 121.77it/s]

66292





In [21]:
submission = pd.DataFrame()
submission['PhraseId'] = test['PhraseId']
submission['Sentiment'] = predictions
submission.to_csv("submission.csv", index=False)
print("Sumbssion is ready!")

Sumbssion is ready!
