In [None]:
import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call in the download loop.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<URL-encoded download URL>" entries.
# NOTE(review): this is a pre-signed URL (X-Goog-Date=20240930T085612Z, X-Goog-Expires=259200),
# so it only works for a limited window; the recorded output of this cell already reports
# "Failed to load (likely expired)". Signed links behave like credentials — avoid committing them.
# NOTE(review): the entry is named 'digit-recognizer', while the visible cells load IMDB through
# torchtext — confirm this download is still needed.
DATA_SOURCE_MAPPING = 'digit-recognizer:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3004%2F861823%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240930%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240930T085612Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2dc596b6edabb2a3f1b986abf6c3e2025ed4223ead542212c9594fe9046e82caf4293719ea450eacb830a85b29cf6e5904b2e8b1d1e791add247fe77bba80c91a7b0a9e0f0240349cc6554a0f32418f2bfd90792fafad1b255e44897c000bd13179059b9d867f38f9f49c4e87d5248606671e7d00a6dbfc669a8d180fe5988b1e5954460c92e20ea239c93d6bd32bd19e23f69cce0d5b0942687543737985ad2d64e7c8673a50db5ad90dd538db7dbb0e5b05f7afd1f77fb44e942cccd9f86ec6fb544d287bb895eac3485f428e281ead74a199234cf433f8e1bf5865933100fd53cdef11df1506cd966c9f7812359c7c60a01645c67179d61665bb606f5def8'

# Absolute directories that the setup code creates and that '../input' and '../working' are symlinked to.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced by any of the visible code.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<URL-encoded url>" entry of DATA_SOURCE_MAPPING
# into KAGGLE_INPUT_PATH/<directory> and unpack it (zip or tar). Failures are
# reported and the loop moves on to the next entry.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    if not data_source_mapping.strip():
        # An empty mapping (or a trailing comma) has nothing to download.
        continue
    # Split only on the first ':' so the URL part can never break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # Content-Length may be missing or malformed; in that case the
            # progress bar stays empty instead of raising on int(None).
            total_bytes = int(total_length) if total_length and total_length.isdigit() else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_bytes)) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile reopens the file by name, so buffered bytes must reach
            # the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: reusing `tarfile` here would
                # replace the module and break every later tar entry.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Failed to load (likely expired) https://storage.googleapis.com/kaggle-competitions-data/kaggle-v2/3004/861823/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20240930%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20240930T085612Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=2dc596b6edabb2a3f1b986abf6c3e2025ed4223ead542212c9594fe9046e82caf4293719ea450eacb830a85b29cf6e5904b2e8b1d1e791add247fe77bba80c91a7b0a9e0f0240349cc6554a0f32418f2bfd90792fafad1b255e44897c000bd13179059b9d867f38f9f49c4e87d5248606671e7d00a6dbfc669a8d180fe5988b1e5954460c92e20ea239c93d6bd32bd19e23f69cce0d5b0942687543737985ad2d64e7c8673a50db5ad90dd538db7dbb0e5b05f7afd1f77fb44e942cccd9f86ec6fb544d287bb895eac3485f428e281ead74a199234cf433f8e1bf5865933100fd53cdef11df1506cd966c9f7812359c7c60a01645c67179d61665bb606f5def8 to path /kaggle/input/digit-recognizer
Data source import complete.


1. Sentiment Analysis on the IMDB Dataset
2. Scope for improvement:
    2.1 Better network architecture.
    2.2 Measures to address overfitting.
    2.3 Better handling of variable-length sequences.
    2.4 Better training.

In [None]:
!pip install torch==2.0.1 torchtext==0.15.2

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl.metadata (24 kB)
Collecting torchtext==0.15.2
  Downloading torchtext-0.15.2-cp310-cp310-manylinux1_x86_64.whl.metadata (7.4 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_nvrtc_cu11-11.7.99-2-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu11==11.7.99 (from torch==2.0.1)
  Downloading nvidia_cuda_runtime_cu11-11.7.99-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-cupti-cu11==11.7.101 (from torch==2.0.1)
  Downloading nvidia_cuda_cupti_cu11-11.7.101-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu11==8.5.0.96 (from torch==2.0.1)
  Downloading nvidia_cudnn_cu11-8.5.0.96-2-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu11==11.10.3.66 (from torch==2.0.1)
  Downloading nvidia_cublas_cu11-11.10.3.66-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Co

In [None]:
!pip install 'portalocker>=2.0.0'
#restart kernel after installation

Collecting portalocker>=2.0.0
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.10.1


In [None]:
from functools import partial
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch import optim

from torchtext import datasets
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [None]:
# Use CUDA device 0 when a GPU is visible; otherwise fall back to the CPU.
_device_type = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_type, 0)

In [None]:
# IMDB training split as a datapipe (ShardingFilterIterDataPipe). Each item is
# a (label, text) tuple: label is 1 or 2, text is the raw movie review.
train_data = datasets.IMDB(split="train")

In [None]:
# IMDB test split, used as the evaluation set.
eval_data = datasets.IMDB(split="test")

In [None]:
# Convert the training datapipe into an indexable dataset (_MapStyleDataset).
mapped_train_data = to_map_style_dataset(train_data)

In [None]:
# Sanity check: look at the container type and at the first (label, review) pair.
print("Type of Mapped Train Data:", type(mapped_train_data))
print("0th data point", mapped_train_data[0])
print("Type of 0th data point", type(mapped_train_data[0]))

label, review = mapped_train_data[0]
print("Label=", label)
print("Review=", review)
print("Type of Label=", type(label))
print("Type of Review=", type(review))

# Iterating yields the same (label, review) pairs; show only the first one.
print("iterating over 1 pair:")
label, review = next(iter(mapped_train_data))
print(label)
print(review)

Type of Mapped Train Data: <class 'torchtext.data.functional.to_map_style_dataset.<locals>._MapStyleDataset'>
0th data point (1, 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CUR

In [None]:
# Indexable version of the evaluation split, mirroring mapped_train_data.
mapped_eval_data = to_map_style_dataset(eval_data)

In [None]:
# Tokenizer shared by the vocabulary builder and by text_pipeline.
tokenizer = get_tokenizer("basic_english", language="en")

In [None]:
# Vocabulary construction: tokens seen fewer than min_word_freq times are dropped.
min_word_freq = 2


def build_vocab(mapped_train_data, tokenizer):
    """Build the token vocabulary from the training reviews.

    Args:
        mapped_train_data: iterable of (label, review) pairs; labels are ignored.
        tokenizer: callable that splits one review string into tokens.

    Returns:
        The vocabulary produced by build_vocab_from_iterator, with the specials
        "<unk>", "<eos>", "<pad>" and with "<unk>" as the default index for
        tokens that are not in the vocabulary.
    """
    review_texts = []
    for _label, text in mapped_train_data:
        review_texts.append(text)

    token_stream = (tokenizer(text) for text in review_texts)
    vocab = build_vocab_from_iterator(
        token_stream,
        specials=["<unk>", "<eos>", "<pad>"],
        min_freq=min_word_freq,
    )
    unk_index = vocab["<unk>"]
    vocab.set_default_index(unk_index)
    return vocab

In [None]:
# Vocabulary is built from the training split only.
vocab = build_vocab(mapped_train_data, tokenizer=tokenizer)

In [None]:
# Number of entries in the vocabulary (specials included); used as the
# embedding table size of the model.
vocab_size = len(vocab)
print(vocab_size)

51719


In [None]:
# Data / model hyper-parameters.
max_seq_len=256   # review tokens kept per sample before the <eos> id is appended
max_norm=1        # NOTE(review): not referenced by the visible code
embed_dim=300     # NOTE(review): not referenced by the visible code (the model cell sets embed_size)
batch_size=16     # samples per DataLoader batch


def text_pipeline(x):
    """Tokenize a string and return the list of vocabulary ids of its tokens."""
    return vocab(tokenizer(x))

In [None]:
# Quick check of the pipeline: a string goes in, a list of ids comes out.
sample = text_pipeline("Hello World")
print(sample, type(sample), sep="\n")

[4646, 187]
<class 'list'>


In [None]:
def collate_data(batch, text_pipeline, max_len=None, eos_id=1, pad_id=2):
    """Turn a batch of (label, review) pairs into padded id and label tensors.

    Each review is converted to ids, optionally truncated, terminated with
    ``eos_id`` and right-padded with ``pad_id`` so all rows have equal width.

    Args:
        batch: iterable of (label, review_text) pairs.
        text_pipeline: callable mapping a review string to a list of token ids.
        max_len: maximum number of review ids kept per sample (the end-of-sequence
            id is appended after truncation). ``None`` falls back to the
            module-level ``max_seq_len``; ``0`` disables truncation.
        eos_id: id appended to every sequence (1 is "<eos>" in this notebook's vocab).
        pad_id: id used for right padding (2 is "<pad>" in this notebook's vocab).

    Returns:
        (reviews, targets): a long tensor of shape (len(batch), width) and a long
        tensor of shape (len(batch),). ``width`` is ``max_len + 1`` when
        truncation is active, otherwise the longest sequence in the batch.
    """
    limit = max_seq_len if max_len is None else max_len

    sequences, targets = [], []
    for label, review in batch:
        token_ids = text_pipeline(review)
        if limit:
            token_ids = token_ids[:limit]
        # Build a new list so the pipeline's own output is never mutated.
        sequences.append(list(token_ids) + [eos_id])
        targets.append(label)

    # Fixed width when truncating (the former hard-coded 257 for max_seq_len=256);
    # without truncation, pad to the longest row so the batch is never ragged.
    if limit:
        width = limit + 1
    else:
        width = max((len(seq) for seq in sequences), default=0)

    reviews = [seq + [pad_id] * (width - len(seq)) for seq in sequences]

    reviews = torch.tensor(reviews, dtype=torch.long)
    targets = torch.tensor(targets, dtype=torch.long)

    return reviews, targets

In [None]:
# Both loaders share one collate function with the text pipeline bound in.
_collate_batch = partial(collate_data, text_pipeline=text_pipeline)

# Training batches are reshuffled every epoch.
traindl = DataLoader(
        mapped_train_data,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=_collate_batch
    )

# Evaluation does not benefit from shuffling; a fixed order keeps the
# reported evaluation metrics reproducible from run to run.
evaldl= DataLoader(
        mapped_eval_data,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=_collate_batch
    )

In [None]:
# collate_data returns (reviews, targets) in that order, so unpack accordingly:
# the first shape printed is the padded id matrix, the second the label vector.
for reviews, labels in traindl:
    print(reviews.shape, labels.shape)
    break

torch.Size([16, 257]) torch.Size([16])


In [None]:
# Look up the ids of the special tokens; collate_data appends 1 as the
# end-of-sequence id and pads with 2.
special_tokens = ["<unk>", "<eos>", "<pad>"]
print(vocab(special_tokens))

[0, 1, 2]


In [None]:
#create the architecture of a neural network
class SentiNN(nn.Module):
    """Embedding -> dropout -> single-layer GRU -> linear classifier.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        embed_size: dimensionality of the token embeddings.
        hidden_size: size of the GRU hidden state.
        num_classes: number of output logits (default 2).
        dropout: dropout probability applied to the embeddings (default 0.2).
        padding_idx: optional id whose embedding is kept at zero and receives
            no gradient; ``None`` (default) treats every id as a regular token.
    """

    def __init__(self, input_size, embed_size, hidden_size,
                 num_classes=2, dropout=0.2, padding_idx=None):
        super().__init__()
        # Attribute names are kept stable so existing checkpoints still load.
        self.e = nn.Embedding(input_size, embed_size, padding_idx=padding_idx)
        self.dropout = nn.Dropout(dropout)
        self.rnn = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.out = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Return class logits of shape (batch_size, num_classes).

        Args:
            x: long tensor of token ids, shape (batch_size, seq_len).
        """
        embedded = self.dropout(self.e(x))
        _, hidden = self.rnn(embedded)  # hidden is 1 x batch_size x hidden_size
        # Take the (only) layer's final state without modifying `hidden` in place.
        last_hidden = hidden[-1]        # batch_size x hidden_size
        return self.out(last_hidden)

In [None]:
embed_size=128
hidden_size=256

#create instance of a neural network
sentinn=SentiNN(vocab_size,embed_size,hidden_size).to(device)

#specify loss, learning rate, and an optimizer
# The targets handed to the loss are the class labels 0/1 (the dataset labels
# minus one), not token ids, so no class index is ignored: ignore_index=2 was
# the <pad> token id and would only ever hide samples of a future third class.
loss_fn=nn.CrossEntropyLoss().to(device)
lr=0.001
opt=optim.Adam(params=sentinn.parameters(), lr=lr)

In [None]:
#training function
def train_one_epoch():
    """Train the global model `sentinn` for one pass over `traindl`.

    Uses the module-level `loss_fn`, `opt` and `device`. Dataset labels are
    shifted by -1 before being handed to the loss.

    Returns:
        (epoch_loss, epoch_acc): mean per-sample loss and accuracy in percent,
        both rounded to 4 decimals. Every sample has the same weight, so a
        shorter final batch no longer skews the loss. Both values are NaN when
        the loader yields no batches.
    """
    sentinn.train()
    loss_sum = 0.0    # sum of per-sample losses
    num_correct = 0
    num_seen = 0

    for reviews_ids, sentiments in traindl:
        reviews_ids = reviews_ids.to(device)
        sentiments = sentiments.to(device) - 1
        logits = sentinn(reviews_ids)
        loss = loss_fn(logits, sentiments)

        opt.zero_grad()
        loss.backward()
        opt.step()

        n_samples = reviews_ids.shape[0]
        # assumes loss_fn returns the batch mean (default reduction), so
        # mean * n_samples recovers the batch's summed loss
        loss_sum += loss.item() * n_samples
        num_correct += (torch.argmax(logits, dim=1) == sentiments).sum().item()
        num_seen += n_samples

    if num_seen == 0:
        return float("nan"), float("nan")

    epoch_loss = round(loss_sum / num_seen, 4)
    epoch_acc = round(num_correct / num_seen * 100, 4)
    return epoch_loss, epoch_acc

In [None]:
#eval function (works for any batch size)
def eval_one_epoch():
    """Evaluate the global model `sentinn` on one pass over `evaldl`.

    Uses the module-level `loss_fn` and `device`. Dataset labels are shifted
    by -1 before being handed to the loss. No parameters are updated and no
    autograd graph is built.

    Returns:
        (epoch_loss, epoch_acc): mean per-sample loss and accuracy in percent,
        both rounded to 4 decimals. Both values are NaN when the loader yields
        no batches.
    """
    sentinn.eval()
    loss_sum = 0.0    # sum of per-sample losses
    num_correct = 0
    num_seen = 0

    # Gradients are not needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for reviews_ids, sentiments in evaldl:
            reviews_ids = reviews_ids.to(device)
            sentiments = sentiments.to(device) - 1
            logits = sentinn(reviews_ids)
            loss = loss_fn(logits, sentiments)

            n_samples = reviews_ids.shape[0]
            # assumes loss_fn returns the batch mean (default reduction)
            loss_sum += loss.item() * n_samples
            num_correct += (torch.argmax(logits, dim=1) == sentiments).sum().item()
            num_seen += n_samples

    if num_seen == 0:
        return float("nan"), float("nan")

    epoch_loss = round(loss_sum / num_seen, 4)
    epoch_acc = round(num_correct / num_seen * 100, 4)
    return epoch_loss, epoch_acc

In [None]:
n_epochs = 1

# Every epoch: one training pass, then one evaluation pass, each followed by
# a one-line report of its loss and accuracy.
for e in range(n_epochs):
    print(f"Epoch={e + 1}", end=", ")

    epoch_loss, epoch_acc = train_one_epoch()
    print("Train Loss=", epoch_loss, "Train Acc", epoch_acc)

    epoch_loss, epoch_acc = eval_one_epoch()
    print("Eval Loss=", epoch_loss, "Eval Acc", epoch_acc)

Epoch=1, Train Loss= 0.5325 Train Acc 69.952
Eval Loss= 0.3468 Eval Acc 84.944
