# IMDB Classification with PyTorch

Comment:

We already have a downloaded copy of the IMDB dataset, prepared for use with keras.utils.text_dataset_from_directory.

PyTorch does not provide an out-of-the-box equivalent of Keras' text_dataset_from_directory for creating a text dataset from a directory. However, similar functionality can be achieved by combining PyTorch's data loading utilities with some custom code (written with the help of ChatGPT).

In [None]:
import os
import pathlib
import numpy as np
import pandas as pd
import os.path as op
from tqdm import tqdm

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint


# torch is imported right here because the main `import torch` of this cell only
# appears further down; without it the next line raises NameError.
import torch

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(DEVICE)

from torch.utils.data import Dataset, DataLoader
from torchvision.datasets.folder import default_loader
from torchvision.datasets.utils import check_integrity
from torchvision import datasets
from torchvision.transforms import ToTensor

import torch
import torch.nn as nn

from torch.utils.data.dataset import random_split
import torchtext
from torchtext.datasets import IMDB
print(torch.__version__)
print(torchtext.__version__)

In [None]:
%load_ext watermark
%watermark -p torch,lightning,pandas --conda

In [None]:

class TextDatasetFromDir(Dataset):
    """Map-style dataset over files stored in one sub-folder per class.

    Intended as a stand-in for ``keras.utils.text_dataset_from_directory``:
    every immediate sub-directory of ``root`` is treated as one class and
    every file with a valid extension inside it as one sample.

    Args:
        root: Directory that contains one sub-directory per class.
        transform: Optional callable applied to each loaded sample.
        target_transform: Optional callable applied to each integer label.
        loader: Callable that receives a file path and returns the sample.
            The default is ``default_loader`` imported from
            ``torchvision.datasets.folder``; pass a text-reading callable
            when the files are plain text.
    """

    # File extensions accepted as samples; add more extensions if needed.
    VALID_EXTENSIONS = ('.txt',)

    def __init__(self, root, transform=None, target_transform=None, loader=default_loader):
        self.root = root
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

        # Class names are the sub-directory names, sorted so that the
        # name -> index mapping is reproducible.
        with os.scandir(self.root) as entries:
            self.classes = sorted(entry.name for entry in entries if entry.is_dir())
        self.class_to_idx = {cls_name: idx for idx, cls_name in enumerate(self.classes)}
        self.samples = self._load_samples()

    def _load_samples(self):
        """Return the list of ``(path, class_index)`` pairs found under ``root``.

        Files are visited in sorted order within each class because
        ``os.listdir`` returns entries in arbitrary order.
        """
        samples = []

        for class_name in self.classes:
            class_dir = os.path.join(self.root, class_name)
            if not os.path.isdir(class_dir):
                continue
            label = self.class_to_idx[class_name]
            filenames = sorted(os.listdir(class_dir))
            # tqdm takes its total from the actual listing instead of a
            # hard-coded number of files.
            for filename in tqdm(filenames, desc=str(class_name)):
                path = os.path.join(class_dir, filename)
                if os.path.isfile(path) and self._has_valid_extension(filename):
                    samples.append((path, label))
        return samples

    def _has_valid_extension(self, filename):
        """Return True when ``filename`` ends with one of ``VALID_EXTENSIONS``."""
        return filename.endswith(self.VALID_EXTENSIONS)

    def __getitem__(self, index):
        """Load sample ``index`` and return ``(sample, target)`` after the optional transforms."""
        path, target = self.samples[index]
        sample = self.loader(path)
        if self.transform is not None:
            sample = self.transform(sample)
        if self.target_transform is not None:
            target = self.target_transform(target)
        return sample, target

    def __len__(self):
        """Return the number of samples found at construction time."""
        return len(self.samples)


In [None]:

'''
The default loader of the class above is the one imported from torchvision, which expects PIL images,
so using it on text files leads to an error.

To use the already downloaded IMDB dataset as it was prepared for Keras, we need to override the loader
of the TextDatasetFromDir class we have created with the function below.

'''
def load_text(filename):
    """Return the whole content of the text file ``filename`` as a string.

    The file is decoded explicitly as UTF-8 instead of with the
    platform-dependent default encoding, so any non-ASCII characters are
    read the same way on every operating system.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded file content.
    """
    with open(filename, "r", encoding="utf-8") as f:
        return f.read()

#root_dir = '/path/to/dataset'  # Path to the root directory of your text dataset
# Single root of the local aclImdb copy: change only this path to point at your own dataset.
imdb_root = pathlib.Path('C:/Users/MRM/Desktop/Data_Analytics/Medium_and_PPB/Machine_Learning/Machine_Learning_Projects/NLP/Intro_to_deep_learning_for_text/aclImdb')
train_dir = imdb_root / 'train'
val_dir = imdb_root / 'val'
test_dir = imdb_root / 'test'

batch_size = 32

# load_text replaces the default loader so each file is read as text.
train_dataset = TextDatasetFromDir(train_dir, transform=None, loader=load_text)

val_dataset = TextDatasetFromDir(val_dir, transform=None, loader=load_text)

test_dataset = TextDatasetFromDir(test_dir, transform=None, loader=load_text)

# Further operations will be needed later, depending on the preprocessing approach (either BoW or embeddings)

In [None]:

# Preview the first samples: each dataset item is a (review text, label) pair,
# and the enumerate counter is only used to stop after index 4.
for sample_idx, (review, sentiment) in enumerate(train_dataset):
    print('Train Set')
    print(' data: ', sample_idx, ' Review: ', review, 'Sentiment:', sentiment)
    if sample_idx > 3:
        break
    

In [None]:
# Show the class of the dataset object created above
type(train_dataset)

In [None]:
# We can also assign them to dataframes if we want to operate over them with pandas or scikit-learn

# Each dataset item is a (review text, label) pair; collect both columns separately.
review_train, sent_train = [], []
for review, sentiment in train_dataset:
    review_train.append(review)
    sent_train.append(sentiment)

train_df = pd.DataFrame({'Review': review_train, 'Sentiment': sent_train})
train_df.head()

In [None]:

# Collect the validation reviews and labels into two parallel lists.
review_val, sent_val = [], []
for review, sentiment in val_dataset:
    review_val.append(review)
    sent_val.append(sentiment)

val_df = pd.DataFrame({'Review': review_val, 'Sentiment': sent_val})
val_df.head()

In [None]:
# Collect the test reviews and labels into two parallel lists.
review_test, sent_test = [], []
for review, sentiment in test_dataset:
    review_test.append(review)
    sent_test.append(sentiment)

test_df = pd.DataFrame({'Review': review_test, 'Sentiment': sent_test})
test_df.head()

In [None]:
# Number of samples found in each split (train, validation, test)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

In [None]:
#import portalocker
# For this to work, portalocker must be installed in your environment (miniforge or conda), e.g. with
# pip install 'portalocker>=2.0.0'
train_new = IMDB(split='train')
test_new = IMDB(split='test')

In [None]:
# Print only the first example, labelling its two fields in the
# (review, sentiment) order used for train_dataset above.
for example in train_new:
    print(' Review: ', example[0], ' Sentiment: ', example[1])
    break

As seen above, in the IMDB dataset loaded from torchtext the order of the columns has changed: the first element of each tuple is
the sentiment classification and the second element is the review text.
In order to use the 'collate_batch' function below from Sebastian Raschka, some changes need to be made.

In [None]:
# Tokenize (find unique words) and Counter frequencies of words

import re
from collections import Counter, OrderedDict

# Token -> frequency counter, filled from the training reviews further down in this cell
token_counts_train = Counter()

def tokenizer(text):
    """Split a raw review into lowercase word tokens followed by emoticon tokens.

    Steps:
      1. Remove HTML-like tags such as ``<br />``.
      2. Collect emoticons (for example ``:)`` or ``;-(``) from the lowercased text.
      3. Replace every run of non-word characters with a space and split into words.
      4. Append the emoticons, with the ``-`` "nose" removed, as separate tokens.

    Args:
        text: Raw text of one review.

    Returns:
        List of tokens: the words in order of appearance, then the emoticons.
    """
    # Standardize text: drop markup first so tag names do not become tokens.
    text = re.sub(r'<[^>]*>', '', text)
    lowered = text.lower()
    # NOTE(review): the search runs on lowercased text, so the uppercase
    # D / P alternatives of the pattern can never match - confirm whether
    # emoticons such as ":D" are meant to be kept.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', lowered)
    # Split the words into tokens
    words = re.sub(r'[\W]+', ' ', lowered).split()
    # Emoticons are appended as their own tokens; concatenating them to the
    # cleaned string would glue the first emoticon to the last word whenever
    # the text ends with a word character.
    return words + [emoticon.replace('-', '') for emoticon in emoticons]

# Changed from the original in the book
# Tokenize every review of train_dataset and accumulate the token frequencies
with tqdm(total=len(train_dataset)) as pbar:
    for line, label in train_dataset:
        tokens_train = tokenizer(line)
        token_counts_train.update(tokens_train)
        # Advance once per review; outside the loop the bar would tick only once.
        pbar.update()

print('Vocab-size:', len(token_counts_train))

In [None]:
# First ten tokens of the last review tokenized by the counting loop
tokens_train[0:10]

In [None]:
# Display the token -> frequency counter built from the training reviews
token_counts_train

The same counting that was done with the Counter can be done with the scikit-learn class CountVectorizer.
To do so, we convert the (review, label) tuples of train_dataset into a dictionary; its keys, which hold the review texts, are then passed to the CountVectorizer.

In [None]:
# Map each review text to its label. Dictionary keys are unique, so identical
# review texts end up as a single entry.
train_ds_dict = {review: label for review, label in train_dataset}
# Iterating a dict yields its keys, i.e. the review texts.
list(train_ds_dict)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Convert a collection of text documents to a matrix of token counts.
vectorizer = CountVectorizer(lowercase = True, max_features=20000, stop_words = 'english', tokenizer = tokenizer)

# If we set max_features to 70000 we get the same vocabulary length as with the Counter class used previously

# If we don't use stop_words we will get the same result as with the Counter class before
#vectorizer = CountVectorizer(lowercase = True, max_features=50000)

In [None]:
# Learn the vocabulary from the training review texts (the dictionary keys).
vectorizer.fit(list(train_ds_dict))

In [None]:
# The following attribute gives us the position of each word among the 20000 most frequent words
vectorizer.vocabulary_

In [None]:
# Number of entries in the vocabulary learned by the vectorizer
len(vectorizer.vocabulary_)

In [None]:
# Encoding each unique token into integers
# The factory is imported under an alias so the name `vocab` is used only for
# the resulting vocabulary object and never shadows the function it came from.
from torchtext.vocab import vocab as build_vocab

# Sort the tokens counted on the training set by descending frequency.
# (The counter is named token_counts_train; `token_counts` is not defined.)
sorted_by_freq_tuples = sorted(token_counts_train.items(), key=lambda x: x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = build_vocab(ordered_dict)

# Reserve index 0 for padding and index 1 for unknown tokens; index 1 is also
# returned for any token missing from the vocabulary.
vocab.insert_token("<pad>", 0)
vocab.insert_token("<unk>", 1)
vocab.set_default_index(1)

print([vocab[token] for token in ['this', 'is', 'an', 'example']])

In [None]:
## Step 3-A: define the functions for transformation
device = 'cpu'


def text_pipeline(x):
    """Encode a raw text as the list of vocabulary indices of its tokens."""
    return [vocab[token] for token in tokenizer(x)]


def label_pipeline(x):
    """Map a label to 1.0 (positive) or 0.0 (anything else).

    Accepts the string label 'pos' as well as the integer class index 1.
    TextDatasetFromDir yields integer indices, so comparing against 'pos'
    alone would turn every label into 0.0.
    NOTE(review): assumes the class folders sort so that the positive class
    gets index 1 (e.g. 'neg', 'pos') - confirm against class_to_idx.
    """
    return 1. if x in ('pos', 1) else 0.


## Step 3-B: wrap the encode and transformation function
def collate_batch(batch):
    """Turn a list of (text, label) samples into padded tensors.

    Args:
        batch: Iterable of (text, label) pairs as produced by the dataset.

    Returns:
        Tuple ``(padded_texts, labels, lengths)``, all moved to ``device``:
        the encoded texts zero-padded to the longest one (batch first), the
        labels produced by ``label_pipeline`` and the unpadded length of
        every encoded text.
    """
    encoded_texts = [
        torch.tensor(text_pipeline(_text), dtype=torch.int64)
        for _text, _label in batch
    ]
    labels = torch.tensor([label_pipeline(_label) for _text, _label in batch])
    lengths = torch.tensor([encoded.size(0) for encoded in encoded_texts])
    padded_texts = nn.utils.rnn.pad_sequence(encoded_texts, batch_first=True)
    return padded_texts.to(device), labels.to(device), lengths.to(device)

In [None]:
# The samples are stored class by class, so the training loader must shuffle:
# without it every batch would contain reviews of a single class.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)

# Evaluation loaders keep a fixed order.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)


In [None]:

# Show the first batch only. collate_batch returns (padded texts, labels,
# lengths), so each tensor is printed under the caption that matches it.
for padded_reviews, sentiments, review_lengths in train_dataloader:
    print('Train Set')
    print('Review Coded: ', padded_reviews)
    print('Sentiment Coded: ', sentiments)
    break
    

In [None]:
# Pull a single batch from the training loader and unpack the three tensors returned by collate_batch
text_batch, label_batch, length_batch = next(iter(train_dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
# Shape of the padded text batch
print(text_batch.shape)


## Embedding Approach

## Bag of Words Approach

In [None]:
# Read every split once, keeping all (review, label) pairs in dataset order.
train_pairs = list(train_dataset)
val_pairs = list(val_dataset)
test_pairs = list(test_dataset)

# The dictionaries are kept under their original names. Their keys are unique,
# so repeated review texts collapse into one entry.
train_ds_dict = dict(train_pairs)
val_ds_dict = dict(val_pairs)
test_ds_dict = dict(test_pairs)

# Vectorize the full ordered lists of texts rather than the dictionary keys, so
# row i of each matrix corresponds to sample i of the dataset and its label.
X_train = vectorizer.transform([review for review, _ in train_pairs])
X_val = vectorizer.transform([review for review, _ in val_pairs])
X_test = vectorizer.transform([review for review, _ in test_pairs])

In [None]:
# Dimensions of the count matrix produced by the vectorizer for the training texts
X_train.shape

In [None]:
import numpy as np
# Dense count vector of the first training row.
X_train[0].toarray()[0]

In [None]:

# Histogram of the count values in the first training row
# (position k holds how many vocabulary entries appear k times).
np.bincount(X_train[0].toarray()[0])

In [None]:
# The dense array has the same shape as the sparse matrix, so read it directly
# instead of materialising the whole dense matrix in memory.
X_train.shape