# Exercise 2

In [2]:
import os
import re
import glob
import string
import numpy as np
import os

In [3]:
# prompt: load and extract files from https://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz

import tarfile

!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
  tar.extractall()

# Check if the directory was created
assert os.path.exists('aclImdb')
print('aclImdb_v1.tar.gz extracted successfully.')

--2025-05-14 21:28:10--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.9’


2025-05-14 21:28:23 (6.72 MB/s) - ‘aclImdb_v1.tar.gz.9’ saved [84125825/84125825]

aclImdb_v1.tar.gz extracted successfully.


# Raw Reviews

## Load data

Like in Exercise 1, read all the training data, including the reviews and the scores associated to each one. You can use the following helper functions to read the data:

1. Be sure to have downloaded the dataset from the link provided in the exercise and have read the README file
1. Be sure to have copied the dataset next to this Jupyter (.ipynb file)
1. Be sure to have installed:
    * Pytorch
    * NLTK (This library is going to be used only for the stemming process, no more)
    * Sklearn (Only for building a Random Forest)

In [4]:
def sorter(item):
    """ Function tha gets only the first number of the name of the file and organizes the files base on that"""

    return int(os.path.basename(item).split('_')[0])

def read_raw_text(path_data):
    """ Function for reading the raw data in the .txt files.

    Parameters
    ----------
    path_data: str
        path of the folder that contains the data that is going to be used. (should be test or test)

    Returns
    ---------
    data,scores: array_like
        Data arrays, X is an array of shape [#documents of the dataset, #words in the vocabulary], y is an array of shape [#documents,]
    """

    data = []
    scores = []

    sentiments = ['pos', 'neg']
    for sentiment in sentiments:
        path_vocab_pos = os.path.join(".", "aclImdb", path_data, sentiment, "*.txt")

        for filename in sorted(glob.glob(path_vocab_pos), key=sorter):

            with open(filename) as f:

                lines = f.read()

                data.append(lines)
                scores.append(int(os.path.basename(filename).split('_')[1].strip('.txt')))
    return data, scores


def read_vocab():
    """ Function for reading the vocabulary (.vocab file).

    Parameters
    ----------
    None

    Returns
    ---------
    vocab: list
        list with the values different tokens that compose the vocabulary ......
    """

    path_vocab = os.path.join(".", "aclImdb", "imdb.vocab")

    with open(path_vocab, encoding='utf-8') as f:
        lines = f.read()

    lines = lines.split('\n')

    vocab = []
    for line in lines:
        vocab.append(line)

    return vocab

In [5]:
# Read the vocabulary
vocabulary = read_vocab()
# Read reviews and y
data, scores = read_raw_text('train')
print(scores)
data_test, scores_test = read_raw_text('test')


[9, 7, 9, 10, 8, 10, 10, 7, 7, 7, 9, 9, 9, 7, 10, 7, 7, 9, 7, 10, 9, 7, 8, 7, 8, 7, 9, 10, 10, 10, 7, 8, 10, 7, 8, 8, 10, 9, 10, 9, 8, 9, 10, 10, 8, 10, 9, 8, 7, 10, 10, 10, 10, 10, 10, 9, 10, 10, 9, 7, 8, 10, 10, 10, 7, 10, 8, 10, 10, 10, 9, 10, 7, 7, 8, 8, 7, 7, 10, 10, 9, 10, 8, 10, 10, 10, 10, 10, 9, 7, 7, 8, 9, 10, 10, 10, 10, 9, 10, 8, 7, 8, 10, 7, 10, 7, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 10, 8, 10, 9, 10, 10, 7, 10, 7, 7, 9, 9, 10, 9, 10, 10, 7, 10, 7, 7, 10, 8, 9, 8, 7, 8, 10, 10, 9, 9, 10, 8, 10, 9, 10, 8, 10, 8, 9, 10, 10, 9, 8, 8, 10, 10, 7, 7, 7, 9, 8, 10, 8, 10, 7, 7, 7, 7, 9, 7, 8, 9, 10, 10, 8, 8, 9, 8, 8, 7, 9, 10, 9, 9, 7, 8, 8, 9, 9, 8, 10, 10, 10, 10, 7, 10, 8, 10, 8, 9, 8, 10, 9, 9, 9, 7, 8, 8, 8, 9, 8, 10, 9, 10, 9, 10, 9, 10, 10, 7, 10, 9, 10, 10, 7, 10, 10, 9, 10, 10, 7, 10, 8, 8, 10, 10, 9, 7, 10, 10, 10, 7, 10, 9, 7, 8, 10, 9, 7, 7, 8, 7, 8, 8, 9, 7, 7, 7, 7, 8, 8, 10, 10, 10, 9, 7, 10, 10, 8, 9, 9, 8, 10, 9, 8, 10, 10, 10, 9, 10, 10, 9, 10, 10

## Task 1: Pipeline for Cleaning the Raw Reviews

> **Hint**: You can use the functions you have already implemented in the previous exercise.

In [6]:
from nltk.corpus import stopwords
from collections import Counter
import math
import nltk
nltk.download('stopwords')

def tokenize(data):
    """ Function for tokenizing the words read from the review files.

    Parameters
    ----------
    words (essentially data read from files)

    Returns
    ---------
    Returns
    ---------
    tokens: list (after after tokenizing on characters that are not letters, numbers, apostrophs nor hyphens in data)
    """

    tokens = []
    for review in data:
        words = re.split(r"[^a-zA-Z0-9'-]+", review)
        tokens.append(words)
    return tokens

def convert_to_lower(data):
    """ Function for tokenizing the words read from the review files.

    Parameters
    ----------
    words (essentially data read from files)

    Returns
    ---------
    Returns
    ---------
    tokens: list (after processing as appears in the function definition)
    """
    lowercase_data = [[word.lower() for word in docu] for docu in data]
    return lowercase_data

def remove_punctuation(data):
    """ Function for tokenizing the words read from the review files.

    Parameters
    ----------
    words (essentially data read from files)

    Returns
    ---------
    tokens: list (after processing as appears in the function definition)
    """
    cleaned_list = [[item.translate(str.maketrans('', '', string.punctuation)) for item in words] for words in data]
    return cleaned_list

def filter_terms(tokens, top_perc):
    """ Function for tokenizing the words read from the review files.

    Parameters
    ----------
    words (essentially data read from files)

    Returns
    ---------
    tokens: list (after processing as appears in the function definition)
    """
    word_counts = {}
    total_words = 0

    for docu in tokens:
        for word in docu:
            total_words += 1
            word_counts[word] = word_counts.get(word, 0) + 1

    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    num_top_words = int(len(sorted_words) * top_perc)
    high_freq_words = set(word for word, count in sorted_words[:num_top_words])

    filtered_tokens = [[word for word in docu if word not in high_freq_words]for docu in tokens]

    return filtered_tokens

def num_to_token(data, special_token="<NUM>"):
    """ Function for tokenizing the words read from the review files.

    Parameters
    ----------
    words (essentially data read from files)

    Returns
    ---------
    tokens: list (after processing as appears in the function definition)
    """

    cleaned_tokens = [[re.sub(r'\d', special_token, word) for word in words if "_" not in word] for words in data]
    return cleaned_tokens

def remove_stopwords(data):
    """ Function for tokenizing the words read from the review files.

    Parameters
    ----------
    words (essentially data read from files)

    Returns
    ---------
    tokens: list (after processing as appears in the function definition)
    """
    stop_words = set(stopwords.words('english'))
    sw = [[word for word in words if word.lower() not in stop_words] for words in data]
    return sw

print("first few elements:", data[:2])

train_tokens = tokenize(data)
print (f'tokens: {train_tokens[:2]} ')


testlist= [["one", "TWO", "Three"],["ONE", "two", "THree"],["one", "TWO", "Three"]]
train_tokens_LC = convert_to_lower(train_tokens)
print (f'tokens after coverting to lower: {train_tokens_LC[:2]} ')
print(f'tokens size: {sum(len(sublist) for sublist in train_tokens_LC)}')

train_tokens_LC = remove_punctuation(train_tokens_LC)
print (f'tokens after removing punctuation: {train_tokens_LC[:2]} ')
print(f'tokens size: {sum(len(sublist) for sublist in train_tokens_LC)}')

filter_perc = 0.1
train_tokens_LC = filter_terms(train_tokens_LC, filter_perc)
print (f'tokens after filtering top {filter_perc*100}% tokens: {train_tokens_LC[:2]} ')
print(f'tokens size: {sum(len(sublist) for sublist in train_tokens_LC)}')


train_tokens_LC = num_to_token(train_tokens_LC)
print (f'tokens after removing/replacing numbers: {train_tokens_LC[:2]} ')
print(f'tokens size: {sum(len(sublist) for sublist in train_tokens_LC)}')


train_tokens_LC = remove_stopwords(train_tokens_LC)
print (f'tokens after removing stopword: {train_tokens_LC[:2]} ')
print(f'tokens size: {sum(len(sublist) for sublist in train_tokens_LC)}')



test_tokens= tokenize(data_test)
test_tokens = convert_to_lower(test_tokens)
test_tokens = remove_punctuation(test_tokens)
test_tokens = filter_terms(test_tokens,0.01)
test_tokens = num_to_token(test_tokens)
test_tokens = remove_stopwords(test_tokens)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


first few elements: ['Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!', "If you like adult comedy cartoons, like South Park, then this is nearly a similar format about the small adventures of three teenage girls at Bromwell High. Keisha,

Padding.

## Task 2: Representations of Data

### BOW
Each review is represented by a vector. It has the length of the vocabulary and for each word in the review, the vector contains the number of appearances on the review

In [7]:
import torch

def create_bow(tokenized_words):

    indices = []
    values = []
    word_to_index_mapping = {word: idx for idx, word in enumerate(vocabulary)}

    for i, review in enumerate(tokenized_words):
        for word in review:
            if word in word_to_index_mapping:
                word_idx = word_to_index_mapping[word]
                indices.append([i, word_idx])
                values.append(1)
    indices = torch.tensor(indices, dtype=torch.long).t()
    values = torch.tensor(values, dtype=torch.float32)
    bow_matrix = torch.sparse_coo_tensor(indices, values,
                                         (len(tokenized_words), len(vocabulary)),
                                         dtype=torch.float32)
    print("BoW shape:", len(bow_matrix), "x", len(word_to_index_mapping))
    print("BoW vector (first document):", bow_matrix[0])
    return bow_matrix

train_bow = create_bow(train_tokens_LC)
test_bow = create_bow(test_tokens)


BoW shape: 25000 x 89527
BoW vector (first document): tensor(indices=tensor([[22409, 22409, 16857, 24551, 10480, 20604, 47304, 14239,
                        22409, 22409]]),
       values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
       size=(89527,), nnz=10, layout=torch.sparse_coo)
BoW shape: 25000 x 89527
BoW vector (first document): tensor(indices=tensor([[26903,  1868,   937,  5447, 12803, 15612, 15612,  2909,
                        14176,  1773,  9702,   947,  7282, 12605,  1867,  3009,
                         1383,  2956,  2984,  2081,  1619,  1615, 13914,  1615,
                         2300,  2653,  2510,  1409,  1818]]),
       values=tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
                      1.]),
       size=(89527,), nnz=29, layout=torch.sparse_coo)


### Vector 1|0

Representation of the reviews with a binary representation of the tokens: 1 = presence and 0 = absence

In [8]:
def create_binary_vector(words):
    """Function for tokenizing the words read from the review files.

    Parameters
    ----------
    words (essentially data read from files)

    Returns
    ---------
    tokens: list
        ds"""

    vocab_dict = {word: idx for idx, word in enumerate(vocabulary)}

    binary_matrix = np.zeros((len(words), len(vocabulary)), dtype=int)

    for i, review in enumerate(words):
        for word in set(review):
            if word in vocab_dict:
                binary_matrix[i, vocab_dict[word]] = 1

    print("BoW shape:", len(binary_matrix))
    print("BoW vector (first document):", binary_matrix[0])
    return binary_matrix

train_bvector = create_binary_vector(data)
test_bvector = create_binary_vector(data_test)

BoW shape: 25000
BoW vector (first document): [0 0 1 ... 0 0 0]
BoW shape: 25000
BoW vector (first document): [0 0 1 ... 0 0 0]


In [9]:
# change scores to positive or negative
print(scores)
def categorize(scores ):
  return [1 if score >= 5 else 0 for score in scores]
train_scores = categorize(scores)
test_scores = categorize(scores_test )

print(train_scores)


[9, 7, 9, 10, 8, 10, 10, 7, 7, 7, 9, 9, 9, 7, 10, 7, 7, 9, 7, 10, 9, 7, 8, 7, 8, 7, 9, 10, 10, 10, 7, 8, 10, 7, 8, 8, 10, 9, 10, 9, 8, 9, 10, 10, 8, 10, 9, 8, 7, 10, 10, 10, 10, 10, 10, 9, 10, 10, 9, 7, 8, 10, 10, 10, 7, 10, 8, 10, 10, 10, 9, 10, 7, 7, 8, 8, 7, 7, 10, 10, 9, 10, 8, 10, 10, 10, 10, 10, 9, 7, 7, 8, 9, 10, 10, 10, 10, 9, 10, 8, 7, 8, 10, 7, 10, 7, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 8, 10, 8, 10, 9, 10, 10, 7, 10, 7, 7, 9, 9, 10, 9, 10, 10, 7, 10, 7, 7, 10, 8, 9, 8, 7, 8, 10, 10, 9, 9, 10, 8, 10, 9, 10, 8, 10, 8, 9, 10, 10, 9, 8, 8, 10, 10, 7, 7, 7, 9, 8, 10, 8, 10, 7, 7, 7, 7, 9, 7, 8, 9, 10, 10, 8, 8, 9, 8, 8, 7, 9, 10, 9, 9, 7, 8, 8, 9, 9, 8, 10, 10, 10, 10, 7, 10, 8, 10, 8, 9, 8, 10, 9, 9, 9, 7, 8, 8, 8, 9, 8, 10, 9, 10, 9, 10, 9, 10, 10, 7, 10, 9, 10, 10, 7, 10, 10, 9, 10, 10, 7, 10, 8, 8, 10, 10, 9, 7, 10, 10, 10, 7, 10, 9, 7, 8, 10, 9, 7, 7, 8, 7, 8, 8, 9, 7, 7, 7, 7, 8, 8, 10, 10, 10, 9, 7, 10, 10, 8, 9, 9, 8, 10, 9, 8, 10, 10, 10, 9, 10, 10, 9, 10, 10

## Task 3: Logistic Regression

In [10]:
import torch
import torch.nn as nn
from torch import optim

In [11]:
# Logistic Regression model ---- is everything ready?
# Please refer to this link to the basics of bulding a model with Pytorch
#        - https://pytorch.org/tutorials/beginner/introyt/modelsyt_tutorial.html

class LogisticRegression(nn.Module):
    def __init__(self, input_d, output_d,batch_size, lr):
        super(LogisticRegression, self).__init__()
        self.linear = torch.nn.Linear(input_d, output_d)
        self.learning_rate= lr
        self.batch_size = batch_size


    def forward(self, data):
         return torch.sigmoid(self.linear(data))




In [12]:
### Define the input dimension input_d, output dimension output_d, batch size, number of epochs, iterations, etc..

input_d = len(vocabulary)
output_d =1
batch_size = 256
epochs = 10
iterations = 98
learning_rate = 0.001
#RelU
#SGD
#BCE
#acc & F1

In [16]:
# Instantiate the LR model
from torch.utils.data import Dataset, DataLoader
class SparseBoWDataset(Dataset):
    def __init__(self, bow_sparse, scores):
        self.bow = bow_sparse
        self.scores = torch.tensor(scores, dtype=torch.float32)

    def __len__(self):
        return len(self.scores)

    def __getitem__(self, idx):
        if isinstance(self.bow, torch.Tensor) and self.bow.is_sparse:
            bow_vector = self.bow[idx].to_dense()
        else:
            bow_vector = self.bow[idx]

        bow_vector = bow_vector.float()

        score = self.scores[idx]
        return bow_vector, score

train_dataset = SparseBoWDataset(train_bow, train_scores)
test_dataset = SparseBoWDataset(test_bow, test_scores)

LR = LogisticRegression(input_d=input_d, output_d=output_d, batch_size=batch_size, lr = learning_rate)
optimizer = torch.optim.Adam(LR.parameters(), lr=LR.learning_rate)
criterion = torch.nn.BCELoss()
loss=[]
acc = []
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range (epochs):
  epoch_loss = 0
  correct = 0
  total = 0
  for data, scores in train_loader:
    optimizer.zero_grad()
    scores = scores.view(-1, 1)
    outputs = LR(data)
    loss = criterion(outputs, scores)
    loss.backward()
    optimizer.step()

  print(f'epoch {epoch}: Loss = {loss.item():.4f}')



# Loss class

# Instantiate the Optimizer Class. Do not forget to set the learning rate

# Define the labels


Epoch 0: Loss = 0.6812
Epoch 1: Loss = 0.5597
Epoch 2: Loss = 0.5449
Epoch 3: Loss = 0.5142
Epoch 4: Loss = 0.4776
Epoch 5: Loss = 0.3273
Epoch 6: Loss = 0.4538
Epoch 7: Loss = 0.3654
Epoch 8: Loss = 0.2745
Epoch 9: Loss = 0.2976


"            for words in enumerate(testdata):\n                outputs = log_regre(testdata)\n                _, predicted = torch.max(outputs, data, 1)\n                correct += (predicted == scores).sum()\n            accuracy = 100* (correct.item()) / len(testdata)\n            acc.append(accuracy)\n            print(f'epoch: {epoch} loss: {loss.item():.4f} Accuracy: {accuracy:.4f}')\n\n"

In [None]:
# Define the training loop


In [21]:
# Evaluate the model

LR.eval()
correct = 0
total = 0
acc = []

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

with torch.no_grad():
    for data, scores in test_loader:
        scores = scores.view(-1, 1)
        outputs = LR(data)
        predicted = (outputs >= 0.5).float()
        correct += (predicted == scores).sum().item()
        total += scores.size(0)
        accuracy = 100 * correct / total
        acc.append(accuracy)

print(f'accuracy score: {accuracy:.4f}%')

Final Accuracy: 63.3560%


In [33]:
# Logistic regression with 4 different sentiments
def categorize_scores(scores):
    categories = []

    for score in scores:
        if score in [9, 10]:
            categories.append(4) #wonderful
        elif score in [7, 8]:
            categories.append(3) #pleased
        elif score in [3, 4]:
            categories.append(1) #disgusted
        elif score in [1, 2]:
            categories.append(0) #annoyed
        else:
            categories.append(2)  #neutral

    return categories

train_scores_cat = categorize_scores(train_scores)
test_scores_cat = categorize_scores(test_scores)

train_dataset = SparseBoWDataset(train_bow, train_scores_cat)
test_dataset = SparseBoWDataset(test_bow, test_scores_cat)

LR = LogisticRegression(input_d=input_d, output_d=4, batch_size=batch_size, lr = learning_rate)
optimizer = torch.optim.Adam(LR.parameters(), lr=LR.learning_rate)
criterion = torch.nn.CrossEntropyLoss()
loss=[]
acc = []
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

for epoch in range (epochs):
  epoch_loss = 0
  correct = 0
  total = 0
  for data, scores in train_loader:
    optimizer.zero_grad()
    outputs = LR(data)
    loss = criterion(outputs, scores.long())
    loss.backward()
    optimizer.step()
    _, predicted = torch.max(outputs, 1)

    correct += (predicted == scores).sum().item()
    total += scores.size(0)
    epoch_loss += loss.item()

  accuracy = 100 * correct / total
  print(f'Epoch {epoch} Loss: {epoch_loss/len(train_loader):.4f} accuracy: {accuracy:.2f}%')



Epoch 0 Loss: 1.2554 accuracy: 68.11%
Epoch 1 Loss: 1.0999 accuracy: 85.19%
Epoch 2 Loss: 1.0392 accuracy: 87.84%
Epoch 3 Loss: 1.0048 accuracy: 88.86%
Epoch 4 Loss: 0.9804 accuracy: 89.31%
Epoch 5 Loss: 0.9610 accuracy: 89.71%
Epoch 6 Loss: 0.9449 accuracy: 90.32%
Epoch 7 Loss: 0.9311 accuracy: 90.70%
Epoch 8 Loss: 0.9191 accuracy: 91.22%
Epoch 9 Loss: 0.9087 accuracy: 91.69%


# Feed forward model

In [None]:
class feedforward(nn.Module):
    def __init__(self, input_d, output_d):
        super(feedforward, self).__init__()
        self.layer1 = nn.Linear(89527,2048)
        self.layer2 = nn.Linear(2048,512)
        self.output = nn.Linear(512,1)

    def forward(self, forw):
        forw = torch.relu(self.layer1(forw))
        forw = torch.relu(self.layer2(forw))
        forw = self.output(forw)

        return forw

In [44]:
train_scores = torch.tensor(categorize(scores), dtype=torch.long)
test_scores = torch.tensor(categorize(scores_test), dtype=torch.long)

train_dataset = SparseBoWDataset(train_bow, train_scores)
test_dataset = SparseBoWDataset(test_bow, test_scores)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

ffmodel = feedforward(input_d = len(vocabulary), output_d=1)
criterion = nn.BCELoss()
optimizer = optim.Adam(ffmodel.parameters(), lr = 0.001)

epochs = 10

for epoch in range(epochs):
  ffmodel.train()
  for data, sc in train_loader:
    optimizer.zero_grad()
    sc = sc.view(-1, 1).float()
    outputs = ffmodel(data)
    loss = criterion(outputs, sc)

    loss.backward()
    optimizer.step()

    print(f'epoch {epoch}, loss {loss.item():.4f}')

##evaluation

  self.scores = torch.tensor(scores, dtype=torch.float32)


epoch 0, loss 0.1317
epoch 1, loss 0.1210
epoch 2, loss 0.1112
epoch 3, loss 0.1008
epoch 4, loss 0.0896
epoch 5, loss 0.0773


RuntimeError: all elements of input should be between 0 and 1

what's the differene between macro and micros evaluations, and what would happen if you have an imbalance in the data
--what happens when you let the gradient compute when you are evaluating the model.
-- is logistic regression the same as an MLP with one layer? no, (Linearity)
MCC as a metric for model evaluation
