# Pre-processing function

In [None]:
import re # https://www.w3schools.com/python/python_regex.asp

from sklearn.feature_extraction.text import CountVectorizer

import os
import numpy as np
import pandas as pd

In [None]:
# Compiled once: non-greedy match of anything that looks like an HTML tag.
_HTML_TAG_RE = re.compile(r'<.*?>')

# Translation table built once instead of on every call:
# - backslashes and quote characters are deleted outright,
# - every other punctuation character, tab and newline becomes a space.
_CLEAN_TEXT_TABLE = str.maketrans({
    **dict.fromkeys('!#$%&()*+,-./:;<=>?@[]^_`{|}~\t\n', " "),
    **dict.fromkeys("\\'\"", None),
})


def clean_text(text):
    """
    Applies some pre-processing on the given text.

    Steps :
    - Replacing HTML tags with a space (so the words on either side of a
      tag such as ``<br />`` are not glued together)
    - Lowering text
    - Removing the characters [\\], ['] and ["]
    - Replacing the remaining punctuation, tabs and newlines with spaces

    Input:
    text: the raw document as a string.

    Returns:
    the cleaned, lower-cased string.
    """

    # replace HTML tags with a space rather than nothing, otherwise
    # "great<br />movie" would collapse into the single token "greatmovie"
    text = _HTML_TAG_RE.sub(" ", text)

    # convert text to lowercase; when this function is passed as a custom
    # `preprocessor`, scikit-learn vectorizers no longer lowercase on their
    # own, so it must happen here for stop-word matching to work
    text = text.strip().lower()

    # delete backslashes/quotes and turn other punctuation into spaces
    return text.translate(_CLEAN_TEXT_TABLE)

# Basic text classification with a BOW featurizer, using Scikit-learn

In [None]:
# Tiny toy corpus used to illustrate the bag-of-words representation
training_texts = ["This is a good cat", "This is a bad day"]
test_texts = ["This day is a good day"]

# English stop-word filtering is requested; because a custom preprocessor
# is supplied, clean_text is responsible for all text normalisation
# (including casing) before tokens are compared with the stop-word list
vectorizer = CountVectorizer(preprocessor=clean_text, stop_words="english")

# learn the vocabulary from the training sentences only
vectorizer.fit(training_texts)

# column labels: vocabulary terms ordered by their feature index
inv_vocab = {index: term for term, index in vectorizer.vocabulary_.items()}
vocabulary = [inv_vocab[index] for index in sorted(inv_vocab)]

# vectorization example: one row of term counts for the test sentence
test_counts = vectorizer.transform(test_texts).toarray()
pd.DataFrame(test_counts, index=["test sentence"], columns=vocabulary)


Unnamed: 0,This,bad,cat,day,good
test sentence,1,0,0,2,1


# Loading IMDB dataset

In [None]:
def load_train_test_imdb_data(data_dir, seed=None):
    """Loads the IMDB train/test datasets from a folder path.

    Each review file under ``<data_dir>/<split>/<neg|pos>/`` becomes one row;
    rows of each split are shuffled before being returned.

    Input:
    data_dir: path to the "aclImdb" folder.
    seed: optional integer. When given, the shuffles use a private
        generator seeded with it, so the row order is repeatable. When
        None (default), NumPy's global random state is used, exactly as
        before this parameter existed.

    Returns:
    train/test datasets as pandas dataframes with the columns
    'text' (review content) and 'sentiment' (1 for "pos", 0 for "neg").
    """

    # private seeded generator on request, NumPy's global state otherwise
    rng = np.random if seed is None else np.random.RandomState(seed)

    frames = {}
    for split in ("train", "test"):
        records = []
        for sentiment in ("neg", "pos"):
            score = 1 if sentiment == "pos" else 0

            folder = os.path.join(data_dir, split, sentiment)
            # os.listdir returns entries in arbitrary order; sorting makes
            # the pre-shuffle order (and thus a seeded shuffle) deterministic
            for f_name in sorted(os.listdir(folder)):
                with open(os.path.join(folder, f_name), "r",
                          encoding="utf-8") as f:
                    records.append([f.read(), score])

        # mix negative and positive reviews, which were read in two batches
        rng.shuffle(records)
        frames[split] = pd.DataFrame(records, columns=['text', 'sentiment'])

    return frames["train"], frames["test"]

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2025-01-21 04:50:04--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-01-21 04:50:15 (7.58 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
!tar -xvzf /content/aclImdb_v1.tar.gz

tar (child): /content/aclImdb_v1.tar.gz: Cannot open: No such file or directory
tar (child): Error is not recoverable: exiting now
tar: Child returned status 2
tar: Error is not recoverable: exiting now


In [None]:
# Seed NumPy's global RNG first: the loader shuffles with np.random, so this
# keeps the row order repeatable across runs on the same extracted copy
np.random.seed(42)

train_data, test_data = load_train_test_imdb_data(
    data_dir="/content/aclImdb/")

In [None]:
# Show the review stored under row label 0 of the training set
train_data.loc[0, "text"]

'This video nasty was initially banned in Britain, and allowed in last November without cuts.<br /><br />It features the Playboy Playmate of the Month October 1979, Ursula Buchfellner. The opening cuts back and forth between Buchfellner and foggy jungle pictures. I am not sure what the purpose of that was. It would have been much better to focus on the bathtub scene.<br /><br />Laura (Buchfellner) is kidnapped and held in the jungle for ransom. Peter (Al Cliver - The Beyond, Zombie) is sent to find her and the ransom. Of course, one of the kidnappers (Antonio de Cabo) manages to pass the time productively, while another (Werner Pochath) whines incessantly.<br /><br />The ransom exchange goes to hell, and Laura runs into the jungle. Will Peter save her before the cannibals have a meal? Oh, yes, there are cannibals in this jungle. Why do you think it was a video nasty! Muriel Montossé is found by Peter and his partner (Antonio Mayans - Angel of Death) on the kidnapper\'s boat. Montossé i

In [None]:
# Peek at the first ten positive reviews (sentiment == 1), in dataset order
is_positive = train_data["sentiment"] == 1
positive_reviews = train_data.loc[is_positive, "text"].head(10)

for review in positive_reviews:
    print(review)
    print("-" * 100)

# number of reviews actually printed (at most 10)
count = len(positive_reviews)
print(count)

Here is an innovative television drama; which so easily blends a compelling story, brilliantly drawn out character development, humour, romance, and drama into each episode. Here is a show that sings to it's own tune, whether it's audience chooses to follow or not. How many other shows on television these days so boldly change in tone from one season to the next? Where most of the other top shows on this site have found a formula that works, that brings in the viewers and the dollars and have stuck like glue to that formula (Prison Break, 24, and Desperate Housewives come to mind) - LOST takes a different route where even after achieving that plateau and that winning formula, the team of executive producers are brave enough to completely reinvent the show in order to service their higher goal of compelling storytelling. This is where LOST differentiates itself from normal television. This is how it's so defiant of conventional TV. And this is why LOST is one of the most cutting edge an

# IMDB Sentiment Classification with SVM

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC


# Bag-of-words features: each review becomes a vector of raw term counts,
# with English stop-word filtering and the vocabulary capped at 80000 terms
vectorizer = CountVectorizer(
    preprocessor=clean_text,
    stop_words="english",
    max_features=80000,
)

# learn the vocabulary on the training reviews, then reuse it unchanged
# to encode the test reviews
training_features = vectorizer.fit_transform(train_data["text"])
test_features = vectorizer.transform(test_data["text"])

In [None]:
# Number of distinct terms the fitted vectorizer kept
vocabulary_size = len(vectorizer.vocabulary_)
vocabulary_size

80000

In [None]:
# Fit a linear SVM on the training features (fit returns the estimator itself)
model = LinearSVC().fit(training_features, train_data["sentiment"])

# Predict the sentiment of every test review
y_pred = model.predict(test_features)

# Share of test reviews whose predicted label matches the true one
acc = accuracy_score(y_true=test_data["sentiment"], y_pred=y_pred)

print("Accuracy on the IMDB dataset: {:.2f}".format(100 * acc))

Accuracy on the IMDB dataset: 83.98




# IMDB Sentiment Classification with TF-IDF featurizer and n-gram

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer


# Transform each text into a vector of TF-IDF weights computed over
# unigrams and bigrams (ngram_range=(1, 2))
vectorizer = TfidfVectorizer(
    preprocessor=clean_text,
    stop_words="english",
    ngram_range=(1, 2),
)

# learn vocabulary and IDF weights on the training reviews, then reuse them
# unchanged to encode the test reviews
training_features = vectorizer.fit_transform(train_data["text"])
test_features = vectorizer.transform(test_data["text"])

# Fit a linear SVM on the TF-IDF features (fit returns the estimator itself)
model = LinearSVC().fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

# Share of test reviews whose predicted label matches the true one
acc = accuracy_score(y_true=test_data["sentiment"], y_pred=y_pred)

print("Accuracy on the IMDB dataset: {:.2f}".format(100 * acc))


Accuracy on the IMDB dataset: 88.66


# Your turn

Improve the functions above to enhance accuracy:
- Pre-processing
- Use other classification algorithms (logistic regression, Perceptron Learning Algorithm,...)
- TF-IDF, n-gram

