# PART I - The Dataset

In [59]:
from datasets import get_dataset_split_names
from datasets import load_dataset
from datasets import dataset_dict
from datasets import arrow_dataset
import string

dataset = load_dataset("imdb")

Found cached dataset imdb (C:/Users/leand/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

1 - How many splits does the dataset has ?

In [4]:
print(len(get_dataset_split_names("imdb")))

3


Le dataset contient 3 splits.

2 - How big are thes splits ?

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

Le split train contient 25000 colonnes.
Le split test contient 25000 colonnes.
Le split unsupervised contient 50000 colonnes.

3 - What is the proportion of each class on the supervised splits ?

In [6]:
sum(dataset["train"]["label"]) + sum(dataset["test"]["label"]) #count of supervised documents with positive reviews

25000

In [7]:
len(dataset["train"]) + len(dataset["test"]) #count of supervised documents

50000

In [8]:
(sum(dataset["train"]["label"]) + sum(dataset["test"]["label"])) / (len(dataset["train"]) + len(dataset["test"]))

0.5

La proportion de la classe positive est de 50%. Par conséquent, celle de la classe négative est également de 50%.

# PART II - Naive Bayes Classifier

1- Preprocess function

In [45]:
def preprocess(data : dict) -> dict:
    '''Takes a document from a dataset, lowers the letters and
    replace all punctuations by spaces'''
    text = data["text"]
    for character in string.punctuation:
        if character != "-":
          text = text.replace(character, ' ')
    data["text"] = text.lower()
    return data

In [10]:
updated_dataset = dataset.map(preprocess)

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-f89e94e326134889.arrow
Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-7e3b8c0cbea40c78.arrow
Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-9207d596dc166142.arrow


2- Naive Bayes classifier from scratch

In [11]:
def build_vocabulary_string(Vocabulary : dict[int], text : string, text_category : int, Categories : set[int]) -> dict[int]:
    '''Updates the Vocabulary by adding the words contained in the text'''
    for word in text.split():
        if not (word in Vocabulary) :
            Vocabulary[word] = {}
            for category in Categories:
                Vocabulary[word][category] = 0
        Vocabulary[word][text_category] += 1
    return Vocabulary

def build_vocabulary(dataset : dataset_dict.DatasetDict, Categories : set[int]) -> dict[int]:
    '''Construct a vocabulary from the documents contained in dataset'''
    Vocabulary = {}
    for document in dataset:
        Vocabulary = build_vocabulary_string(Vocabulary, document["text"], document["label"], Categories)
    return Vocabulary

In [12]:
import math
def train_naive_bayes(dataset : dataset_dict.DatasetDict, Categories : set[int]) -> tuple[dict[int], dict[dict[float]], dict[int]]:
    '''Make a naive bayes training over the dataset and returns the logprior, the loglikelihood and the vocabulary produced'''
    logprior = {}
    loglikelihood = {}
    documents_number = len(dataset)
    Vocabulary = build_vocabulary(dataset, Categories)
    for word in Vocabulary:
        loglikelihood[word] = {}
    for category in Categories:
        documents_number_category = 0
        for document in dataset:
            if document["label"] == category :
                documents_number_category += 1
        logprior[category] = math.log(documents_number_category / documents_number)
        word_number = 0
        for word in Vocabulary:
            word_number += Vocabulary[word][category] + 1
        for word in Vocabulary:
            loglikelihood[word][category] = math.log((Vocabulary[word][category] + 1)/word_number)
    return logprior, loglikelihood, Vocabulary

In [13]:
(logprior, loglikelihood, Vocabulary) = train_naive_bayes(updated_dataset["train"], {0, 1})

In [14]:
def test_naive_bayes(text : string, logprior : dict[int], loglikelihood : dict[dict[float]], Categories : set[int], Vocabulary : dict[int]) -> int:
    '''Estimates the category of the text from the logprior, the loglikelihood
    and the vocabulary built through the training'''
    sum = {}
    for category in Categories:
        sum[category] = logprior[category]
        for word in text.split():
            if word in Vocabulary:
                sum[category] += loglikelihood[word][category]
    return max(sum, key=sum.get)

3- Naive Bayes classifier using scikit-learn

In [60]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
class PreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        return
    def fit(self, X : arrow_dataset.Dataset, y:list[int] = None):
        return self
    def transform(self, X : arrow_dataset.Dataset) -> dataset_dict.DatasetDict:
        '''Applies the preprocess function over the documents of the dataset X, then returns the texts of X'''
        X_transformed = X.map(preprocess)
        return X_transformed["text"]

In [36]:
NaiveBayesPipe = Pipeline([("PreProcessing", PreProcessing()), ("CountVectorization", CountVectorizer()), ("Naive Bayes Classifier", MultinomialNB())])

In [37]:
NaiveBayesPipe.fit(dataset["train"], dataset["train"]["label"])

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-f89e94e326134889.arrow


4- Accuracy on both training and test set:

In [46]:
def count_good_answers(dataset : arrow_dataset.Dataset) -> int:
    '''Count the number of good answers by applying the self-made naive bayes test function over all the documents of dataset'''
    good_answers_test = 0
    for document in dataset:
        if test_naive_bayes(document["text"], logprior, loglikelihood, {0, 1}, Vocabulary) == document["label"] :
            good_answers_test += 1
    return good_answers_test

In [47]:
print("Accuracy on test set for self-made naive bayes implementation: ")
print(count_good_answers(updated_dataset["test"])/len(updated_dataset["test"]))
print("Accuracy on training set for self-made naive bayes implementation: ")
print(count_good_answers(updated_dataset["train"])/len(updated_dataset["train"]))

Accuracy on test set for self-made naive bayes implementation: 
0.81204
Accuracy on training set for self-made naive bayes implementation: 
0.90612


In [43]:
print("Accuracy on training set with scikit-learn: ")
NaiveBayesPipe.score(dataset["train"], dataset["train"]["label"])

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-f89e94e326134889.arrow


Accuracy on training set with scikit-learn: 


0.89808

In [44]:
print("Accuracy on test set with scikit-learn: ")
NaiveBayesPipe.score(dataset["test"], dataset["test"]["label"])

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-7e3b8c0cbea40c78.arrow


Accuracy on test set with scikit-learn: 


0.8136

5- Why does the scikit-learn implementation give better results ?

La différence de résultats peut s'expliquer par le fait que MultinomialNB utilise directement des flottants plutôt que des entiers, ce qui permet d'avoir des résultats plus proches lors des calculs.

6- Why is the accurary a sufficient measure of evaluation here ?

L'accuracy est un moyen suffisant d'évaluation puisqu'il permet d'avoir les proportions de bonnes et de mauvaises réponses du modèle, et que l'on dispose d'un dataset important.

7- 2 wrongly classifier example from the test set and explanation of why the model failed:

In [52]:
found = 0
for document in updated_dataset["test"]:
    if found < 2:
        if test_naive_bayes(document["text"], logprior, loglikelihood, {0, 1},  Vocabulary) != document["label"] :
            found += 1
            print("Error " + str(found) + ":")
            print(document["text"])
            print("Good value :")
            print(document["label"])
            if found == 1 :
                print("\n\n\n")

Error 1:
blind date  columbia pictures  1934   was a decent film  but i have a few issues with this film. first of all  i don t fault the actors in this film at all  but more or less  i have a problem with the script. also  i understand that this film was made in the 1930 s and people were looking to escape reality  but the script made ann sothern s character look weak. she kept going back and forth between suitors and i felt as though she should have stayed with paul kelly s character in the end. he truly did care about her and her family and would have done anything for her and he did by giving her up in the end to fickle neil hamilton who in my opinion was only out for a good time. paul kelly s character  although a workaholic was a man of integrity and truly loved kitty  ann sothern  as opposed to neil hamilton  while he did like her a lot  i didn t see the depth of love that he had for her character. the production values were great  but the script could have used a little work.
G

Dans le premier document, l'auteur a indiqué que le film n'était pas terrible, mais a voulu nuancer son propos ; par conséquent, une bonne partie du texte comprend des points positifs du film, ce qui a fausse le resultat.

Dans le second document, on retrouve le meme probleme, avec certains mots employés ("excellent", "best", "interesting", "enjoyable") qui s'apparentent beaucoup plus au vocabulaire positif qu'au vocabulaire négatif, bien qu'éventuellement utilisés avec de la négation ou des nuances.

# PART III - Stemming and Lemmatization 

1- Adding stemming to pretreatment

In [56]:
import nltk
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
import re


def preprocess_stemming(data : dict) -> dict:
    '''Applies preprocess function coupled with stemming technique over the text of the document data'''
    updated_data = preprocess(data)
    re_word = re.compile(r"^\w+$")
    stemmer = SnowballStemmer("english")
    stemmed = [stemmer.stem(word) for word in word_tokenize(updated_data["text"].lower()) if re_word.match(word)]
    updated_data["text"] = " ".join(stemmed)
    return updated_data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\leand\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


2- Train and evaluate the model again with these pretreatment

In [57]:
updated_dataset_stem = dataset.map(preprocess_stemming)
(logprior_stem, loglikelihood_stem, Vocabulary_stem) = train_naive_bayes(updated_dataset_stem["train"], {0, 1})

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [58]:
print("Accuracy on training set with stemming: ")
print(count_good_answers(updated_dataset_stem["train"])/len(updated_dataset_stem["train"]))
print("Accuracy on test set with stemming: ")
print(count_good_answers(updated_dataset_stem["test"])/len(updated_dataset_stem["test"]))

Accuracy on training set with stemming: 
0.8406
Accuracy on test set with stemming: 
0.75324


3- Are the results better or worse ?

Les résultats sont moins bons. Cela pourrait s'expliquer par le principe du stemming qui vise à garder la racine du mot, c'est à dire de tronquer toute déclinaison, accords et dérivation.

Certains mots peuvent donc avoir une racine lui donnant une nuance plus positive ou négative selon le contexte, et cette nuance disparaît avec le stemming, qui peut donc amener à des estimations de mots plus "neutres", avec donc moins de mots déterminants, d'où une incertitude plus importante dans les résultats.
