# PART I - The Dataset

In [1]:
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("imdb")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(ds_builder.info.description)

Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.


In [3]:
ds_builder.info.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [4]:
from datasets import load_dataset

In [5]:
from datasets import get_dataset_split_names

In [6]:
get_dataset_split_names("imdb")

['train', 'test', 'unsupervised']

1 - How many splits does the dataset has ?

In [7]:
print(len(get_dataset_split_names("imdb")))

3


In [8]:
dataset = load_dataset("imdb")

Found cached dataset imdb (/home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 77.59it/s]


2 - How big are thes splits ?

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

3 - What is the proportion of each class on the supervised splits ?

In [10]:
sum(dataset["train"]["label"]) + sum(dataset["test"]["label"]) #count of supervised documents with positive reviews

25000

In [11]:
len(dataset["train"]) + len(dataset["test"]) #count of supervised documents

50000

# PART II - Naive Bayes Classifier

In [12]:
import string
dataset = load_dataset("imdb")

Found cached dataset imdb (/home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|██████████| 3/3 [00:00<00:00, 899.61it/s]


In [13]:
display(dataset["train"][0]["text"])

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

1- Preprocess function

In [14]:
def preprocess(data):
    str = data["text"]
    for c in string.punctuation:
        if (c != "."):
          str = str.replace(c, ' ')
    data["text"] = str.lower()
    
    return data

In [15]:
preprocess(dataset["train"][0])

{'text': 'i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in 1967. i also heard that at first it was seized by u.s. customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  i really had to see this for myself. br    br   the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life. in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men. br    br   what kills me about i am curious yellow is that 40 years ago  this was considered pornographic. really  the sex and nudity scenes are few and far be

In [16]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
dataset = dataset.map(lambda examples: tokenizer(examples["text"]), batched=True)

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-1e31ce8330321103.arrow
Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5743b37a20b41bc8.arrow
Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-dff6e4d0329eab1d.arrow


In [17]:
sum(dataset["train"]["label"])

12500

In [18]:
updated_dataset = dataset.map(preprocess)

Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b28e1aeaa9fa126a.arrow
Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d91771deae58c293.arrow
Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-374f108ea2b07007.arrow


2- Naive Bayes classifier from scratch

In [19]:
def build_vocabulary_string(V, str, category, C):
    for word in str.split():
        if not (word in V) :
            V[word] = {}
            for c in C:
                V[word][c] = 0
        V[word][category] += 1
    return V

def build_vocabulary(dataset, C):
    V = {}
    for document in dataset:
        V = build_vocabulary_string(V, document["text"], document["label"], C)
    return V
build_vocabulary(updated_dataset["train"], {0, 1})

{'i': {0: 46324, 1: 40142},
 'rented': {0: 233, 1: 100},
 'am': {0: 1497, 1: 1253},
 'curious': {0: 118, 1: 120},
 'yellow': {0: 47, 1: 56},
 'from': {0: 9654, 1: 10693},
 'my': {0: 5996, 1: 6466},
 'video': {0: 946, 1: 619},
 'store': {0: 238, 1: 198},
 'because': {0: 4949, 1: 4069},
 'of': {0: 68850, 1: 76737},
 'all': {0: 11200, 1: 11406},
 'the': {0: 163024, 1: 172837},
 'controversy': {0: 19, 1: 26},
 'that': {0: 36955, 1: 35117},
 'surrounded': {0: 73, 1: 58},
 'it': {0: 44553, 1: 44512},
 'when': {0: 6699, 1: 7433},
 'was': {0: 25986, 1: 21702},
 'first': {0: 4199, 1: 4642},
 'released': {0: 352, 1: 516},
 'in': {0: 43446, 1: 49867},
 '1967.': {0: 3, 1: 6},
 'also': {0: 3551, 1: 5456},
 'heard': {0: 532, 1: 535},
 'at': {0: 12171, 1: 11160},
 'seized': {0: 6, 1: 4},
 'by': {0: 10495, 1: 11945},
 'u.s.': {0: 66, 1: 147},
 'customs': {0: 11, 1: 20},
 'if': {0: 9445, 1: 7222},
 'ever': {0: 3030, 1: 2514},
 'tried': {0: 501, 1: 252},
 'to': {0: 68777, 1: 66510},
 'enter': {0: 84, 1:

In [20]:
import math
def train_naive_bayes(D,C): #C : positif/negatif
    logprior = {}
    loglikelihood = {}
    Ndoc = len(D)
    Vocabulary = build_vocabulary(D, C) #bigdoc is useless
    for word in Vocabulary:
        loglikelihood[word] = {}
    for c in C:
        Nc = 0
        for document in D:
            if document["label"] == c :
                Nc += 1
        logprior[c] = math.log(Nc / Ndoc)
        word_number = 0
        for word in Vocabulary:
            word_number += Vocabulary[word][c] + 1
        for word in Vocabulary:
            loglikelihood[word][c] = math.log((Vocabulary[word][c] + 1)/word_number)
    return logprior, loglikelihood, Vocabulary

In [21]:
(logprior, loglikelihood, Voc) = train_naive_bayes(updated_dataset["train"], {0, 1})

In [22]:
def test_naive_bayes(testdoc, logprior, loglikelihood, C, V) :
    sum = {}
    for c in C:
        sum[c] = logprior[c]
        for word in testdoc.split():
            if word in V:
                sum[c] += loglikelihood[word][c]
    return max(sum, key=sum.get)

3- Naive Bayes classifier using scikit-learn

In [23]:
import sklearn
from sklearn.pipeline import Pipeline

In [24]:
from sklearn.base import BaseEstimator, TransformerMixin

In [25]:
class PreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        return
    def fit(self, X, y=None):
        # fit should only take X and y as parameters
        # Even if your model is unsupervised, you need to accept a y argument!
        # Model fitting code goes here
        # fit returns self
        return self
    def transform(self, X):
        # transform takes as parameter only X
        # Apply some transformation to X
        X_transformed = X.map(preprocess)
        return X_transformed["text"]

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
pipe = Pipeline([("PreProcessing", PreProcessing()), ("CountVectorization", CountVectorizer()), ("Naive Bayes Classifier", MultinomialNB())])

In [27]:
pipe.fit(dataset["train"], dataset["train"]["label"])

Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b28e1aeaa9fa126a.arrow


4- Accuracy on both training and test set:

In [28]:
def count_good_answers(data):
    good_answers_test = 0
    for document in data:
        if test_naive_bayes(document["text"], logprior, loglikelihood, {0, 1}, Voc) == document["label"] :
            good_answers_test += 1
    return good_answers_test




In [29]:
print("Accuracy on test set: ")
print(count_good_answers(updated_dataset["test"])/len(updated_dataset["test"]))
print("Accuracy on training set: ")
print(count_good_answers(updated_dataset["train"])/len(updated_dataset["train"]))

Accuracy on test set: 
0.81204
Accuracy on training set: 
0.90612


In [30]:
print("Accuracy on training set with scikit-learn: ")
pipe.score(dataset["train"], dataset["train"]["label"])

Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-b28e1aeaa9fa126a.arrow


Accuracy on training set with scikit-learn: 


0.89808

In [31]:
print("Accuracy on test set with scikit-learn: ")
pipe.score(dataset["test"], dataset["test"]["label"])

Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-d91771deae58c293.arrow


Accuracy on test set with scikit-learn: 


0.8136

5- Why does the scikit-learn implementation give better results ?

La différence de résultats peut s'expliquer par le fait que MultinomialNB utilise directement des flottants plutôt que des entiers, ce qui permet d'avoir des résultats plus proches lors des calculs.

6- Why is the accurary a sufficient measure of evaluation here ?

L'accuracy est un moyen suffisant d'évaluation puisqu'il permet d'avoir les proportions de bonnes et de mauvaises réponses du modèle, et que l'on dispose d'un dataset important.

In [32]:
found = 0
for document in updated_dataset["test"]:
    if found < 2:
        if test_naive_bayes(document["text"], logprior, loglikelihood, {0, 1},  Voc) != document["label"] :
            found += 1
            print(document["text"])
            print(document["label"])
            print("\n\n\n")

blind date  columbia pictures  1934   was a decent film  but i have a few issues with this film. first of all  i don t fault the actors in this film at all  but more or less  i have a problem with the script. also  i understand that this film was made in the 1930 s and people were looking to escape reality  but the script made ann sothern s character look weak. she kept going back and forth between suitors and i felt as though she should have stayed with paul kelly s character in the end. he truly did care about her and her family and would have done anything for her and he did by giving her up in the end to fickle neil hamilton who in my opinion was only out for a good time. paul kelly s character  although a workaholic was a man of integrity and truly loved kitty  ann sothern  as opposed to neil hamilton  while he did like her a lot  i didn t see the depth of love that he had for her character. the production values were great  but the script could have used a little work.
0




ben 

7- 2 wrongly classifier example from the test set and explanation of why the model failed:

Dans le premier document, l'auteur a indiqué que le film n'était pas terrible, mais a voulu nuancer son propos ; par conséquent, une bonne partie du texte comprend des points positifs du film, ce qui a fausse le resultat.

Dans le second document, on retrouve le meme probleme, avec certains mots employés ("excellent", "best", "interesting", "enjoyable") qui s'apparentent beaucoup plus au vocabulaire positif qu'au vocabulaire négatif, bien qu'éventuellement utilisés avec de la négation ou des nuances.

# PART III - Stemming and Lemmatization 

1- Adding stemming to pretreatment

In [33]:
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
import re


def preprocess_stemming(data):
    str = data["text"]
    for c in string.punctuation:
        if (c != "."):
            str = str.replace(c, ' ')
    data["text"] = str.lower()
    
    re_word = re.compile(r"^\w+$")
    stemmer = SnowballStemmer("english")
    stemmed = [stemmer.stem(word) for word in word_tokenize(data["text"].lower()) if re_word.match(word)]
    data["text"] = " ".join(stemmed)
    
    return data

2- Train and evaluate the model again with these pretreatment

In [34]:
updated_dataset_stem = dataset.map(preprocess_stemming)
(logprior_stem, loglikelihood_stem, Voc_stem) = train_naive_bayes(updated_dataset_stem["train"], {0, 1})

    

Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-ee5adf94d0287cf4.arrow
Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-7f5f56c85f7085fe.arrow
Loading cached processed dataset at /home/amine/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-21becf5615bbc4ed.arrow


In [35]:
print("Accuracy on training set with stemming: ")
print(count_good_answers(updated_dataset_stem["train"])/len(updated_dataset_stem["train"]))
print("Accuracy on test set with stemming: ")
print(count_good_answers(updated_dataset_stem["test"])/len(updated_dataset_stem["test"]))


Accuracy on training set with stemming: 
0.84316
Accuracy on test set with stemming: 
0.75368


3- Are the results better or worse ? 

EXPLIQUER POURQUOI L'ACCURANCY A CHANGÉ