In [1]:
import transformers
from datasets import load_dataset_builder
ds_builder = load_dataset_builder("imdb")

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
print(ds_builder.info.description)

Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.


In [3]:
ds_builder.info.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [4]:
from datasets import load_dataset

In [5]:
from datasets import get_dataset_split_names

In [6]:
get_dataset_split_names("imdb")

['train', 'test', 'unsupervised']

In [7]:
dataset = load_dataset("rotten_tomatoes", split="train")

Found cached dataset rotten_tomatoes (C:/Users/leand/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


In [8]:
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [9]:
dataset = load_dataset("imdb")

Found cached dataset imdb (C:/Users/leand/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
sum(dataset["train"]["label"]) + sum(dataset["test"]["label"]) #count of supervised documents with positive reviews

25000

In [11]:
len(dataset["train"]) + len(dataset["test"]) #count of supervised documents

50000

In [12]:
import string
dataset = load_dataset("imdb", split="train")

Found cached dataset imdb (C:/Users/leand/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


In [13]:
display(dataset[0]["text"])

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [94]:
def preprocess(data):
    str = data["text"]
    for c in string.punctuation:
        str = str.replace(c, ' ')
    data["text"] = str.lower()
    return data

In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
dataset = dataset.map(lambda examples: tokenizer(examples["text"]), batched=True)

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-1e31ce8330321103.arrow


In [16]:
dataset

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 25000
})

In [17]:
updated_dataset = dataset.map(preprocess)

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-0f3893649e3d5ad8.arrow


In [18]:
updated_dataset[0]["label"]

0

In [19]:
dataset = dataset.map(lambda examples: tokenizer(examples["text"]), batched=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (521 > 512). Running this sequence through the model will result in indexing errors


In [20]:
def build_vocabulary_string(V, str, category, C):
    for word in str.split():
        if not (word in V) :
            V[word] = {}
            for c in C:
                V[word][c] = 0
        V[word][category] += 1
    return V

def build_vocabulary(dataset, C):
    V = {}
    for document in dataset:
        V = build_vocabulary_string(V, document["text"], document["label"], C)
    return V


In [21]:
build_vocabulary(updated_dataset, {0, 1})

{'i': {0: 46903, 1: 40737},
 'rented': {0: 237, 1: 100},
 'am': {0: 1512, 1: 1266},
 'curious': {0: 135, 1: 126},
 'yellow': {0: 48, 1: 58},
 'from': {0: 9731, 1: 10768},
 'my': {0: 6015, 1: 6488},
 'video': {0: 1043, 1: 688},
 'store': {0: 289, 1: 230},
 'because': {0: 4966, 1: 4080},
 'of': {0: 69010, 1: 76855},
 'all': {0: 12036, 1: 11943},
 'the': {0: 163404, 1: 173317},
 'controversy': {0: 23, 1: 28},
 'that': {0: 37632, 1: 35634},
 'surrounded': {0: 75, 1: 59},
 'it': {0: 48380, 1: 48036},
 'when': {0: 6726, 1: 7457},
 'was': {0: 26291, 1: 21918},
 'first': {0: 4306, 1: 4756},
 'released': {0: 400, 1: 586},
 'in': {0: 43753, 1: 50219},
 '1967': {0: 21, 1: 33},
 'also': {0: 3608, 1: 5551},
 'heard': {0: 561, 1: 550},
 'at': {0: 12278, 1: 11233},
 'seized': {0: 6, 1: 4},
 'by': {0: 10548, 1: 11998},
 'u': {0: 197, 1: 286},
 's': {0: 30697, 1: 32708},
 'customs': {0: 12, 1: 22},
 'if': {0: 9520, 1: 7287},
 'ever': {0: 3263, 1: 2728},
 'tried': {0: 515, 1: 258},
 'to': {0: 68975, 1: 

In [22]:
import math
def train_naive_bayes(D,C): #C : positif/negatif
    logprior = {}
    loglikelihood = {}
    Ndoc = len(D)
    Vocabulary = build_vocabulary(D, C) #bigdoc is useless
    for word in Vocabulary:
        loglikelihood[word] = {}
    for c in C:
        Nc = 0
        for document in D:
            if document["label"] == c :
                Nc += 1
        logprior[c] = math.log(Nc / Ndoc)
        word_number = 0
        for word in Vocabulary:
            word_number += Vocabulary[word][c] + 1
        for word in Vocabulary:
            loglikelihood[word][c] = math.log((Vocabulary[word][c] + 1)/word_number)
    return logprior, loglikelihood, Vocabulary

In [23]:
(r1, r2, r3) = train_naive_bayes(updated_dataset, {0, 1})

In [24]:
def test_naive_bayes(testdoc, logprior, loglikelihood, C, V) :
    sum = {}
    for c in C:
        sum[c] = logprior[c]
        for word in testdoc.split():
            if word in V:
                sum[c] += loglikelihood[word][c]
    return max(sum, key=sum.get)

In [25]:
r1

{0: -0.6931471805599453, 1: -0.6931471805599453}

In [26]:
r2

{'i': {0: -4.1820680448913405, 1: -4.343660679092243},
 'rented': {0: -9.465655609863918, 1: -10.34345675902507},
 'am': {0: -7.616076569747036, 1: -7.814170095545189},
 'curious': {0: -10.02527139779934, 1: -10.114390189407738},
 'yellow': {0: -11.046105985424765, 1: -10.88103983196061},
 'from': {0: -5.754751579631979, 1: -5.674150360537448},
 'my': {0: -6.235748417905715, 1: -6.1807135612451605},
 'video': {0: -7.987111515092807, 1: -8.42333600485267},
 'store': {0: -9.268045360554872, 1: -9.516159565344536},
 'because': {0: -6.427354968428041, 1: -6.644479940460523},
 'of': {0: -3.795905092372204, 1: -3.7088884557767545},
 'all': {0: -5.542185765155657, 1: -5.570592936647302},
 'the': {0: -2.933939222840313, 1: -2.8956939394282424},
 'controversy': {0: -11.759872453187446, 1: -11.591281445879854},
 'that': {0: -4.4022896795009006, 1: -4.477493695930358},
 'surrounded': {0: -10.60719294324906, 1: -10.864232713644228},
 'it': {0: -4.151063829886766, 1: -4.178850449582398},
 'when': {

In [27]:
r3

{'i': {0: 46903, 1: 40737},
 'rented': {0: 237, 1: 100},
 'am': {0: 1512, 1: 1266},
 'curious': {0: 135, 1: 126},
 'yellow': {0: 48, 1: 58},
 'from': {0: 9731, 1: 10768},
 'my': {0: 6015, 1: 6488},
 'video': {0: 1043, 1: 688},
 'store': {0: 289, 1: 230},
 'because': {0: 4966, 1: 4080},
 'of': {0: 69010, 1: 76855},
 'all': {0: 12036, 1: 11943},
 'the': {0: 163404, 1: 173317},
 'controversy': {0: 23, 1: 28},
 'that': {0: 37632, 1: 35634},
 'surrounded': {0: 75, 1: 59},
 'it': {0: 48380, 1: 48036},
 'when': {0: 6726, 1: 7457},
 'was': {0: 26291, 1: 21918},
 'first': {0: 4306, 1: 4756},
 'released': {0: 400, 1: 586},
 'in': {0: 43753, 1: 50219},
 '1967': {0: 21, 1: 33},
 'also': {0: 3608, 1: 5551},
 'heard': {0: 561, 1: 550},
 'at': {0: 12278, 1: 11233},
 'seized': {0: 6, 1: 4},
 'by': {0: 10548, 1: 11998},
 'u': {0: 197, 1: 286},
 's': {0: 30697, 1: 32708},
 'customs': {0: 12, 1: 22},
 'if': {0: 9520, 1: 7287},
 'ever': {0: 3263, 1: 2728},
 'tried': {0: 515, 1: 258},
 'to': {0: 68975, 1: 

In [32]:
test_naive_bayes(updated_dataset[4500]["text"], r1, r2, {0, 1}, r3)

0

In [33]:
updated_dataset[4500]["label"]

0

In [37]:
import sklearn

In [38]:
pipe = sklearn.pipeline.Pipeline([('Preprocessing', preprocess())])

TypeError: preprocess() missing 1 required positional argument: 'data'

In [39]:
from sklearn.base import BaseEstimator, TransformerMixin

In [161]:
class PreProcessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        return
    def fit(self, X, y=None):
        # fit should only take X and y as parameters
        # Even if your model is unsupervised, you need to accept a y argument!
        # Model fitting code goes here
        # fit returns self
        return self
    def transform(self, X):
        # transform takes as parameter only X
        # Apply some transformation to X
        X_transformed = X.map(preprocess)
        return X_transformed["text"]

In [137]:
test = PreProcessing()

In [138]:
test.fit(dataset)

In [139]:
a = test.transform(dataset)

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-25d55f09f594a473.arrow


In [140]:
a[0]

'i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in 1967  i also heard that at first it was seized by u s  customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  i really had to see this for myself  br    br   the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life  in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states  in between asking politicians and ordinary denizens of stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men  br    br   what kills me about i am curious yellow is that 40 years ago  this was considered pornographic  really  the sex and nudity scenes are few and far between  ev

In [141]:
dataset[0]["text"]

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [162]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
pipe = sklearn.pipeline.Pipeline([("PreProcessing", PreProcessing()), ("CountVectorization", CountVectorizer()), ("Naive Bayes Classifier", MultinomialNB())])

In [158]:
a = PreProcessing()
a.fit(dataset)
a.transform(dataset)

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-25d55f09f594a473.arrow


(['i rented i am curious yellow from my video store because of all the controversy that surrounded it when it was first released in 1967  i also heard that at first it was seized by u s  customs if it ever tried to enter this country  therefore being a fan of films considered  controversial  i really had to see this for myself  br    br   the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life  in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states  in between asking politicians and ordinary denizens of stockholm about their opinions on politics  she has sex with her drama teacher  classmates  and married men  br    br   what kills me about i am curious yellow is that 40 years ago  this was considered pornographic  really  the sex and nudity scenes are few and far between  

In [163]:
pipe.fit(dataset, dataset["label"])

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-25d55f09f594a473.arrow


In [167]:
pipe.score(dataset, dataset["label"])

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-25d55f09f594a473.arrow


0.89808

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
corpus = [
'This is the first document.',
'This document is the second document.',
'And this is the third one.',
'Is this the first document?']
X = vectorizer.fit_transform(corpus)

In [72]:
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [79]:
print(X.toarray())
y = [1, 2, 0, 4]

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [80]:
clf.fit(X, y)

In [81]:
print(clf.predict(X[2:3]))

[0]


In [130]:
X_transformed = dataset.map(preprocess)

Loading cached processed dataset at C:\Users\leand\.cache\huggingface\datasets\imdb\plain_text\1.0.0\d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0\cache-25d55f09f594a473.arrow


In [131]:
Xtext = X_transformed["text"]

In [133]:
a = vectorizer.fit_transform(Xtext)

In [134]:
a

<25000x74702 sparse matrix of type '<class 'numpy.int64'>'
	with 3445804 stored elements in Compressed Sparse Row format>

In [135]:
print(a)

  (0, 54772)	1
  (0, 2809)	3
  (0, 15928)	3
  (0, 74042)	3
  (0, 26060)	1
  (0, 44513)	2
  (0, 71142)	1
  (0, 63262)	1
  (0, 6358)	1
  (0, 46552)	6
  (0, 2565)	1
  (0, 66202)	10
  (0, 14493)	1
  (0, 66186)	4
  (0, 64532)	1
  (0, 34559)	6
  (0, 72610)	1
  (0, 72055)	3
  (0, 24630)	2
  (0, 54533)	1
  (0, 32881)	10
  (0, 372)	1
  (0, 2724)	1
  (0, 30160)	1
  (0, 4656)	1
  :	:
  (24999, 30089)	1
  (24999, 26213)	1
  (24999, 28424)	1
  (24999, 58893)	1
  (24999, 61549)	1
  (24999, 61505)	1
  (24999, 15867)	1
  (24999, 73164)	1
  (24999, 21762)	1
  (24999, 47433)	1
  (24999, 30800)	1
  (24999, 26584)	1
  (24999, 21897)	1
  (24999, 25293)	1
  (24999, 44876)	1
  (24999, 11233)	1
  (24999, 5978)	3
  (24999, 12559)	1
  (24999, 47314)	1
  (24999, 4933)	1
  (24999, 41595)	2
  (24999, 15545)	1
  (24999, 33520)	1
  (24999, 59298)	1
  (24999, 28693)	1
