In [107]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [108]:
df = pd.read_csv('simpsons_dataset.csv')
df.head(20)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
5,Martin Prince,I don't think there's anything left to say.
6,Edna Krabappel-Flanders,Bart?
7,Bart Simpson,Victory party under the slide!
8,,
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!


There were rows with no characters and no text so I use dropna() to delete those rows.

In [109]:
df = df.dropna()
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


In [110]:
simpsons = df.loc[(df['raw_character_text'] == "Lisa Simpson") | (df['raw_character_text'] == "Bart Simpson")]

In [111]:
simpsons.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [112]:
text = simpsons['spoken_words'].values.astype('U') #Taking the text from the df. We need to convert it to Unicode
vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text
feature_names = vect.get_feature_names() #Get the words from the vocabulary
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14257 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [113]:
matrix = vect.transform(text) #The transform method from the CountVectorizer object creates the matrix
print(matrix[0:500,0:500]) #Let's print a little part of the matrix: the first 50 words & documents

  (23, 424)	1
  (38, 325)	1
  (43, 266)	1
  (61, 269)	1
  (72, 356)	1
  (78, 264)	1
  (80, 304)	1
  (96, 192)	1
  (98, 396)	1
  (149, 328)	1
  (154, 325)	1
  (155, 451)	1
  (161, 325)	1
  (162, 325)	1
  (184, 461)	1
  (205, 325)	1
  (208, 397)	1
  (229, 270)	1
  (235, 404)	1
  (256, 325)	1
  (284, 325)	1
  (291, 493)	1
  (292, 163)	1
  (315, 300)	1
  (318, 281)	1
  (353, 450)	1
  (355, 397)	1
  (359, 449)	1
  (363, 24)	1
  (363, 449)	1
  (381, 129)	1
  (382, 325)	1
  (383, 70)	1
  (389, 38)	1
  (389, 91)	1
  (391, 446)	1
  (393, 126)	1
  (405, 52)	1
  (405, 319)	1
  (405, 343)	1
  (408, 449)	1
  (414, 196)	1
  (422, 360)	1
  (457, 304)	1


In [114]:
simpsons_mat = pd.DataFrame(matrix.toarray()) #make a regular matrix, then put in Dataframe
simpsons_mat.index = simpsons['raw_character_text'] #Give the rows names (text of the review)
simpsons_mat.columns = feature_names #Give the columns names (words from vocabulary)

In [115]:
simpsons_mat.iloc[0:4, 1000:1015] #Show a part of the matrix

Unnamed: 0_level_0,bartholemew,bartholomew,bartish,bartman,barto,bartrand,bartron,barts,barty,bas,base,baseball,based,basement,basements
raw_character_text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Bart Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Lisa Simpson,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [116]:
#result = pd.concat([simpsons, pd.DataFrame(matrix.toarray())], axis=1)
#result.head(20)

Unnamed: 0,raw_character_text,spoken_words,0,1,2,3,4,5,6,7,...,14247,14248,14249,14250,14251,14252,14253,14254,14255,14256
0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Lisa Simpson,Where's Mr. Bergstrom?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Lisa Simpson,That life is worth living.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Bart Simpson,Victory party under the slide!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Exercise 2

In [117]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import normalize

In [118]:
result.head(20)

Unnamed: 0,raw_character_text,spoken_words,0,1,2,3,4,5,6,7,...,14247,14248,14249,14250,14251,14252,14253,14254,14255,14256
0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Lisa Simpson,Where's Mr. Bergstrom?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Lisa Simpson,That life is worth living.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Bart Simpson,Victory party under the slide!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [119]:
#X = simpsons_mat.loc[:]
X = matrix
y = simpsons['raw_character_text']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [120]:
nb = MultinomialNB()
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [164]:
y_p = nb.predict(X_test)

In [122]:
accuracy = nb.score(X_test, y_test)

In [123]:
print(f'The accuracy is: {accuracy}')

The accuracy is: 0.654644962110581


## Lesson 2 - Exercise 1 

In [124]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_p) #creates a "confusion matrix"
cm

array([[3068,  865],
       [1596, 1597]])

In [125]:
nb.classes_

array(['Bart Simpson', 'Lisa Simpson'], dtype='<U12')

nb.classses_ is om te bekijken op welke volgorde de data in de confusion matrix staat

In [126]:
conf_matrix = pd.DataFrame(cm, index=['Bart', 'Lisa'], columns = ['Bart-p', 'Lisa-p']) 
conf_matrix

Unnamed: 0,Bart-p,Lisa-p
Bart,3068,865
Lisa,1596,1597


In [157]:
#Oplossing van Jonas
from sklearn.metrics import classification_report
print(classification_report(y_test,y_p, nb.classes_))

              precision    recall  f1-score   support

Bart Simpson       0.66      0.78      0.71      3933
Lisa Simpson       0.65      0.50      0.56      3193

 avg / total       0.65      0.65      0.65      7126



In [127]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_p)
print(f"The accuracy is {accuracy}.")

The accuracy is 0.654644962110581.


#### Recall and precision Bart

In [128]:
#How much of the predicted ‘bart’ actually bart?
precisionbart = 3068/(3068+1596)
precisionbart100 = precisionbart * 100
#How much of the real bart is predicted as bart?
recallbart = 3068/(3068+865)
recallbart100 = recallbart * 100

In [129]:
print(f"The precision of the Bart is {precisionbart}. So {precisionbart100}% of the predicted Bart actually Bart.")
print(f"The recall of the Bart is {recallbart}. So {recallbart100}% of the real Bart is predicted as Bart.")

The precision of the Bart is 0.6578044596912521. So 65.78044596912521% of the predicted Bart actually Bart.
The recall of the Bart is 0.7800661072972286. So 78.00661072972287% of the real Bart is predicted as Bart.


#### Recall and precision Lisa

In [130]:
#How much of the predicted ‘lisa’ actually lisa?
precisionlisa = 1597/(1597+865)
precisionlisa100 = precisionbart * 100
#How much of the real lisa is predicted as lisa?
recalllisa = 1597/(1597+1597)
recalllisa100 = recalllisa * 100

In [131]:
print(f"The precision of the Bart is {precisionlisa}. So {precisionlisa100}% of the predicted Lisa actually Lisa.")
print(f"The recall of the Bart is {recalllisa}. So {recalllisa100}% of the real Lisa is predicted as Lisa.")

The precision of the Bart is 0.648659626320065. So 65.78044596912521% of the predicted Lisa actually Lisa.
The recall of the Bart is 0.5. So 50.0% of the real Lisa is predicted as Lisa.


In [132]:
#X = "Bart?"
proba = nb.predict_proba(X)
proba = pd.DataFrame(proba)
proba.columns = ("Bart", "Lisa")
proba.head()

Unnamed: 0,Bart,Lisa
0,0.024127,0.975873
1,0.641077,0.358923
2,0.392244,0.607756
3,0.00051,0.99949
4,0.537264,0.462736


In [154]:
#Oplossing van Jonas voor een zin zoeken
print(simpsons.iloc[0,1])
print(nb.predict_proba(X[0]))

Where's Mr. Bergstrom?
[[0.0241265 0.9758735]]


In [156]:
index = 0
predictions = []
for line in simpsons["spoken_words"]:
    if index < 100:
       # print(line)
        if proba["Bart"][index] > 0.5:
            #print(f"Bart probabaly said: {line}") 
            test = {
                "Predicted character" : "Bart",
                "spoken_words" : line
            }
            predictions.append(test)
        elif proba["Bart"][index] < 0.5:
           # print(f"Lisa probabaly said: {line}")
            test = {
                "Predicted character" : "Lisa",
                "spoken_words" : line
            }
            predictions.append(test)
    index = index+1
predictions = pd.DataFrame(predictions)
predictions
result = pd.merge(predictions, simpsons, on="spoken_words")
result.head()

Unnamed: 0,Predicted character,spoken_words,raw_character_text
0,Lisa,Where's Mr. Bergstrom?,Lisa Simpson
1,Bart,That life is worth living.,Lisa Simpson
2,Lisa,Victory party under the slide!,Bart Simpson
3,Lisa,Mr. Bergstrom! Mr. Bergstrom!,Lisa Simpson
4,Bart,Do you know where I could find him?,Lisa Simpson


In [150]:
#result = result[['raw_character_text','Predicted character','spoken_words']]
result.columns = ['Character', 'Predicted character', 'Line']
result.head()

Unnamed: 0,Character,Predicted character,Line
0,Lisa Simpson,Lisa,Where's Mr. Bergstrom?
1,Lisa Simpson,Bart,That life is worth living.
2,Bart Simpson,Lisa,Victory party under the slide!
3,Lisa Simpson,Lisa,Mr. Bergstrom! Mr. Bergstrom!
4,Lisa Simpson,Bart,Do you know where I could find him?


In [155]:
#Dis is de oplossing van Jonas
for i in range(10):
    prob = nb.predict_proba(X[i])
    print(f"line: {i}. {simpsons.iloc[i,1]}")
    print(f"Bart: {prob[0,0]}, Lisa: {prob[0,1]}")

line: 0. Where's Mr. Bergstrom?
Bart: 0.02412650028716209, Lisa: 0.9758734997128374
line: 1. That life is worth living.
Bart: 0.6410767236351326, Lisa: 0.35892327636486704
line: 2. Victory party under the slide!
Bart: 0.3922440278271701, Lisa: 0.6077559721728304
line: 3. Mr. Bergstrom! Mr. Bergstrom!
Bart: 0.0005098590893126944, Lisa: 0.999490140910687
line: 4. Do you know where I could find him?
Bart: 0.5372636836855326, Lisa: 0.4627363163144669
line: 5. The train, how like him... traditional, yet environmentally sound.
Bart: 0.10532281716203411, Lisa: 0.8946771828379665
line: 6. I see he touched you, too.
Bart: 0.42689457136782927, Lisa: 0.5731054286321701
line: 7. Hey, thanks for your vote, man.
Bart: 0.9455507837393036, Lisa: 0.05444921626069731
line: 8. Well, you got that right. Thanks for your vote, girls.
Bart: 0.8282533089214902, Lisa: 0.17174669107851093
line: 9. Well, don't sweat it. Just so long as a couple of people did... right, Milhouse?
Bart: 0.9039007255326827, Lisa: 0.

In [161]:
#dit doet jonas op het bord
nb.feature_log_prob_[500:510]

array([], shape=(0, 14257), dtype=float64)

In [162]:
#Dit werkt nog niet, deed jonas ook op het bord
pd.DataFrame(nb.feature_log_prob_[500:800], index = vect.get_feature_names()[500:800])

ValueError: Empty data passed with indices specified.