In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer object
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [2]:
simpsons = pd.read_csv("simpsons_dataset.csv")
simpsons.head(3)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...


In [3]:
simpsons["raw_character_text"].value_counts()

Homer Simpson               29782
Marge Simpson               14141
Bart Simpson                13759
Lisa Simpson                11489
C. Montgomery Burns          3162
Moe Szyslak                  2862
Seymour Skinner              2438
Ned Flanders                 2144
Grampa Simpson               1880
Milhouse Van Houten          1862
Chief Wiggum                 1830
Krusty the Clown             1768
Nelson Muntz                 1172
Lenny Leonard                1166
Apu Nahasapeemapetilon       1006
Waylon Smithers               996
Kent Brockman                 891
Carl Carlson                  883
Edna Krabappel-Flanders       739
Dr. Julius Hibbert            691
Selma Bouvier                 611
Barney Gumble                 611
Sideshow Bob                  576
Rev. Timothy Lovejoy          558
Crowd                         540
Groundskeeper Willie          534
Gary Chalmers                 523
Ralph Wiggum                  507
Mayor Joe Quimby              503
Patty Bouvier 

The lines for bart and lisa are as follows:
Bart Simpson                     13759
Lisa Simpson                     11489

In [4]:
# create a df with on lisa and barts lines
df = simpsons[(simpsons["raw_character_text"] == "Lisa Simpson") | (simpsons["raw_character_text"] == "Bart Simpson")] 
df.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [5]:
text = df['spoken_words'].values.astype('U') #Taking the text from the df. We need to convert it to Unicode

vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text
feature_names = vect.get_feature_names() #Get the words from the vocabulary
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14258 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [6]:
matrix = vect.transform(text) #The transform method from the CountVectorizer object creates the matrix
print(matrix[0:500,0:500])
#matrix

  (24, 424)	1
  (40, 325)	1
  (45, 266)	1
  (63, 269)	1
  (74, 356)	1
  (80, 264)	1
  (82, 304)	1
  (98, 192)	1
  (100, 396)	1
  (151, 328)	1
  (156, 325)	1
  (157, 451)	1
  (163, 325)	1
  (164, 325)	1
  (186, 461)	1
  (207, 325)	1
  (210, 397)	1
  (231, 270)	1
  (237, 404)	1
  (259, 325)	1
  (287, 325)	1
  (294, 493)	1
  (295, 163)	1
  (318, 300)	1
  (321, 281)	1
  (356, 450)	1
  (358, 397)	1
  (362, 449)	1
  (366, 24)	1
  (366, 449)	1
  (386, 129)	1
  (387, 325)	1
  (388, 70)	1
  (394, 38)	1
  (394, 91)	1
  (396, 446)	1
  (398, 126)	1
  (410, 52)	1
  (410, 319)	1
  (410, 343)	1
  (413, 449)	1
  (419, 196)	1
  (428, 360)	1
  (464, 304)	1


In [7]:
docu_feat = pd.DataFrame(matrix.toarray()) #make a regular matrix, then put in Dataframe
docu_feat.index = df['spoken_words'] #Give the rows names (text of the review)
docu_feat.columns = feature_names

In [8]:
docu_feat.iloc[0:4, 1000:1015]

Unnamed: 0_level_0,bartholemew,bartholomew,bartish,bartman,barto,bartrand,bartron,barts,barty,bas,base,baseball,based,basement,basements
spoken_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Where's Mr. Bergstrom?,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
That life is worth living.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Victory party under the slide!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Mr. Bergstrom! Mr. Bergstrom!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
#Split the data into a training and a test set
y = df['raw_character_text'] #we want to which character is speaking
X = matrix #based on the word spoken

In [10]:
#split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

In [11]:
# Train a NB model on the training set
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
#to find out the classes being used in the model
clf.classes_

array(['Bart Simpson', 'Lisa Simpson'], dtype='<U12')

In [12]:
# Predict the class (Lisa or Bart) of the test set
print(clf.predict(X[2:3]))

['Bart Simpson']


In [23]:
y_predict = clf.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_predict) #creates a "confusion matrix"
conf_matrix = pd.DataFrame(cm, index=['Bart_predict', 'Lisa_predict'], columns = ['Bart', 'Lisa'])
conf_matrix  #this is on the test set which is 30% of the entire dataframe, hence the number mismatch 

Unnamed: 0,Bart,Lisa
Bart_predict,3271,894
Lisa_predict,1799,1611


In [22]:
precision_recall_fscore_support(y_test, y_predict, average=None)

(array([0.64516765, 0.64311377]),
 array([0.78535414, 0.47243402]),
 array([0.70839199, 0.54471682]),
 array([4165, 3410], dtype=int64))

Precision is 64.3%, recall is 47.2% on the predictive model. 
From confusion matrix we see there are a lot of false negatives
Precision means the percentage of your results which are relevant. On the other hand, recall refers to the percentage of total relevant results correctly classified by your algorithm. This is usually a trade off ... to recall everything, you will have to keep generating results which are not accurate, hence lowering your precision

In [84]:
l = df.iloc[2:3]["spoken_words"]

7    Victory party under the slide!
Name: spoken_words, dtype: object

In [96]:
df.iloc[2:3]
#print (l[0])

Unnamed: 0,raw_character_text,spoken_words
7,Bart Simpson,Victory party under the slide!


In [15]:
df["raw_character_text"].value_counts(normalize = True)

Bart Simpson    0.544954
Lisa Simpson    0.455046
Name: raw_character_text, dtype: float64

In [16]:
clf.score(X_test, y_test) #calculate the fit on the test data

0.6444884488448845

In [137]:
#checking the probabilities of dialogue
print (df.iloc[2:3])
p = (clf.predict_proba(X[2:3]))
#arrange the array by ascending values and print the highest value?
print (p.max())

  raw_character_text                    spoken_words
7       Bart Simpson  Victory party under the slide!
0.7936665779408832


In [116]:
#Create a loop that prints out a few lines of dialogue and the associated probabilities for Bart and Lisa. 
#testing solutions
L = df.iloc[2:3]
C = clf.predict(X[2:3])
P = clf.predict_proba(X[2:3])
p = P[0][0]
for key in L:
    print (f"The sentence:\n {L['spoken_words']} \nis likely said by {C} based on {p:.2f} prediction. ")
#Tip: the array with the probabilities is 2-dimensional.
#its messy...lets clean it up

The sentence:
 7    Victory party under the slide!
Name: spoken_words, dtype: object 
is likely said by ['Bart Simpson'] based on 0.79 prediction. 
The sentence:
 7    Victory party under the slide!
Name: spoken_words, dtype: object 
is likely said by ['Bart Simpson'] based on 0.79 prediction. 


In [140]:
#loop for first fifty sentences in the datafram
#this also prints the output from the naive bayesian model and the probabilities for the results
for i in range(50):
    C = clf.predict(X[i])
    words = df['spoken_words'].iloc[i]
    prob = clf.predict_proba(X[i])
    p = prob.max()  #get the maximum value from the array
    print(f"The sentence:\n'{words}' \nis likely said by {C[0]} based on {p:.2f} prediction.")
    print(prob[0])  #the array of probabilities for comparison

The sentence:
'Where's Mr. Bergstrom?' 
is likely said by Lisa Simpson based on 0.96 prediction.
[0.04271499 0.95728501]
The sentence:
'That life is worth living.' 
is likely said by Bart Simpson based on 0.68 prediction.
[0.67634553 0.32365447]
The sentence:
'Victory party under the slide!' 
is likely said by Bart Simpson based on 0.79 prediction.
[0.79366658 0.20633342]
The sentence:
'Mr. Bergstrom! Mr. Bergstrom!' 
is likely said by Lisa Simpson based on 1.00 prediction.
[0.00167382 0.99832618]
The sentence:
'Do you know where I could find him?' 
is likely said by Bart Simpson based on 0.54 prediction.
[0.54244121 0.45755879]
The sentence:
'The train, how like him... traditional, yet environmentally sound.' 
is likely said by Lisa Simpson based on 0.93 prediction.
[0.06930173 0.93069827]
The sentence:
'I see he touched you, too.' 
is likely said by Lisa Simpson based on 0.58 prediction.
[0.42381872 0.57618128]
The sentence:
'Hey, thanks for your vote, man.' 
is likely said by Bart S

In [None]:
#Do you see patterns (based on the data and your knowledge of the Simpsons)?
