In [20]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer #The CountVectorizer object
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

In [2]:
simpsons = pd.read_csv("simpsons_dataset.csv")
simpsons.head(3)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...


In [153]:
simpsons["raw_character_text"].value_counts().head()

Homer Simpson          29782
Marge Simpson          14141
Bart Simpson           13759
Lisa Simpson           11489
C. Montgomery Burns     3162
Name: raw_character_text, dtype: int64

The lines for bart and lisa are as follows:
Bart Simpson                     13759
Lisa Simpson                     11489

In [4]:
# create a df with on lisa and barts lines
df = simpsons[(simpsons["raw_character_text"] == "Lisa Simpson") | (simpsons["raw_character_text"] == "Bart Simpson")] 
df.head()

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!
11,Lisa Simpson,Do you know where I could find him?


In [5]:
text = df['spoken_words'].values.astype('U') #Taking the text from the df. We need to convert it to Unicode

vect = CountVectorizer(stop_words='english') #Create the CV object, with English stop words
vect = vect.fit(text) #We fit the model with the words from the review text
feature_names = vect.get_feature_names() #Get the words from the vocabulary
print(f"There are {len(feature_names)} words in the vocabulary. A selection: {feature_names[500:520]}")

There are 14258 words in the vocabulary. A selection: ['anguished', 'angus', 'anima', 'animal', 'animals', 'animated', 'animation', 'animators', 'anka', 'ankle', 'ann', 'annapolis', 'anne', 'annie', 'anniversary', 'annnnd', 'announce', 'announcement', 'announcements', 'announcer']


In [151]:
matrix = vect.transform(text) #The transform method from the CountVectorizer object creates the matrix
print(matrix[0:50,0:50])
#matrix




In [7]:
docu_feat = pd.DataFrame(matrix.toarray()) #make a regular matrix, then put in Dataframe
docu_feat.index = df['spoken_words'] #Give the rows names (text of the review)
docu_feat.columns = feature_names

In [8]:
docu_feat.iloc[0:4, 1000:1015]

Unnamed: 0_level_0,bartholemew,bartholomew,bartish,bartman,barto,bartrand,bartron,barts,barty,bas,base,baseball,based,basement,basements
spoken_words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Where's Mr. Bergstrom?,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
That life is worth living.,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Victory party under the slide!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Mr. Bergstrom! Mr. Bergstrom!,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
#Split the data into a training and a test set
y = df['raw_character_text'] #we want to which character is speaking
X = matrix #based on the word spoken

In [10]:
#split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) 

In [11]:
# Train a NB model on the training set
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [64]:
#to find out the classes being used in the model
clf.classes_

array(['Bart Simpson', 'Lisa Simpson'], dtype='<U12')

In [12]:
# Predict the class (Lisa or Bart) of the test set
print(clf.predict(X[2:3]))

['Bart Simpson']


In [23]:
y_predict = clf.predict(X_test) #the predicted values
cm = confusion_matrix(y_test, y_predict) #creates a "confusion matrix"
conf_matrix = pd.DataFrame(cm, index=['Bart_predict', 'Lisa_predict'], columns = ['Bart', 'Lisa'])
conf_matrix  #this is on the test set which is 30% of the entire dataframe, hence the number mismatch 

Unnamed: 0,Bart,Lisa
Bart_predict,3271,894
Lisa_predict,1799,1611


In [22]:
precision_recall_fscore_support(y_test, y_predict, average=None)

(array([0.64516765, 0.64311377]),
 array([0.78535414, 0.47243402]),
 array([0.70839199, 0.54471682]),
 array([4165, 3410], dtype=int64))

Precision is 64.3%, recall is 47.2% on the predictive model. 
From confusion matrix we see there are a lot of false negatives
Precision means the percentage of your results which are relevant. On the other hand, recall refers to the percentage of total relevant results correctly classified by your algorithm. This is usually a trade off ... to recall everything, you will have to keep generating results which are not accurate, hence lowering your precision

In [84]:
#looking up a line in the dataframe with iloc
l = df.iloc[2:3]["spoken_words"]

7    Victory party under the slide!
Name: spoken_words, dtype: object

In [141]:
#without variable
df.iloc[2:3]

Unnamed: 0,raw_character_text,spoken_words
7,Bart Simpson,Victory party under the slide!


In [15]:
#the proportion of texts said by each character in dataframe
df["raw_character_text"].value_counts(normalize = True)

Bart Simpson    0.544954
Lisa Simpson    0.455046
Name: raw_character_text, dtype: float64

In [16]:
clf.score(X_test, y_test) #calculate the fit on the test data, i.e. the accuracy

0.6444884488448845

In [137]:
#checking the probabilities of dialogue
print (df.iloc[2:3])
p = (clf.predict_proba(X[2:3]))
#arrange the array by ascending values and print the highest value?
print (p.max())

  raw_character_text                    spoken_words
7       Bart Simpson  Victory party under the slide!
0.7936665779408832


In [116]:
#Create a loop that prints out a few lines of dialogue and the associated probabilities for Bart and Lisa. 
#testing solutions
L = df.iloc[2:3]
C = clf.predict(X[2:3])
P = clf.predict_proba(X[2:3])
p = P[0][0]
for key in L:
    print (f"The sentence:\n {L['spoken_words']} \nis likely said by {C} based on {p:.2f} prediction. ")
#Tip: the array with the probabilities is 2-dimensional.
#its messy...lets clean it up

The sentence:
 7    Victory party under the slide!
Name: spoken_words, dtype: object 
is likely said by ['Bart Simpson'] based on 0.79 prediction. 
The sentence:
 7    Victory party under the slide!
Name: spoken_words, dtype: object 
is likely said by ['Bart Simpson'] based on 0.79 prediction. 


In [145]:
#loop for first fifty sentences in the datafram
#this also prints the output from the naive bayesian model and the probabilities for the results
for i in range(50):
    C = clf.predict(X[i])
    words = df['spoken_words'].iloc[i]
    prob = clf.predict_proba(X[i])
    p = prob.max()  #get the maximum value from the array
    print(f"The sentence:'{words}' is likely said by {C[0]} based on {p:.2f} prediction.")
    print(f"Bart: {prob[0][0]}, Lisa:  {prob[0][1]}")  #the array of probabilities for comparison

The sentence:'Where's Mr. Bergstrom?' is likely said by Lisa Simpson based on 0.96 prediction.
Bart: 0.04271498580067743, Lisa:  0.957285014199323
The sentence:'That life is worth living.' is likely said by Bart Simpson based on 0.68 prediction.
Bart: 0.676345532629102, Lisa:  0.3236544673708974
The sentence:'Victory party under the slide!' is likely said by Bart Simpson based on 0.79 prediction.
Bart: 0.7936665779408832, Lisa:  0.20633342205911578
The sentence:'Mr. Bergstrom! Mr. Bergstrom!' is likely said by Lisa Simpson based on 1.00 prediction.
Bart: 0.0016738185124174146, Lisa:  0.9983261814875828
The sentence:'Do you know where I could find him?' is likely said by Bart Simpson based on 0.54 prediction.
Bart: 0.5424412124555592, Lisa:  0.4575587875444406
The sentence:'The train, how like him... traditional, yet environmentally sound.' is likely said by Lisa Simpson based on 0.93 prediction.
Bart: 0.06930172690218564, Lisa:  0.930698273097816
The sentence:'I see he touched you, too

#Do you see patterns (based on the data and your knowledge of the Simpsons)?
Bart talks a lot.

In [148]:
clf.feature_log_prob_[500:510]

array([], shape=(0, 14258), dtype=float64)

In [150]:
vect.get_feature_names()[500:510]

['anguished',
 'angus',
 'anima',
 'animal',
 'animals',
 'animated',
 'animation',
 'animators',
 'anka',
 'ankle']