In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline 
sns.set(style="ticks")


In [44]:
data = pd.read_csv('../datasets/RickAndMortyScripts.csv')
data

Unnamed: 0,index,season no.,episode no.,episode name,name,line
0,0,1,1,Pilot,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,1,1,1,Pilot,Morty,"What, Rick? What’s going on?"
2,2,1,1,Pilot,Rick,"I got a surprise for you, Morty."
3,3,1,1,Pilot,Morty,It's the middle of the night. What are you tal...
4,4,1,1,Pilot,Rick,"Come on, I got a surprise for you. Come on, h..."
...,...,...,...,...,...,...
1900,2483,3,7,Tales From the Citadel,Morty,That was amazing!
1901,2484,3,7,Tales From the Citadel,Rick,Got some of that mermaid puss!
1902,2485,3,7,Tales From the Citadel,Morty,I'm really hoping it wasn't a one-off thing an...
1903,2486,3,7,Tales From the Citadel,Rick,"Pssh! Not at all, Morty. That place will never..."


In [51]:
data = data.drop(columns = ['season no.', 'episode no.','episode name'])

In [52]:
data.columns

Index(['index', 'name', 'line'], dtype='object')

In [53]:
data['name'].value_counts()

Rick                      420
Morty                     347
Beth                      148
Jerry                     132
Summer                     97
Pickle Rick                77
Supernova                  44
Cop Morty                  34
All Ricks                  32
Mr. Goldenfold             28
President                  27
Cop Rick                   26
Testicle Monster A         26
Principal Vagina           25
Cornvelious Daniel         22
Snuffles                   22
Drunk Rick                 21
Dr. Wong                   21
Agency Director            20
Alan                       19
Candidate Morty            18
Vance                      17
Scary Terry                17
Jessica                    16
Million Ants               15
All Mortys                 15
Ice-T                      13
Morty 2                    13
All Summers                13
Riq IV                     13
Alien Doctor               12
Campaign Manager Morty     12
Lizard Morty               11
Cromulon  

In [54]:
data = data[data['name'].isin(['Rick', 'Morty', 'Beth', 'Jerry', 'Summer'])]
data

Unnamed: 0,index,name,line
0,0,Rick,Morty! You gotta come on. Jus'... you gotta co...
1,1,Morty,"What, Rick? What’s going on?"
2,2,Rick,"I got a surprise for you, Morty."
3,3,Morty,It's the middle of the night. What are you tal...
4,4,Rick,"Come on, I got a surprise for you. Come on, h..."
...,...,...,...
1900,2483,Morty,That was amazing!
1901,2484,Rick,Got some of that mermaid puss!
1902,2485,Morty,I'm really hoping it wasn't a one-off thing an...
1903,2486,Rick,"Pssh! Not at all, Morty. That place will never..."


In [55]:
X = data.drop('name', axis=1)
Y = data['name']

In [57]:
X

Unnamed: 0,index,line
0,0,Morty! You gotta come on. Jus'... you gotta co...
1,1,"What, Rick? What’s going on?"
2,2,"I got a surprise for you, Morty."
3,3,It's the middle of the night. What are you tal...
4,4,"Come on, I got a surprise for you. Come on, h..."
...,...,...
1900,2483,That was amazing!
1901,2484,Got some of that mermaid puss!
1902,2485,I'm really hoping it wasn't a one-off thing an...
1903,2486,"Pssh! Not at all, Morty. That place will never..."


In [58]:
Y

0        Rick
1       Morty
2        Rick
3       Morty
4        Rick
        ...  
1900    Morty
1901     Rick
1902    Morty
1903     Rick
1904    Morty
Name: name, Length: 1144, dtype: object

In [59]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=1)
print('{}, {}'.format(X_train.shape, X_test.shape))
print('{}, {}'.format(Y_train.shape, Y_test.shape))

(858, 2), (286, 2)
(858,), (286,)


In [73]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X_train + X_test)

index    float64
line      object
dtype: object

In [61]:
X_train

Unnamed: 0,index,line
239,267,"Ohh, man. Oh, geez! Ohh."
87,95,"Aw, geez. Okay. I guess I can skip history. Wh..."
494,566,Oww!
538,611,"You have dropped so many balls, man. Do you ev..."
2,2,"I got a surprise for you, Morty."
...,...,...
1028,1276,"Hey, what's wrong Morty? Oh, you're worried ab..."
1277,1652,McNuggets?
1686,2123,"I think... no matter what we put on there, we ..."
259,288,Roll over.


In [62]:
X_train_vec = vectorizer.transform(X_train['line'])
X_test_vec = vectorizer.transform(X_test['line'])

In [63]:

X_train_vec.shape

(858, 2)

In [64]:
def test(model):
    print(model)
    model.fit(X_train_vec, Y_train)
    print("accuracy:", accuracy_score(Y_test, model.predict(X_test_vec)))

In [65]:
test(LogisticRegression(solver='lbfgs', multi_class='auto'))

LogisticRegression()
accuracy: 0.3706293706293706


In [66]:
test(LinearSVC())

LinearSVC()
accuracy: 0.3706293706293706


In [67]:
test(MultinomialNB())

MultinomialNB()
accuracy: 0.3706293706293706


In [68]:
test(ComplementNB())

ComplementNB()
accuracy: 0.13986013986013987


In [69]:
test(BernoulliNB())


BernoulliNB()
accuracy: 0.3706293706293706
