In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [11]:
# Import the set (per episode) and delete any possible nan row 
train_data=pd.read_csv('train_script3.csv')
train_data= train_data.dropna().reset_index()

test_data = pd.read_csv('test_script3.csv')
test_data= test_data.dropna().reset_index()

In [12]:
# Encode the labels of the training set
le = preprocessing.LabelEncoder()

le.fit(train_data['character'])
le.classes_
labels_train = le.transform(train_data['character']) 


zero = list(le.inverse_transform([0]))
one = list(le.inverse_transform([1]))
two = list(le.inverse_transform([2]))
three = list(le.inverse_transform([3]))
four = list(le.inverse_transform([4]))
five = list(le.inverse_transform([5]))
six = list(le.inverse_transform([6]))

print(zero, 'is encoded as 0')
print(one, 'is encoded as 1')
print(two, 'is encoded as 2')
print(three, 'is encoded as 3')
print(four, 'is encoded as 4')
print(five, 'is encoded as 5')
print(six, 'is encoded as 6')

['Amy'] is encoded as 0
['Bernadette'] is encoded as 1
['Howard'] is encoded as 2
['Leonard'] is encoded as 3
['Penny'] is encoded as 4
['Raj'] is encoded as 5
['Sheldon'] is encoded as 6


In [13]:
# Encode the labels of the testing set
le.fit(test_data['character'])
le.classes_
labels_test = le.transform(test_data['character']) 


zero = list(le.inverse_transform([0]))
one = list(le.inverse_transform([1]))
two = list(le.inverse_transform([2]))
three = list(le.inverse_transform([3]))
four = list(le.inverse_transform([4]))
five = list(le.inverse_transform([5]))
six = list(le.inverse_transform([6]))

print(zero, 'is encoded as 0')
print(one, 'is encoded as 1')
print(two, 'is encoded as 2')
print(three, 'is encoded as 3')
print(four, 'is encoded as 4')
print(five, 'is encoded as 5')
print(six, 'is encoded as 6')

['Amy'] is encoded as 0
['Bernadette'] is encoded as 1
['Howard'] is encoded as 2
['Leonard'] is encoded as 3
['Penny'] is encoded as 4
['Raj'] is encoded as 5
['Sheldon'] is encoded as 6


In [14]:
# Replace the label with the encoded ones
train_data['character'] = labels_train
test_data['character'] = labels_test

In [15]:
# Check the length of the training set
len(train_data)

1264

In [16]:
# Gather all data together 

data = [train_data, test_data]
all_data = pd.concat(data).reset_index()
all_data = all_data[['lines', 'character']]

In [18]:
count_vect = CountVectorizer()

# Get the count vectors 
X_train_counts = count_vect.fit_transform(all_data.lines)
tfidf_transformer = TfidfTransformer()

# Using the count vectors get the tfidf features 
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

In [22]:
# Run the SVM model with 10 fold stratified cross validation

from sklearn import svm 
from sklearn.svm import SVC

from sklearn.model_selection import StratifiedKFold, KFold
import pycm
from pycm import *

skf = StratifiedKFold(n_splits=10)
X = X_train_tfidf
y = all_data.character


all_cm = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = svm.SVC(gamma= 'scale')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred = y_pred.tolist()
    y_test = y_test.tolist()
    cm = ConfusionMatrix(y_pred, y_test)
    all_cm.append(cm)
    


In [23]:
# Obtain the confusion matrix for the different 10 holds together 
confusion_matrix = pd.DataFrame()
for i in range(len(all_cm)):
      confusion_matrix= confusion_matrix.append(pd.DataFrame(all_cm[i].table))
        
confusion_matrix = confusion_matrix.groupby(confusion_matrix.index).sum()
confusion_matrix.columns=['Amy', 'Bernadette', 'Howard', 'Leonard', 'Penny', 'Raj','Sheldon']
confusion_matrix.index = ['Amy', 'Bernadette', 'Howard', 'Leonard', 'Penny', 'Raj','Sheldon']
confusion_matrix



Unnamed: 0,Amy,Bernadette,Howard,Leonard,Penny,Raj,Sheldon
Amy,38,1,23,25,7,46,11
Bernadette,3,50,36,12,10,47,0
Howard,2,1,133,21,4,62,2
Leonard,1,3,28,158,16,15,3
Penny,0,2,20,15,170,17,0
Raj,0,0,27,19,2,173,3
Sheldon,1,0,5,4,0,10,206


In [27]:
# The performance metrics

TP = np.diag(confusion_matrix) 
FP = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix) 
FN = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)


PRE = (TP)/(TP+FP)
print('precision:', np.mean(PRE))
REC = (TP)/(TP+FN)
print('recall:', np.mean(REC))
F1 = 2*np.mean(REC)*np.mean(PRE)/(np.mean(REC)+np.mean(PRE))
print('F1:', np.mean(F1))

precision: 0.6153334291555171
recall: 0.7184536445057276
F1: 0.6629072263377923
