In [141]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

In [142]:
# load data into pandas dataframe
shakespeareData = pd.read_csv('../data/raw/Shakespeare/Shakespeare_data.csv', index_col='Dataline')

In [143]:
# python list comprehension without a condition
PlayerLineLength = [len(x) for x in shakespeareData['PlayerLine']]
shakespeareData['PlayerLineLength'] = PlayerLineLength
shakespeareData

Unnamed: 0_level_0,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,PlayerLineLength
Dataline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Henry IV,,,,ACT I,5
2,Henry IV,,,,SCENE I. London. The palace.,28
3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ...",96
4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38
5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42
...,...,...,...,...,...,...
111392,A Winters Tale,38.0,5.3.180,LEONTES,"Lead us from hence, where we may leisurely",42
111393,A Winters Tale,38.0,5.3.181,LEONTES,Each one demand an answer to his part,37
111394,A Winters Tale,38.0,5.3.182,LEONTES,Perform'd in this wide gap of time since first,46
111395,A Winters Tale,38.0,5.3.183,LEONTES,We were dissever'd: hastily lead away.,38


I believe that calulating the line length for each line spoken will be a useful feature. One character may average longer lines than others. I will check the average line length for two characters and see if this supports my hypothesis. 

In [144]:
count = 0
count2=0
num = 0 
num2 = 0
zipped = zip(shakespeareData['PlayerLineLength'],shakespeareData['Player'])
zlist = list(zipped)
for (x,y) in zlist:
    if y=="KING HENRY IV":
        num = num + 1
        count=count+x
    elif y=="WESTMORELAND":
        num2 = num + 1
        count2=count2+x
print("Average line length in chars for King Henry IV: ", count/num)
print("Average line length in chars for Westmoreland: ",count2/num2)

Average line length in chars for King Henry IV:  39.762569832402235
Average line length in chars for Westmoreland:  9.044568245125348


The cell above shows that the average line length for King Henry vs Westmoreland is substantially different. On average King Henry's lines are 30 characters longer in length than Westmoreland

In [145]:
# remove rows with NaN
shakespeareData=shakespeareData.dropna()
# reindex for removed rows starting with 1
shakespeareData.index = np.arange(1, len(shakespeareData)+1)
shakespeareData.index.name = 'Dataline'
shakespeareData

Unnamed: 0_level_0,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine,PlayerLineLength
Dataline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,",38
2,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,",42
3,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils,46
4,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.,39
5,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil,41
...,...,...,...,...,...,...
105148,A Winters Tale,38.0,5.3.179,LEONTES,"Is troth-plight to your daughter. Good Paulina,",47
105149,A Winters Tale,38.0,5.3.180,LEONTES,"Lead us from hence, where we may leisurely",42
105150,A Winters Tale,38.0,5.3.181,LEONTES,Each one demand an answer to his part,37
105151,A Winters Tale,38.0,5.3.182,LEONTES,Perform'd in this wide gap of time since first,46


In [None]:
I think that spliting Act Scene Line into separate features may help with classification instead of having all three in 

In [146]:
# split ActSceneLine into Act Scene Line

# creates a list of tuples (Act, Scene, Line) or (nan, nan, nan)
# creates a tuple of integer by spliting the string using the "." as a delimiter to split on
ActSceneLine = [tuple(map(int,x.split('.'))) for x in shakespeareData['ActSceneLine'] ]
# Creates Act, Scene, and Line lists from list comprehension using tuple unpacking
Act = [A for A, S, L in ActSceneLine]
Scene = [S for A, S, L in ActSceneLine]
Line = [L for A, S, L in ActSceneLine]
# removes ActSceneLine from dataframe and adds in Act, Scene, Line
shakespeareData = shakespeareData.drop(columns='ActSceneLine')
shakespeareData['Act'] = Act
shakespeareData['Scene'] = Scene
shakespeareData['Line'] = Line
shakespeareData

Unnamed: 0_level_0,Play,PlayerLinenumber,Player,PlayerLine,PlayerLineLength,Act,Scene,Line
Dataline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Henry IV,1.0,KING HENRY IV,"So shaken as we are, so wan with care,",38,1,1,1
2,Henry IV,1.0,KING HENRY IV,"Find we a time for frighted peace to pant,",42,1,1,2
3,Henry IV,1.0,KING HENRY IV,And breathe short-winded accents of new broils,46,1,1,3
4,Henry IV,1.0,KING HENRY IV,To be commenced in strands afar remote.,39,1,1,4
5,Henry IV,1.0,KING HENRY IV,No more the thirsty entrance of this soil,41,1,1,5
...,...,...,...,...,...,...,...,...
105148,A Winters Tale,38.0,LEONTES,"Is troth-plight to your daughter. Good Paulina,",47,5,3,179
105149,A Winters Tale,38.0,LEONTES,"Lead us from hence, where we may leisurely",42,5,3,180
105150,A Winters Tale,38.0,LEONTES,Each one demand an answer to his part,37,5,3,181
105151,A Winters Tale,38.0,LEONTES,Perform'd in this wide gap of time since first,46,5,3,182


In [147]:
# swaps player to the last column since it is the classification and other columns are features
shakespeareData = shakespeareData[["Play", "PlayerLinenumber","PlayerLine","PlayerLineLength","Act","Scene","Line","Player"]]
shakespeareData

Unnamed: 0_level_0,Play,PlayerLinenumber,PlayerLine,PlayerLineLength,Act,Scene,Line,Player
Dataline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Henry IV,1.0,"So shaken as we are, so wan with care,",38,1,1,1,KING HENRY IV
2,Henry IV,1.0,"Find we a time for frighted peace to pant,",42,1,1,2,KING HENRY IV
3,Henry IV,1.0,And breathe short-winded accents of new broils,46,1,1,3,KING HENRY IV
4,Henry IV,1.0,To be commenced in strands afar remote.,39,1,1,4,KING HENRY IV
5,Henry IV,1.0,No more the thirsty entrance of this soil,41,1,1,5,KING HENRY IV
...,...,...,...,...,...,...,...,...
105148,A Winters Tale,38.0,"Is troth-plight to your daughter. Good Paulina,",47,5,3,179,LEONTES
105149,A Winters Tale,38.0,"Lead us from hence, where we may leisurely",42,5,3,180,LEONTES
105150,A Winters Tale,38.0,Each one demand an answer to his part,37,5,3,181,LEONTES
105151,A Winters Tale,38.0,Perform'd in this wide gap of time since first,46,5,3,182,LEONTES


In [148]:
# export feature engineered / cleaned data set to the processed data directory 
shakespeareData.to_csv('../data/processed/Shakespeare_cleaned.csv')

        

Split data set into features and labels then split both into training and testing datasets.
Also encodes string data for usage in RandomForrest Classifier

In [149]:
# remove the players line string so we can run random forest
shakespeareData = shakespeareData.drop(columns = 'PlayerLine')

# encode the plays
encodePlays = preprocessing.LabelEncoder()
encodePlays.fit(shakespeareData.iloc[:, 0].values)
Plays = encodePlays.transform(shakespeareData.iloc[:, 0].values)
shakespeareData['Play'] = Plays

# encode labels for use in classifier
encodedLabels = preprocessing.LabelEncoder()
encodedLabels.fit(shakespeareData.iloc[:, 6].values)
encoded_label = encodedLabels.transform(shakespeareData.iloc[:, 6].values)
shakespeareData['Player'] = encoded_label

# spliting into features and labels
attributes = shakespeareData.iloc[:, 0:6].values
labels = shakespeareData.iloc[:, 6].values

#spliting into training and testing 
Atr_train, Atr_test, label_train, label_test = train_test_split(attributes, labels, test_size=0.2, random_state=0)

Now, I will use scikit-learn's random forest regressor to do random forest classification on the data set.

In [150]:
forest = RandomForestClassifier(n_estimators=20, random_state=0)
forest.fit(Atr_train, label_train)
label_pred = forest.predict(Atr_test)

# prints out the accuracy of predictions 
print(accuracy_score(label_test, label_pred))

0.7353906138557368


The random forrest classifier works well on this data set. 
Now, I will run a Naive Bayes on the training data and check the prediction accuracy.

In [151]:
naive = GaussianNB()
naive.fit(Atr_train, label_train)
label_pred = naive.predict(Atr_test)
print(accuracy_score(label_test, label_pred))

0.22609481241976132


As expected, the naive bayes does not perform well on this data set. The features of this data set are correlated and the data set does not follow a statistical distribution 