# Project 2 - To Be or Not To Be
## Import Packages

In [1]:
import numpy as np
import pandas as pd

#Usefull functions
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#models
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#ignore warnings
import warnings
warnings.filterwarnings("ignore") 

## Import Dataset

In [2]:
df = pd.read_csv("Data\\Shakespeare_data.csv")
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


## Data Cleaning
### Take a look of dataset information

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111396 entries, 0 to 111395
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Dataline          111396 non-null  int64  
 1   Play              111396 non-null  object 
 2   PlayerLinenumber  111393 non-null  float64
 3   ActSceneLine      105153 non-null  object 
 4   Player            111389 non-null  object 
 5   PlayerLine        111396 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 3.4+ MB


In [4]:
df.isnull().sum()

Dataline               0
Play                   0
PlayerLinenumber       3
ActSceneLine        6243
Player                 7
PlayerLine             0
dtype: int64

### Delete Null variable rows

In [5]:
df_noNull = df.dropna()
df_noNull.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"
5,6,Henry IV,1.0,1.1.3,KING HENRY IV,And breathe short-winded accents of new broils
6,7,Henry IV,1.0,1.1.4,KING HENRY IV,To be commenced in strands afar remote.
7,8,Henry IV,1.0,1.1.5,KING HENRY IV,No more the thirsty entrance of this soil


In [6]:
df_noNull.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105152 entries, 3 to 111394
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Dataline          105152 non-null  int64  
 1   Play              105152 non-null  object 
 2   PlayerLinenumber  105152 non-null  float64
 3   ActSceneLine      105152 non-null  object 
 4   Player            105152 non-null  object 
 5   PlayerLine        105152 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 4.0+ MB


### Drop unused columns from data set and make it easier to observe

In [7]:
df2 = df_noNull.drop(['ActSceneLine','Dataline'], axis='columns')
linelength = [len(i) for i in df2['PlayerLine']]
df2['LineLength']=linelength
df2.head()
dff = df2[['Play','PlayerLinenumber','PlayerLine','LineLength','Player']]
dff.head()

Unnamed: 0,Play,PlayerLinenumber,PlayerLine,LineLength,Player
3,Henry IV,1.0,"So shaken as we are, so wan with care,",38,KING HENRY IV
4,Henry IV,1.0,"Find we a time for frighted peace to pant,",42,KING HENRY IV
5,Henry IV,1.0,And breathe short-winded accents of new broils,46,KING HENRY IV
6,Henry IV,1.0,To be commenced in strands afar remote.,39,KING HENRY IV
7,Henry IV,1.0,No more the thirsty entrance of this soil,41,KING HENRY IV


### Store the cleaned dataset

In [8]:
dff.to_csv('Data\\UsableData.csv')

## Multi-Model Process

In [9]:
Players = df.pivot_table(index=['Player'], aggfunc='size')
Players

Player
A Lord                1
A Patrician           4
A Player              4
AARON               375
ABERGAVENNY          18
                   ... 
Young MARCIUS         2
of BUCKINGHAM        14
of King Henry VI     10
of Prince Edward     10
of young Princes     11
Length: 934, dtype: int64

### Unified Types and Forms

In [10]:
dff['Play'] = preprocessing.LabelEncoder().fit_transform(dff['Play'])
dff['PlayerLine'] = preprocessing.LabelEncoder().fit_transform(dff['PlayerLine'])
dff['Player'] = preprocessing.LabelEncoder().fit_transform(dff['Player'])
X = dff.loc[:, (dff.columns != 'Player')]
Y = dff['Player'].values

### Splite Train and Test Dataset

In [11]:
X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y,test_size=0.30, random_state=1, shuffle = True)
print(X_train)
print(X_validation)
print(Y_train)
print(Y_validation)

        Play  PlayerLinenumber  PlayerLine  LineLength
9991      13               3.0       50466          39
67952      1              43.0       68771          29
103910    33              31.0       64717          41
42689     14              16.0       57347          32
48553     15             127.0       42752          40
...      ...               ...         ...         ...
53031     18              70.0       94365          51
103945    33               3.0       87745          46
5513      11               6.0         378          45
82290     26              28.0       13900          44
104467    33               1.0       82402          39

[73606 rows x 4 columns]
       Play  PlayerLinenumber  PlayerLine  LineLength
40544    14              23.0       20671          25
52578    17              52.0       57473          43
46204    16              16.0       82062          42
60230    19              16.0       84144          36
65403    21              10.0       42337   

### Gaussian Naive Bayes Model

In [12]:
model = GaussianNB()
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print('Accuracy = '+ str(accuracy_score(Y_validation, predictions)))

Accuracy = 0.1767577505864452


### K Neighbors Classifier Model

In [13]:
model = KNeighborsClassifier()
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print('Accuracy = '+ str(accuracy_score(Y_validation, predictions)))

Accuracy = 0.024852596208711088


### Linear Discriminat Analysis Model

In [14]:
model = LinearDiscriminantAnalysis()
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
print('Accuracy = '+ str(accuracy_score(Y_validation, predictions)))

Accuracy = 0.08077093767831103


### Linear Regression Model

In [15]:
model = LinearRegression()
model.fit(X_train, Y_train)
predictions = model.predict(X_validation)
prediction = []
def convertR(floatlist, target):
    for i in floatlist:
        if i%1>0.5:
            target.append(round(i))
        elif i%1<0.4:
            target.append(round(i)-1)
        else:
            target.append(round(i))

convertR(predictions,prediction)
print('Accuracy = '+ str(accuracy_score(Y_validation, prediction)))

Accuracy = 0.002726177645343308


## Conclusion

In the test process, we use 70% of our dataset to train the each model and 30% to test the prediction. The four models that we chosed did not perform very well. The highest score is Gaussian Naive Bayes model which has 17.6% accuracy.