# Supervised ML Exploration

### Use the skills I learned about NB, SVM, DT, and Random Forest to find the best model to identify letters

In [1]:
#import the letter recognition data into a data frame https://archive.ics.uci.edu/ml/machine-learning-databases/letter-recognition/
from pathlib import Path

import pandas as pd

# Build the path from the current user's home directory instead of hard-coding
# one user's absolute path; change this single constant if the CSV lives elsewhere.
LETTER_DATA_PATH = Path.home() / 'Desktop' / 'letter_recognition_data.csv'
letter_df = pd.read_csv(LETTER_DATA_PATH)

In [2]:
#preview the first five rows to sanity-check the load
#NOTE: the original txt file was converted to csv, with column headers added by hand in excel
letter_df.head(n=5)

Unnamed: 0,lettr,x-box,y-box,width,high,onpix,x-bar,y-bar,x2bar,y2bar,xybar,x2ybr,xy2br,x-ege,xegvy,y-ege,yegvx
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [7]:
#create label and feature data sets

import numpy as np
from sklearn import preprocessing
# X is features: every column of letter_df except the 'lettr' label.
# The `columns=` keyword is used because passing the axis positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises a TypeError
# from pandas 2.0 onwards.
X = np.array(letter_df.drop(columns=['lettr']))
# Scale X by preprocessing (each feature centered and scaled to unit variance)
# NOTE(review): the scaling is computed on the full data set before the
# train/test split, so test rows contribute to the scaling statistics.
X = preprocessing.scale(X)

#y is label
y = np.array(letter_df['lettr'])



In [8]:
#Make training and test data, used the default 25% test size
from sklearn.model_selection import train_test_split

# Fix the shuffle seed so the split -- and therefore every timing/accuracy
# figure computed from it -- is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Try out an SVM

In [9]:
#Use a standard SVM with out of the box parameters to predict the letter
from sklearn.metrics import accuracy_score
from sklearn import svm
from time import time

clf = svm.SVC(gamma='auto')

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))

training time: 2.347 s
prediction time: 2.193 s
Accuracy 0.946


In [18]:
#Tune the SVM by adjusting the Kernel and C parameter
from sklearn.metrics import accuracy_score
from sklearn import svm
from time import time

#rbf kernel with a really high C; such a high C could lead to overfitting
clf = svm.SVC(gamma='auto', kernel='rbf', C=100000)

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))

training time: 2.014 s
prediction time: 1.487 s
Accuracy 0.9714


In [11]:
#Tune the SVM, slightly lower C
from sklearn.metrics import accuracy_score
from sklearn import svm
from time import time

#rbf kernel again, with C reduced to 1000 (still large, so overfitting remains a risk)
clf = svm.SVC(gamma='auto', kernel='rbf', C=1000)

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))

training time: 2.069 s
prediction time: 1.606 s
Accuracy 0.9714


In [12]:
#Tune the SVM, try sigmoid Kernel
from sklearn.metrics import accuracy_score
from sklearn import svm
from time import time

#sigmoid kernel with C=1000
clf = svm.SVC(gamma='auto', kernel='sigmoid', C=1000)

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))

training time: 2.985 s
prediction time: 3.087 s
Accuracy 0.4344


### Why would a SVM be good for this?

#### Good - There are not a ton of features (16 of them) or rows of data, so an SVM could be good at separating the classes across these multiple dimensions (kernel trick).

## Try Random Forest

In [13]:
#Random Forest with no parameter changes
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from time import time

#library-default forest; random_state is fixed only so the result is reproducible
clf = RandomForestClassifier(random_state=42)

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))

training time: 0.183 s
prediction time: 0.014 s
Accuracy 0.9336




In [14]:
#Random Forest - updated the min sample split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from time import time

#a node needs at least 25 samples before it may be split;
#random_state is fixed only so the result is reproducible
clf = RandomForestClassifier(min_samples_split=25, random_state=42)

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))



training time: 0.164 s
prediction time: 0.014 s
Accuracy 0.9032


In [15]:
#Random Forest - updated the n_estimators to 200 (200 trees)
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from time import time

#forest of 200 trees; random_state is fixed only so the result is reproducible
clf = RandomForestClassifier(n_estimators=200, random_state=42)

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))

training time: 3.225 s
prediction time: 0.294 s
Accuracy 0.964


In [16]:
#Random Forest - updated the n_estimators to 1000
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from time import time

#forest of 1000 trees; random_state is fixed only so the result is reproducible
clf = RandomForestClassifier(n_estimators=1000, random_state=42)

#time the fit on the training split
t0 = time()
clf.fit(X_train, y_train)
print("training time:", round(time() - t0, 3), "s")

#time the prediction on the held-out test split
t0 = time()
pred = clf.predict(X_test)
print("prediction time:", round(time() - t0, 3), "s")

#get accuracy; accuracy_score's documented signature is (y_true, y_pred)
acc = accuracy_score(y_test, pred)
print('Accuracy ' + str(acc))

training time: 15.288 s
prediction time: 1.431 s
Accuracy 0.9666


#### Random forest seemed like a good choice since it is hard to overfit and is typically good at CV classification. Increasing n_estimators helped, but going really big gave only a marginal gain. The SVM performed much better overall given its time and accuracy.