## Install Packages

In [145]:
%pip install seaborn
%matplotlib inline

Note: you may need to restart the kernel to use updated packages.


In [146]:
pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [167]:
import numpy as np
import pandas as pd
import plotly.express as px
from time import time

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import  train_test_split
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score, log_loss
from sklearn.model_selection import StratifiedShuffleSplit, ShuffleSplit, GridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

### Import CSV FILES

In [148]:
df_train = pd.read_csv("/Users/isa/Desktop/LeafClassification/train.csv")
df_test = pd.read_csv("/Users/isa/Desktop/LeafClassification/test.csv")

## Data Preprocessing

In [149]:
df_train.head()
df_train_cp = df_train.copy()

In [150]:
df_test.head()

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,4,0.019531,0.009766,0.078125,0.011719,0.003906,0.015625,0.005859,0.0,0.005859,...,0.006836,0.0,0.015625,0.000977,0.015625,0.0,0.0,0.0,0.003906,0.053711
1,7,0.007812,0.005859,0.064453,0.009766,0.003906,0.013672,0.007812,0.0,0.033203,...,0.0,0.0,0.006836,0.001953,0.013672,0.0,0.0,0.000977,0.037109,0.044922
2,9,0.0,0.0,0.001953,0.021484,0.041016,0.0,0.023438,0.0,0.011719,...,0.12891,0.0,0.000977,0.0,0.0,0.0,0.0,0.015625,0.0,0.0
3,12,0.0,0.0,0.009766,0.011719,0.017578,0.0,0.003906,0.0,0.003906,...,0.012695,0.015625,0.00293,0.036133,0.013672,0.0,0.0,0.089844,0.0,0.008789
4,13,0.001953,0.0,0.015625,0.009766,0.039062,0.0,0.009766,0.0,0.005859,...,0.0,0.042969,0.016602,0.010742,0.041016,0.0,0.0,0.007812,0.009766,0.007812


In [151]:
df_train['species'].nunique()

99

In [152]:
plot_df = df_train['species'].value_counts().sort_values(ascending=False)
fig = px.bar(plot_df, x=plot_df.index, y=plot_df, color='species',
             labels={'index':'Duration', 'y':'Count'}, height=500, width=900, color_continuous_scale=px.colors.sequential.Sunset)
fig.update_layout(title="Count of species in each duration category")
fig.show()

In [153]:
le = LabelEncoder().fit(df_train['species'])
df_train['label'] = le.transform(df_train['species'])
labels = df_train['label']
df_train = df_train.drop(columns=['id','species','label'])

In [154]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(df_train)
train_scale = pd.DataFrame(scaler.transform(df_train))

In [155]:
test_id = df_test.id
test_data = df_test.drop(['id'], axis=1)
test_features_scale = scaler.transform(test_data)

In [156]:
x = df_train
y = df_train_cp['species']

In [162]:
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
for train_index, val_index in sss.split(train_scale,  labels):
    x_train, x_val = train_scale.iloc[train_index], train_scale.iloc[val_index]
    y_train, y_val = labels.iloc[train_index], labels.iloc[val_index]

In [163]:
cv_sets = ShuffleSplit(n_splits=10,test_size=0.20,random_state=42)
classifiers = [RandomForestClassifier(), SVC(), KNeighborsClassifier()]
params = [{'n_estimators' : [3,10,30], 'max_features':[2,4,6,8]},
          {'kernel':('linear','poly','sigmoid','rbf'),'C':[0.01,0.05,0.025,0.07,0.09,1.0], 'gamma':['scale'], 'probability':[True]},
          {'n_neighbors': [3,5,7,9]}]

In [164]:
best_estimators = []
for classifier, param in zip(classifiers, params):
    grid = GridSearchCV(classifier,param,cv=cv_sets)
    grid = grid.fit(x_train,y_train)
    best_estimators.append(grid.best_estimator_)

In [165]:
best_estimators

[RandomForestClassifier(max_features=6, n_estimators=30),
 SVC(C=0.025, kernel='linear', probability=True),
 KNeighborsClassifier(n_neighbors=3)]

In [168]:
for estimator in best_estimators:
    estimator.fit(x_train, y_train)
    name = estimator.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    print('**Training set**')
    train_predictions = estimator.predict(x_train)
    acc = accuracy_score(y_train, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = estimator.predict_proba(x_train)
    ll = log_loss(y_train, train_predictions)
    print("Log Loss: {}".format(ll))
    
    print('**Validation set**')
    train_predictions = estimator.predict(x_val)
    acc = accuracy_score(y_val, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = estimator.predict_proba(x_val)
    ll = log_loss(y_val, train_predictions)
    print("Log Loss: {}".format(ll))
    
print("="*30)

RandomForestClassifier
****Results****
**Training set**
Accuracy: 100.0000%
Log Loss: 0.24625558191756503
**Validation set**
Accuracy: 97.4747%
Log Loss: 0.8338787421453393
SVC
****Results****
**Training set**
Accuracy: 100.0000%
Log Loss: 2.2059498779787847
**Validation set**
Accuracy: 99.4949%
Log Loss: 2.318575900964872
KNeighborsClassifier
****Results****
**Training set**
Accuracy: 98.2323%
Log Loss: 0.041433802638993475
**Validation set**
Accuracy: 98.4848%
Log Loss: 0.0412192836531407
