In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score, log_loss

In [2]:
# Load the labelled training table; the path is relative to the notebook's working directory.
train = pd.read_csv('data_3/train.csv')

In [3]:
# Summarise the index range, column count, dtypes and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Columns: 194 entries, id to texture64
dtypes: float64(192), int64(1), object(1)
memory usage: 1.5+ MB


In [4]:
# Show the first five rows transposed, so the many feature columns read top-to-bottom.
train.head().T

Unnamed: 0,0,1,2,3,4
id,1,2,3,5,6
species,Acer_Opalus,Pterocarya_Stenoptera,Quercus_Hartwissiana,Tilia_Tomentosa,Quercus_Variabilis
margin1,0.007812,0.005859,0.005859,0,0.005859
margin2,0.023438,0,0.009766,0.003906,0.003906
margin3,0.023438,0.03125,0.019531,0.023438,0.048828
...,...,...,...,...,...
texture60,0,0,0,0,0
texture61,0,0,0,0,0
texture62,0.004883,0.000977,0,0.017578,0
texture63,0,0.039062,0.020508,0,0


In [5]:
# Number of distinct values in the `species` target column.
len(train['species'].unique())

99

In [6]:
def encode_strings(df):
    """Label-encode every object-dtype column of ``df``.

    The frame is modified in place: each object-dtype column is overwritten
    with the output of its fitted ``LabelEncoder``.  Pass a copy if the
    original values must be preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns should be encoded.

    Returns
    -------
    tuple
        ``(encoders, encoded_columns, df)`` where ``encoders`` maps each
        encoded column name to its fitted ``LabelEncoder``,
        ``encoded_columns`` lists those column names in frame order, and
        ``df`` is the same (now mutated) frame that was passed in.
    """
    categorical_cols = []
    for column in df.columns:
        if df[column].dtype == 'O':
            categorical_cols.append(column)

    # Fit every encoder before any column is overwritten.
    fitted_labelEncoders = {
        column: LabelEncoder().fit(df[column]) for column in categorical_cols
    }

    for column, encoder in fitted_labelEncoders.items():
        df[column] = encoder.transform(df[column])

    return fitted_labelEncoders, categorical_cols, df

In [7]:
# Encode on a copy: encode_strings overwrites columns in place, and `train` should keep its raw labels.
labelEncoders, categorical_cols, df_encoded = encode_strings(train.copy())

In [36]:
# Stratified 80/20 split on the encoded species label, seeded for reproducibility.
# NOTE(review): only `species` is dropped, so the `id` column remains in the feature matrix.
trainX, testX, trainY, testY = train_test_split(
    df_encoded.drop(columns='species'),
    df_encoded['species'],
    random_state=10,
    test_size=0.2,
    stratify=df_encoded['species'],
)

In [37]:
# The target Series keeps its column name; calculate_scores uses it as the key into labelEncoders.
testY.name

'species'

In [46]:
class Classifiers:
    """Fit a fixed set of scikit-learn classifiers and score them side by side.

    The models are, in order: a decision tree, a random forest, an SVC with
    probability estimates enabled, and Gaussian naive Bayes.  The result
    lists ``preds``, ``probs_pred``, ``accuracy_scores`` and ``log_losses``
    are index-aligned with ``models``.

    All state is stored on the instance.  Mutable class-level lists would be
    shared by every instance, so a second ``Classifiers()`` (for example
    after re-running a notebook cell) would be scored against the first
    instance's stale predictions.
    """

    def __init__(self, random_state=None):
        """Create the unfitted models and empty result containers.

        Parameters
        ----------
        random_state : int or None, optional
            Seed forwarded to the stochastic estimators (tree, forest, SVC).
            The default ``None`` leaves them unseeded.
        """
        self.models = [
            DecisionTreeClassifier(random_state=random_state),
            RandomForestClassifier(random_state=random_state),
            SVC(probability=True, random_state=random_state),
            GaussianNB(),
        ]
        self.probs_pred = []
        self.preds = []
        self.accuracy_scores = []
        self.log_losses = []

    def get_log_losses(self):
        """Return the log-loss values from the latest ``calculate_scores`` call."""
        return self.log_losses

    def get_accuracy_scores(self):
        """Return the accuracy values from the latest ``calculate_scores`` call."""
        return self.accuracy_scores

    def fit(self, trainX, trainY):
        """Fit every model on the same training features and labels."""
        for model in self.models:
            model.fit(trainX, trainY)

    def predict(self, testX):
        """Store hard predictions and class probabilities for every model.

        Results from any earlier call are discarded, so the lists always hold
        exactly one entry per model.
        """
        self.preds = []
        self.probs_pred = []
        # Scores computed from earlier predictions would be stale; drop them.
        self.accuracy_scores = []
        self.log_losses = []
        for model in self.models:
            self.preds.append(model.predict(testX))
            self.probs_pred.append(model.predict_proba(testX))

    def calculate_scores(self, labelEncoders, testY):
        """Compute, store and print accuracy and log loss for every model.

        Parameters
        ----------
        labelEncoders : dict
            Maps a column name to a fitted encoder; the entry for
            ``testY.name`` is used to decode labels before computing accuracy.
        testY : pandas.Series
            Encoded true labels for the rows passed to ``predict``.

        Raises
        ------
        RuntimeError
            If ``predict`` has not produced one prediction per model.
        """
        if len(self.preds) != len(self.models):
            raise RuntimeError('predict() must be called before calculate_scores()')

        encoder = labelEncoders[testY.name]
        y_true = encoder.inverse_transform(testY)

        self.accuracy_scores = []
        self.log_losses = []
        for model, pred, prob in zip(self.models, self.preds, self.probs_pred):
            y_pred = encoder.inverse_transform(pred)
            accScore = accuracy_score(y_true, y_pred)

            # predict_proba columns follow model.classes_; passing them keeps
            # the columns aligned even if testY does not contain every class.
            logLoss = log_loss(testY, prob, labels=getattr(model, 'classes_', None))

            self.accuracy_scores.append(accScore)
            self.log_losses.append(logLoss)
            print(f'{model} : {round(accScore*100,2)} % | {logLoss}')

In [47]:
# Fit all candidate classifiers on the training split, predict on the hold-out split,
# then print one line per model: accuracy (%) | log loss.
models = Classifiers()
models.fit(trainX, trainY)
models.predict(testX)
models.calculate_scores(labelEncoders, testY)

DecisionTreeClassifier() : 73.74 % | 9.07078976027967
RandomForestClassifier() : 97.98 % | 0.7659486127753398
SVC(probability=True) : 0.51 % | 4.608415456133478
GaussianNB() : 92.42 % | 0.8001221477601873


> ## The Random Forest classifier gives the best results of the four models (highest accuracy, lowest log loss)

In [48]:
# Load the test set to predict on; the path is relative to the notebook's working directory.
full_Test = pd.read_csv('data_3/test.csv')

In [49]:
# Display the test frame (pandas abbreviates long/wide frames in the rendered output).
full_Test

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,4,0.019531,0.009766,0.078125,0.011719,0.003906,0.015625,0.005859,0.000000,0.005859,...,0.006836,0.000000,0.015625,0.000977,0.015625,0.0,0.0,0.000000,0.003906,0.053711
1,7,0.007812,0.005859,0.064453,0.009766,0.003906,0.013672,0.007812,0.000000,0.033203,...,0.000000,0.000000,0.006836,0.001953,0.013672,0.0,0.0,0.000977,0.037109,0.044922
2,9,0.000000,0.000000,0.001953,0.021484,0.041016,0.000000,0.023438,0.000000,0.011719,...,0.128910,0.000000,0.000977,0.000000,0.000000,0.0,0.0,0.015625,0.000000,0.000000
3,12,0.000000,0.000000,0.009766,0.011719,0.017578,0.000000,0.003906,0.000000,0.003906,...,0.012695,0.015625,0.002930,0.036133,0.013672,0.0,0.0,0.089844,0.000000,0.008789
4,13,0.001953,0.000000,0.015625,0.009766,0.039062,0.000000,0.009766,0.000000,0.005859,...,0.000000,0.042969,0.016602,0.010742,0.041016,0.0,0.0,0.007812,0.009766,0.007812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589,1576,0.000000,0.000000,0.003906,0.015625,0.041016,0.000000,0.017578,0.000000,0.005859,...,0.098633,0.000000,0.004883,0.000000,0.003906,0.0,0.0,0.018555,0.000000,0.000977
590,1577,0.000000,0.003906,0.003906,0.005859,0.017578,0.000000,0.017578,0.005859,0.000000,...,0.012695,0.004883,0.004883,0.002930,0.009766,0.0,0.0,0.090820,0.000000,0.016602
591,1579,0.017578,0.029297,0.015625,0.013672,0.003906,0.015625,0.025391,0.000000,0.000000,...,0.073242,0.000000,0.028320,0.000000,0.001953,0.0,0.0,0.000000,0.042969,0.006836
592,1580,0.013672,0.009766,0.060547,0.025391,0.035156,0.025391,0.039062,0.000000,0.003906,...,0.003906,0.000000,0.000977,0.000000,0.011719,0.0,0.0,0.000000,0.011719,0.018555


In [61]:
# Refit a standalone random forest for the final predictions.
# The seed is fixed (same value as the train/test split) so a fresh
# Restart & Run All rebuilds the same forest and reports the same score.
model_RandForest = RandomForestClassifier(random_state=10)
model_RandForest.fit(trainX, trainY)

# testX/testY are the 20% hold-out from train_test_split, so this measures
# hold-out (validation) accuracy, not training accuracy.  Accuracy is the same
# on encoded and decoded labels, so no inverse_transform is needed.
holdout_pred = model_RandForest.predict(testX)
holdout_score = accuracy_score(testY, holdout_pred)
train_score = holdout_score  # legacy name kept in case later cells reference it
print(f'Accuracy on hold-out split : {holdout_score}')


Accuracy on train : 0.9848484848484849


In [62]:
# Predict species for the test set.  The model was fitted on encoded labels, so these are
# integer codes; use labelEncoders['species'].inverse_transform(predictions) to get names.
# NOTE(review): full_Test still contains `id`, matching trainX, which also kept `id` as a feature.
predictions = model_RandForest.predict(full_Test)