In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import seaborn as sns
from pandas import set_option
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import plotly.offline as py
import plotly.graph_objs as go

import time

In [2]:
df = pd.read_csv('chineseMNIST.csv')

print("Rows and Columns of Raw Data :",df.to_numpy().shape)

Rows and Columns of Raw Data : (15000, 4098)


In [3]:
#df['label'].replace(100,11,inplace=True)
#df['label'].replace(1000,12,inplace=True)
#df['label'].replace(10000,13,inplace=True)
#df['label'].replace(100000000,14,inplace=True)

labels = df['label']

df.drop(['label'], axis=1, inplace=True)
df.drop(['character'], axis=1, inplace=True)
print("Rows and Columns of Raw Data :",df.to_numpy().shape)

Rows and Columns of Raw Data : (15000, 4096)


In [4]:
dataset = df.to_numpy()
labels = labels.to_numpy()

x_train, x_test, y_train, y_test = train_test_split(dataset, 
                                                    labels, 
                                                    test_size=0.2, 
                                                    random_state=42)

In [5]:
def visualisation(data1,data2):
    # to view a sample of image in the data
    data1 = data1.reshape(12000,64,64,1)
    data1 = data1.astype('float32') / 255.0
    
    f, ax = plt.subplots(2,5) 
    f.set_size_inches(10, 10)
    f.suptitle("10 Sample of Image", fontsize=20)
    k = 0
    for i in range(2):
        for j in range(5):
            ax[i,j].imshow(data1[k])
            ax[i,j].title.set_text(data2[k])
            k += 1
    plt.tight_layout()   
    
    return 

In [6]:
# visualisation(x_train,y_train)

In [6]:
def visualisation1(data):
    # to view the count of dependent variable in data
    # we have balanced dataset 
    plt.figure(figsize = (10,10))
    sns.set_style("darkgrid")
    
    sns.countplot(data).set(title='Count of Dependent Variable')
    
    return 

In [8]:
# visualisation1(labels)

In [7]:
def GetBasedModel():
    
    basedModels = []
    basedModels.append(('LR', LogisticRegression(solver='sag')))
    basedModels.append(('LDA'  , LinearDiscriminantAnalysis()))
    basedModels.append(('KNN'  , KNeighborsClassifier()))
    basedModels.append(('CART' , DecisionTreeClassifier()))
    basedModels.append(('NB'   , GaussianNB()))
    basedModels.append(('SVM'  , SVC(probability=True)))
    basedModels.append(('AB'   , AdaBoostClassifier()))
    basedModels.append(('GBM'  , GradientBoostingClassifier()))
    basedModels.append(('RF'   , RandomForestClassifier()))
    basedModels.append(('ET'   , ExtraTreesClassifier()))
    
    return basedModels

In [8]:
def BasedLine2(x_train, y_train,models):
    # Test options and evaluation metric
    
    results = []
    names = []
    times = []
    measure = []
    f_measure = [] 
    auc = []
    for name, model in models:
        
        start_time = time.time()
        model.fit(x_train,y_train)
        predictions = model.predict(x_test)
        accuracy = accuracy_score(y_test, predictions)
        measurement = precision_recall_fscore_support(y_test, predictions, average='weighted')
        f = f1_score(y_test, predictions, average='weighted')
        auc_score = roc_auc_score(y_test, model.predict_proba(x_test), multi_class='ovr')
        results.append(accuracy)
        measure.append(measurement)
        f_measure.append(f)
        auc.append(auc_score)
        names.append(name)
        end_time = time.time()
        times.append((end_time - start_time))
        msg = "%s: %.2f (%.2s seconds)" % (name, accuracy, (end_time - start_time))
        print(msg)

    return names, results, times, measure, f_measure, auc

In [9]:
def ScoreDataFrame(names,results,times):
    def floatingDecimals(f_val, dec=3):
        prc = "{:."+str(dec)+"f}" 
    
        return float(prc.format(f_val))

    scores = []
    for r in results:
        scores.append(floatingDecimals(r.mean(),4))

    scoreDataFrame = pd.DataFrame({'Model':names, 'Score': scores,'Time':times})
    return scoreDataFrame

In [10]:
models = GetBasedModel()
option_model,option_result, time_taken , result_4 ,result_f , result_AUC= BasedLine2(x_train, y_train,models)
print('Done')
basedLineScore = ScoreDataFrame(option_model,option_result,time_taken)
print(basedLineScore)
print('Completed')


The max_iter was reached which means the coef_ did not converge



LR: 0.39 (23 seconds)
LDA: 0.38 (49 seconds)
KNN: 0.40 (8. seconds)
CART: 0.34 (7. seconds)
NB: 0.24 (4. seconds)
SVM: 0.72 (34 seconds)
AB: 0.25 (52 seconds)
GBM: 0.58 (49 seconds)
RF: 0.61 (26 seconds)
ET: 0.70 (34 seconds)
Done


In [11]:
result_4

[(0.3866287760624562, 0.3903333333333333, 0.3866776311656264, None),
 (0.3836911410154583, 0.37733333333333335, 0.37143667637582184, None),
 (0.5954817137853705, 0.398, 0.41724221532817546, None),
 (0.3267274190605826, 0.335, 0.3295050948950002, None),
 (0.3405846326238808, 0.23966666666666667, 0.1975997950547864, None),
 (0.7254058170722139, 0.718, 0.7156461696815752, None),
 (0.23644063931002718, 0.24666666666666667, 0.23266341942575608, None),
 (0.5823797457686141, 0.5813333333333334, 0.5765507083296516, None),
 (0.6078705776241754, 0.6066666666666667, 0.5996475015431758, None),
 (0.7088880695515237, 0.7013333333333334, 0.6973859998113573, None)]

In [12]:
result_f

[0.3866776311656264,
 0.37143667637582184,
 0.41724221532817546,
 0.3295050948950002,
 0.1975997950547864,
 0.7156461696815752,
 0.23266341942575608,
 0.5765507083296516,
 0.5996475015431758,
 0.6973859998113573]

In [13]:
result_AUC

[0.8088554720430509,
 0.7587558906256999,
 0.83558099126386,
 0.6468092273163325,
 0.6175290102919263,
 0.9582012310271659,
 0.715512295873836,
 0.9197868318018855,
 0.9320825477439344,
 0.9543996196450825]

In [14]:
time_taken

[232.04332304000854,
 49.242473125457764,
 8.49470067024231,
 7.196629524230957,
 4.5453941822052,
 3446.116340637207,
 52.549927949905396,
 4915.511994123459,
 26.406840324401855,
 34.9475781917572]

In [15]:
option_result

[0.3903333333333333,
 0.37733333333333335,
 0.398,
 0.335,
 0.23966666666666667,
 0.718,
 0.24666666666666667,
 0.5813333333333334,
 0.6066666666666667,
 0.7013333333333334]

In [16]:
option_model

['LR', 'LDA', 'KNN', 'CART', 'NB', 'SVM', 'AB', 'GBM', 'RF', 'ET']