# Leaf classification (Kaggle)

Making leaf classification from kaggle competitions https://www.kaggle.com/c/leaf-classification

Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.preprocessing import PolynomialFeatures



Loading datasets.

In [3]:
data = pd.read_csv('C:\Users\Nadiia\Downloads\leaf_train.csv')
test_data = pd.read_csv('C:\Users\Nadiia\Downloads\leaf_test.csv')

Getting some data summary

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Columns: 194 entries, id to texture64
dtypes: float64(192), int64(1), object(1)
memory usage: 1.5+ MB


In [6]:
data.describe()

Unnamed: 0,id,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
count,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,...,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0,990.0
mean,799.59596,0.017412,0.028539,0.031988,0.02328,0.014264,0.038579,0.019202,0.001083,0.007167,...,0.036501,0.005024,0.015944,0.011586,0.016108,0.014017,0.002688,0.020291,0.008989,0.01942
std,452.477568,0.019739,0.038855,0.025847,0.028411,0.01839,0.05203,0.017511,0.002743,0.008933,...,0.063403,0.019321,0.023214,0.02504,0.015335,0.060151,0.011415,0.03904,0.013791,0.022768
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,415.25,0.001953,0.001953,0.013672,0.005859,0.001953,0.0,0.005859,0.0,0.001953,...,0.0,0.0,0.000977,0.0,0.004883,0.0,0.0,0.0,0.0,0.000977
50%,802.5,0.009766,0.011719,0.025391,0.013672,0.007812,0.015625,0.015625,0.0,0.005859,...,0.004883,0.0,0.005859,0.000977,0.012695,0.0,0.0,0.003906,0.00293,0.011719
75%,1195.5,0.025391,0.041016,0.044922,0.029297,0.017578,0.056153,0.029297,0.0,0.007812,...,0.043701,0.0,0.022217,0.009766,0.021484,0.0,0.0,0.023438,0.012695,0.029297
max,1584.0,0.087891,0.20508,0.15625,0.16992,0.11133,0.31055,0.091797,0.03125,0.076172,...,0.42969,0.20215,0.17285,0.2002,0.10645,0.57813,0.15137,0.37598,0.086914,0.1416


In [7]:
data.species.value_counts()

Quercus_Infectoria_sub          10
Alnus_Sieboldiana               10
Acer_Palmatum                   10
Quercus_Vulcanica               10
Callicarpa_Bodinieri            10
Eucalyptus_Glaucescens          10
Quercus_Cerris                  10
Acer_Pictum                     10
Ilex_Cornuta                    10
Morus_Nigra                     10
Cornus_Controversa              10
Eucalyptus_Urnigera             10
Cornus_Macrophylla              10
Quercus_Pyrenaica               10
Betula_Pendula                  10
Alnus_Maximowiczii              10
Cercis_Siliquastrum             10
Quercus_Phillyraeoides          10
Quercus_Rubra                   10
Acer_Rubrum                     10
Quercus_Semecarpifolia          10
Zelkova_Serrata                 10
Quercus_Afares                  10
Quercus_Ellipsoidalis           10
Tilia_Platyphyllos              10
Pterocarya_Stenoptera           10
Quercus_Crassifolia             10
Olea_Europaea                   10
Tilia_Oliveri       

Specifying target variable as y and matrix of attributes, train as X, test as X_t

In [8]:
y = data[[1]]

In [9]:
X = data.iloc[:, 2:]

In [10]:
X_t = test_data.iloc[:, 1:]

Watching shapes of matrices

In [13]:
X.shape

(990, 192)

In [14]:
X_t.shape

(594, 192)

Splitting matrices into random train and test subsets

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y, stratify=y, random_state=0)

## Logistic regression

For applying logistic regression classifier let's generate polynomial combinations of the features with degree 2. 

In [16]:
transform = PolynomialFeatures(2)
X_train_transform = transform.fit_transform(X_train)
X_test_transform = transform.transform(X_test)
X_t_transform = transform.transform(X_t)

And scale data

In [17]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_transform)
X_test_scaled = scaler.transform(X_test_transform)
X_t_scaled = scaler.fit_transform(X_t_transform)

Let's apply Logistic Regression classifier with L2 regularization and balanced classes and print out accuracy score for predicted values.

In [31]:
clf_newton_cg = LogisticRegression(class_weight = 'balanced',fit_intercept = False, 
                                   multi_class='multinomial', solver = 'newton-cg')
clf_newton_cg.fit(X_train_scaled, y_train)
print accuracy_score(y_test, clf_newton_cg.predict(X_test_scaled)), clf_newton_cg.score(X_test_scaled, y_test)

0.991935483871 0.991935483871


Trying different solvers and choosing the best.

In [279]:
clf_lbfgs = LogisticRegression(class_weight = 'balanced',fit_intercept = False, multi_class='multinomial', 
                         solver = 'lbfgs')
clf_lbfgs.fit(X_train_pca, y_train)
print accuracy_score(y_test, clf_lbfgs.predict(X_test_scaled))

0.983870967742


In [282]:
clf_sag = LogisticRegression(class_weight = 'balanced',fit_intercept = False, multi_class='multinomial', 
                         solver = 'sag', max_iter = 150)
clf_sag.fit(X_train_pca, y_train)
print accuracy_score(y_test, clf_sag.predict(X_test_scaled))

0.983870967742


Let's find the best parameters for classifier with 'newton-cg' solver.

In [24]:
param_grid = {'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]}

Reshape y_train.values for GridSearchCV optimizer

In [25]:
y_train.values.shape

(742L, 1L)

In [26]:
np.reshape(y_train.values, (742,)) 

array(['Cornus_Controversa', 'Olea_Europaea', 'Callicarpa_Bodinieri',
       'Quercus_Coccinea', 'Cornus_Macrophylla', 'Quercus_Agrifolia',
       'Acer_Capillipes', 'Quercus_Cerris', 'Cotinus_Coggygria',
       'Tilia_Tomentosa', 'Quercus_Alnifolia', 'Quercus_Chrysolepis',
       'Quercus_Kewensis', 'Viburnum_x_Rhytidophylloides',
       'Quercus_Brantii', 'Cornus_Chinensis', 'Quercus_Crassipes',
       'Acer_Rufinerve', 'Cercis_Siliquastrum', 'Quercus_Afares',
       'Quercus_Rhysophylla', 'Quercus_x_Turneri', 'Quercus_Hartwissiana',
       'Acer_Platanoids', 'Acer_Mono', 'Quercus_Brantii', 'Alnus_Cordata',
       'Quercus_x_Hispanica', 'Ilex_Aquifolium', 'Prunus_Avium',
       'Quercus_Afares', 'Acer_Saccharinum', 'Cornus_Chinensis',
       'Quercus_Phillyraeoides', 'Arundinaria_Simonii', 'Acer_Opalus',
       'Populus_Adenopoda', 'Quercus_Coccifera', 'Cornus_Chinensis',
       'Cornus_Controversa', 'Viburnum_Tinus', 'Quercus_Nigra',
       'Phildelphus', 'Quercus_Infectoria_sub', '

In [28]:
optimizer = GridSearchCV(LogisticRegression(class_weight = 'balanced',
                    fit_intercept = False, multi_class='multinomial', 
                                   solver = 'newton-cg'), param_grid)
optimizer.fit(X_train_scaled, np.reshape(y_train.values, (742,)))
print optimizer.best_score_, optimizer.best_params_

0.977088948787 {'C': 0.5}


Score of classifier with C=0.05

In [29]:
clf_newton_cg = LogisticRegression(C = 0.05, class_weight = 'balanced',fit_intercept = False, 
                                   multi_class='multinomial', solver = 'newton-cg')
clf_newton_cg.fit(X_train_scaled, y_train)
print accuracy_score(y_test, clf_newton_cg.predict(X_test_scaled)), clf_newton_cg.score(X_test_scaled, y_test)

0.991935483871 0.991935483871


Predict probabylity for test dataset and transform results to the appropriate form.

In [41]:
probability = clf_newton_cg.predict_proba(X_t_scaled)

In [42]:
cols = np.unique(data.species)

In [43]:
test_data.shape

(594, 193)

In [44]:
df = pd.DataFrame(probability, columns = cols, index = test_data.id)
df

Unnamed: 0_level_0,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,Acer_Saccharinum,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.825121e-07,6.272494e-08,3.881375e-10,3.280180e-05,8.084558e-08,1.956119e-08,3.068614e-09,4.916420e-08,2.042373e-08,3.081083e-07,...,2.553875e-09,1.088645e-05,1.271599e-07,5.437408e-08,3.558671e-06,9.192556e-09,2.085834e-09,9.333155e-10,5.396885e-06,2.657860e-08
7,1.883475e-06,2.260947e-06,6.195384e-06,3.521894e-05,2.002393e-06,3.545398e-06,2.887354e-04,2.894024e-06,1.979194e-06,3.168199e-06,...,5.245071e-06,5.537760e-06,1.070055e-06,4.493322e-07,1.263697e-07,3.568007e-05,5.994697e-06,3.930696e-05,7.950472e-07,2.067690e-05
9,6.036383e-06,9.935250e-01,1.995446e-06,1.673564e-06,3.813621e-03,1.026554e-05,2.526768e-06,7.173059e-05,3.613658e-04,4.004081e-05,...,7.815180e-06,1.403571e-06,1.892064e-06,1.864607e-06,5.206291e-07,1.140228e-06,4.376386e-06,1.851500e-07,4.016392e-07,3.677777e-04
12,3.268702e-06,6.546676e-03,1.942498e-05,2.579484e-06,4.054952e-05,1.397212e-06,6.508207e-04,1.515986e-05,1.692837e-04,1.960233e-04,...,3.296749e-05,5.571892e-06,2.383283e-05,4.849303e-06,3.529611e-06,1.115921e-04,8.963560e-03,1.959612e-05,3.430964e-06,3.669836e-04
13,4.908714e-06,5.658199e-06,1.399522e-07,7.395604e-08,2.582812e-06,2.493382e-08,5.434630e-06,2.343562e-06,1.947201e-04,7.859971e-06,...,1.439321e-05,2.606937e-07,2.704655e-05,5.201783e-07,1.098841e-04,2.724741e-05,2.553697e-04,1.102456e-05,4.493345e-07,2.565654e-07
16,1.744527e-04,7.444374e-05,1.874810e-04,8.414701e-01,6.787166e-05,2.546623e-05,3.061876e-04,6.180289e-04,1.173859e-04,3.542645e-05,...,6.054111e-04,6.661901e-04,2.832850e-04,2.778746e-04,1.622938e-04,5.110867e-03,2.304746e-04,5.365670e-04,3.644630e-04,2.946799e-04
19,3.320263e-05,1.327110e-05,3.443652e-05,9.948508e-01,6.481796e-06,3.681094e-06,3.011865e-05,1.381393e-05,8.902166e-06,4.338221e-06,...,2.380502e-05,9.770039e-05,2.746391e-05,1.477833e-05,8.728065e-06,5.433741e-04,1.298312e-05,5.712597e-05,2.020060e-05,2.131961e-04
23,1.170195e-06,1.431975e-05,3.080709e-05,9.473474e-05,7.391894e-06,1.002152e-03,5.003240e-05,5.704683e-06,1.649134e-06,4.408446e-06,...,6.905139e-06,2.398572e-05,3.661822e-06,3.320246e-06,3.965909e-06,1.563808e-06,2.773990e-06,7.622127e-06,6.129040e-06,1.466802e-05
24,8.897134e-06,1.382409e-06,7.711673e-07,1.592802e-07,5.823925e-07,8.800143e-06,1.087205e-06,4.219019e-07,1.529226e-05,4.146550e-07,...,6.248995e-07,2.403322e-07,1.245431e-07,2.545409e-07,9.859995e-08,1.955522e-06,1.296316e-07,1.413922e-08,4.262065e-08,5.971084e-08
28,1.315536e-05,1.146572e-05,7.411880e-07,3.956874e-07,7.594848e-07,1.163863e-07,6.460682e-06,1.922669e-05,9.998373e-01,4.546519e-07,...,3.307276e-06,2.818556e-07,1.617414e-06,2.982395e-07,1.604839e-06,1.435585e-05,1.239157e-06,2.068055e-08,3.436485e-08,2.732820e-07


In [45]:
df.to_excel('leaf_classification.xlsx', sheet_name ='Sheet1')

In [46]:
df.to_csv('leaf_classification1.csv', sep=',')

## Random forest

Random forest with enough number of estimators, like 1000. I'll do that on AWS EC2 service, because it will take some time.

In [269]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=1000)
clf.fit(X_train_scaled, np.reshape(y_train.values, (742,)))
print accuracy_score(y_test, clf.predict(X_test_scaled))

0.979838709677


## XGBoost

Reducing quantity of features by SVD.

In [104]:
from sklearn.decomposition import TruncatedSVD
tsvd = TruncatedSVD(n_components=99)
X_train_pca = tsvd.fit_transform(X_train_scaled)
X_test_pca = tsvd.transform(X_test_scaled)
X_t_pca = tsvd.transform(X_t_scaled)

Applying XGBoost to the received data.

In [243]:
import xgboost as xgb

estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=100, 
                              min_child_weight=3, nthread=4)
estimator.fit(X_train_pca, np.reshape(y_train.values, (742,)))
print accuracy_score(y_test, estimator.predict(X_test_pca))

0.923387096774


Adjusting parameters to XGBoost.

In [230]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator, param_grid = param_test1, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X_train_pca, np.reshape(y_train.values, (742,)))
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.89081, std: 0.02676, params: {'max_depth': 3, 'min_child_weight': 1},
  mean: 0.95179, std: 0.03259, params: {'max_depth': 3, 'min_child_weight': 3},
  mean: 0.37835, std: 0.20751, params: {'max_depth': 3, 'min_child_weight': 5},
  mean: 0.89081, std: 0.02676, params: {'max_depth': 5, 'min_child_weight': 1},
  mean: 0.95179, std: 0.03259, params: {'max_depth': 5, 'min_child_weight': 3},
  mean: 0.37835, std: 0.20751, params: {'max_depth': 5, 'min_child_weight': 5},
  mean: 0.89081, std: 0.02676, params: {'max_depth': 7, 'min_child_weight': 1},
  mean: 0.95179, std: 0.03259, params: {'max_depth': 7, 'min_child_weight': 3},
  mean: 0.37835, std: 0.20751, params: {'max_depth': 7, 'min_child_weight': 5},
  mean: 0.89081, std: 0.02676, params: {'max_depth': 9, 'min_child_weight': 1},
  mean: 0.95179, std: 0.03259, params: {'max_depth': 9, 'min_child_weight': 3},
  mean: 0.37835, std: 0.20751, params: {'max_depth': 9, 'min_child_weight': 5}],
 {'max_depth': 3, 'min_child_weight': 3

In [233]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator, 
 param_grid = param_test3, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch3.fit(X_train_pca, np.reshape(y_train.values, (742,)))
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_



([mean: 0.95179, std: 0.03259, params: {'gamma': 0.0},
  mean: 0.94708, std: 0.02984, params: {'gamma': 0.1},
  mean: 0.94709, std: 0.01907, params: {'gamma': 0.2},
  mean: 0.94170, std: 0.02111, params: {'gamma': 0.3},
  mean: 0.94507, std: 0.02220, params: {'gamma': 0.4}],
 {'gamma': 0.0},
 0.95178815178815179)

In [235]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator, 
 param_grid = param_test4, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch4.fit(X_train_pca, np.reshape(y_train.values, (742,)))
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_



([mean: 0.94442, std: 0.02211, params: {'subsample': 0.6, 'colsample_bytree': 0.6},
  mean: 0.95788, std: 0.02430, params: {'subsample': 0.7, 'colsample_bytree': 0.6},
  mean: 0.95384, std: 0.02234, params: {'subsample': 0.8, 'colsample_bytree': 0.6},
  mean: 0.92888, std: 0.02671, params: {'subsample': 0.9, 'colsample_bytree': 0.6},
  mean: 0.94711, std: 0.01811, params: {'subsample': 0.6, 'colsample_bytree': 0.7},
  mean: 0.95822, std: 0.02648, params: {'subsample': 0.7, 'colsample_bytree': 0.7},
  mean: 0.94844, std: 0.02358, params: {'subsample': 0.8, 'colsample_bytree': 0.7},
  mean: 0.93328, std: 0.01736, params: {'subsample': 0.9, 'colsample_bytree': 0.7},
  mean: 0.94813, std: 0.02588, params: {'subsample': 0.6, 'colsample_bytree': 0.8},
  mean: 0.95687, std: 0.02188, params: {'subsample': 0.7, 'colsample_bytree': 0.8},
  mean: 0.95179, std: 0.03259, params: {'subsample': 0.8, 'colsample_bytree': 0.8},
  mean: 0.92890, std: 0.02264, params: {'subsample': 0.9, 'colsample_bytree'

In [239]:
param_test6 = {
 'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch6 = GridSearchCV(estimator,
 param_grid = param_test6, scoring='accuracy',n_jobs=4,iid=False, cv=5)
gsearch6.fit(X_train_pca, np.reshape(y_train.values, (742,)))
gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_



([mean: 0.96192, std: 0.02741, params: {'reg_alpha': 0},
  mean: 0.95551, std: 0.02271, params: {'reg_alpha': 0.001},
  mean: 0.95349, std: 0.03089, params: {'reg_alpha': 0.005},
  mean: 0.95990, std: 0.01920, params: {'reg_alpha': 0.01},
  mean: 0.96091, std: 0.01548, params: {'reg_alpha': 0.05}],
 {'reg_alpha': 0},
 0.96191646191646174)

In [244]:
estimator = xgb.XGBClassifier(learning_rate =0.01, n_estimators=5000, max_depth=3,
 min_child_weight=3, gamma=0, subsample=0.7, colsample_bytree=0.9, nthread=4, scale_pos_weight=1, seed=27)

In [245]:
estimator.fit(X_train_pca, np.reshape(y_train.values, (742,)))
print accuracy_score(y_test, estimator.predict(X_test_pca))

0.939516129032
