# Project 1: The Humble Leaf

In [14]:
import os
import tarfile
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

LEAF_PATH = os.path.join("/home","josias", "ml", "python-environments", "LeafKaggle",
                        "DataSets", "leaf-classification")
def load_leaf_data(housing_path = LEAF_PATH):
    csv_path = os.path.join(housing_path, "train.csv")
    return pd.read_csv(csv_path)
    

In [2]:
leaf_data = load_leaf_data(LEAF_PATH)
leaf_data.head()

Unnamed: 0,id,species,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,1,Acer_Opalus,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391
1,2,Pterocarya_Stenoptera,0.005859,0.0,0.03125,0.015625,0.025391,0.001953,0.019531,0.0,...,0.000977,0.0,0.0,0.000977,0.023438,0.0,0.0,0.000977,0.039062,0.022461
2,3,Quercus_Hartwissiana,0.005859,0.009766,0.019531,0.007812,0.003906,0.005859,0.068359,0.0,...,0.1543,0.0,0.005859,0.000977,0.007812,0.0,0.0,0.0,0.020508,0.00293
3,5,Tilia_Tomentosa,0.0,0.003906,0.023438,0.005859,0.021484,0.019531,0.023438,0.0,...,0.0,0.000977,0.0,0.0,0.020508,0.0,0.0,0.017578,0.0,0.047852
4,6,Quercus_Variabilis,0.005859,0.003906,0.048828,0.009766,0.013672,0.015625,0.005859,0.0,...,0.09668,0.0,0.021484,0.0,0.0,0.0,0.0,0.0,0.0,0.03125


In [35]:
leaf_data.info()
leaf_labels = leaf_data.species.unique()
leaf_labels.size

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 990 entries, 0 to 989
Columns: 194 entries, id to texture64
dtypes: float64(192), int64(1), object(1)
memory usage: 1.5+ MB


99

In [100]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(leaf_labels)
labels_enc = le.transform(leaf_data.species)    
    
leaf_num = leaf_data.drop("species", axis=1)
leaf_num = leaf_num.drop("id", axis=1)

num_attribs = list(leaf_num)
cat_attribs = ["species"]

#cat_attribs = leaf_labels.reshape(-1,1)

cat_pipeline = Pipeline([('selector', DataFrameSelector(cat_attribs)),
                        ('cat_encoder', OneHotEncoder(sparse=False)),
                        ])

num_pipeline = Pipeline([('selector', DataFrameSelector(num_attribs)),
                        ('std_scaler', StandardScaler())
                        ])

full_pipeline = FeatureUnion(transformer_list=[
                        ("num_pipeline", num_pipeline),
                        ("cat_pipeline", cat_pipeline),
                    ])

#test_cat = cat_pipeline.fit_transform(leaf_data) 
#test_num = num_pipeline.fit_transform(leaf_data)
leaf_prep = num_pipeline.fit_transform(leaf_data)
leaf_labels_enc = cat_pipeline.fit_transform(leaf_data)
leaf_prep.shape
leaf_labels_enc.shape

(990, 192)

## Normal LogisticRegression

In [116]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression( solver="liblinear", multi_class="ovr")
log_reg.fit(leaf_prep, labels_enc)

In [120]:
from sklearn.model_selection import cross_val_score
cross_val_score(log_reg, leaf_prep, labels_enc, cv=3)

array([0.96717172, 0.95622896, 0.96632997])

In [119]:
#from sklearn.metrics import confusion_matrix
#confusion_matrix( leaf_prep, labels_enc)

# SoftMax

In [111]:
softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10)
softmax_reg.fit(leaf_prep,labels_enc )



LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [121]:
cross_val_score(softmax_reg, leaf_prep, labels_enc, cv=3)



array([0.98737374, 0.97306397, 0.99326599])