In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Data Preparation

### Load CSV using Pandas

In [None]:
import pandas as pd

# Read the Kaggle training and test tables, then print their shapes as a
# quick sanity check that both files were found and parsed.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/kaggle/train.csv', '../data/kaggle/test.csv')
)
print(train.shape, test.shape)

### Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the species names into integer class ids for the classifiers.
# `labels` keeps the original names (the encoder's classes) so they can be
# used as column headers for the predicted probabilities later on.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train['species'])
labels = encoder.classes_

### Reshape the data

In [None]:
# Feature matrices: index the rows by the sample `id`, drop the target column
# from the training table, and clear the index name so it is not displayed.
X_train = train.drop(columns=['species']).set_index('id').rename_axis(None)
X_test = test.set_index('id').rename_axis(None)

### Standard Scaler (standardization)

In [None]:
from sklearn.preprocessing import StandardScaler
# One scaler instance shared by both splits: it is fitted on the training
# features and then reused, already fitted, to transform the test features.
scaler = StandardScaler()
def transform_values(df, func):
    """Apply ``func`` to the raw values of ``df`` and re-wrap the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``.values`` array is passed to ``func``.
    func : callable
        Takes the 2-D value array and returns an array of the same shape
        (for example a scaler's ``fit_transform`` or ``transform``).

    Returns
    -------
    pandas.DataFrame
        The transformed values carrying the column labels and index of ``df``.
    """
    return pd.DataFrame(
        func(df.values),
        columns=df.columns,
        index=df.index)
# Learn the scaling statistics from the training features only, then apply
# the already-fitted scaler to the test features.
X_train_scaled = transform_values(df=X_train, func=scaler.fit_transform)
X_test_scaled = transform_values(df=X_test, func=scaler.transform)
# To bypass scaling, use the unscaled frames instead:
# X_train_scaled = X_train
# X_test_scaled = X_test

In [None]:
# Second scaler fitted with a separate fit() call followed by transform();
# presumably a cross-check against the fit_transform() result above.
scaler1 = StandardScaler()
# Fit on the raw array: transform_values() hands the scaler `df.values`, so
# fitting on the DataFrame itself would make recent scikit-learn versions warn
# about missing feature names at transform time.
scaler1.fit(X_train.values)
X_train_scaled1 = transform_values(X_train, scaler1.transform)
# Show a preview only instead of rendering the whole frame.
X_train_scaled1.head()

### Peek at the data

In [None]:
# Columns and row positions shown in the preview; swap in the commented
# alternatives to preview everything.
cols = ['margin1', 'texture1', 'shape1'] # X_train.columns
train_rows = range(0,4) # range(X_train.shape[0])
test_rows = range(0,4)  # range(X_test.shape[0])


def the_merge(a, b, rows, columns=None):
    """Place a slice of ``a`` next to the same slice of ``b``.

    Parameters
    ----------
    a, b : pandas.DataFrame
        Frames to compare; both must contain the selected columns.
    rows : positional indexer
        Row positions passed to ``.iloc`` on both frames.
    columns : list-like, optional
        Column labels to keep. Defaults to the module-level ``cols``, read at
        call time.

    Returns
    -------
    pandas.DataFrame
        Index-aligned merge of the two slices; columns from ``a`` keep their
        names and columns from ``b`` get the ``_scaled`` suffix.
    """
    selected = list(cols if columns is None else columns)
    return pd.merge(
        a[selected].iloc[rows],
        b[selected].iloc[rows],
        left_index=True,
        right_index=True,
        suffixes=('', '_scaled'))

# Raw-vs-scaled previews for each split, stacked under an outer index level
# that says which split a row came from.
train_peek = the_merge(X_train, X_train_scaled, train_rows)
test_peek = the_merge(X_test, X_test_scaled, test_rows)
pd.concat([train_peek, test_peek], keys=['train', 'test'])
           

### Plotting the data

In [None]:
from matplotlib import pyplot as plt
import matplotlib.image as mpimg
from pylab import rcParams
# Global figure size; stays in effect for later figures until it is reset.
rcParams['figure.figsize'] = (24, 3)
# Keep only the scaled feature columns whose name contains 'shape'.
shapes = X_train_scaled.filter(like='shape')
r = 3
c = 6
# Each sample uses two adjacent panels (image, then its shape features), so
# the grid is r rows by 2*c columns. Stop early if there are fewer samples.
for i in range(min(r * c, len(shapes))):
    # Named leaf_id rather than `id` so the builtin id() is not overwritten
    # for the rest of the kernel session.
    leaf_id = shapes.index[i]
    img = mpimg.imread('../data/kaggle/images/%s.jpg' % leaf_id)
    plt.subplot(r, c*2, 2*i + 1)
    plt.imshow(img, cmap='hot')
    plt.subplot(r, c*2, 2*i + 2)
    plt.plot(shapes.values[i])
    plt.title(leaf_id)
plt.show()

# Model Selection

### List Classifiers

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Fixed seed passed to every estimator that accepts `random_state`, so that
# re-running the notebook reproduces the same comparison.
SEED = 23

# (display name, unfitted estimator) pairs evaluated in the comparison below.
classifiers = [
    ('KNN', KNeighborsClassifier(10)),
    ('Linear SVM', SVC(kernel="linear", C=0.025, probability=True, random_state=SEED)),
    ('RBF SVM',    SVC(C=1, probability=True, random_state=SEED)),

    # Candidates currently left out of the comparison:
    #('Nu SVM', NuSVC(probability=True)),
    #('Gaussian Process', GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True)),
    ('Decision Tree', DecisionTreeClassifier(random_state=SEED)),
    ('Random Forest', RandomForestClassifier(random_state=SEED)),
    ('AdaBoost', AdaBoostClassifier(random_state=SEED)),
    ('GradientBoost', GradientBoostingClassifier(random_state=SEED)),
    ('Neural Network', MLPClassifier(random_state=SEED)),
    ('Naive Bayes', GaussianNB()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('QDA', QuadraticDiscriminantAnalysis())]

### Split for Cross Validation

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, log_loss

# Score every candidate with cross_val_score (default splitting and scoring)
# and collect one row per classifier for the table and plot further down.
log_cols = ["Classifier", "Accuracy"]
log_rows = []

for name, clf in classifiers:
    print("="*30)
    print(name)

    # Keep the worst fold's score (min), not the mean across folds.
    score = cross_val_score(clf, X_train_scaled, y_train).min()
    print("Accuracy Score: %.4f" % score)

    log_rows.append([name, score])

print("="*30)

# Build the results table once: DataFrame.append was removed in pandas 2.0,
# and appending row by row copies the whole frame on every iteration.
log = pd.DataFrame(log_rows, columns=log_cols)

In [None]:
# Display the per-classifier results table built by the cross-validation loop.
log

In [None]:
import seaborn as sns
# Horizontal bar chart with one bar per classifier from the results table.
rcParams['figure.figsize'] = (20, 10)
sns.set_color_codes("muted")
sns.barplot(x='Accuracy', y='Classifier', data=log, color="b")

# The stored scores are fractions in [0, 1] as returned by cross_val_score,
# not percentages, so the axis label must not claim "%".
plt.xlabel('Accuracy (fraction correct)')
plt.title('Classifier Accuracy')
plt.show()

# A matching log-loss chart would need a 'Log Loss' column in `log`, which
# the cross-validation loop does not record.

# Submission

In [None]:
# Predict Test Set
# Refit the chosen model on the full scaled training set and predict class
# probabilities for every test sample.
favorite_clf = SVC(kernel="linear", C=0.025, probability=True)
favorite_clf.fit(X_train_scaled, y_train)
test_predictions = favorite_clf.predict_proba(X_test_scaled)

# Format DataFrame
# One probability column per species name, with the sample id as first column.
# The frame already has a fresh 0..n-1 index, so no reset_index() is needed
# (the earlier bare call discarded its result and had no effect).
submission = pd.DataFrame(test_predictions, columns=labels)
submission.insert(0, 'id', X_test.index)

# Export Submission
#submission.to_csv('submission.csv', index = False)
submission.tail()