- ref. 

In [15]:
# load data
import pandas as pd, numpy as np
import seaborn as sns, matplotlib.pyplot as plt
# Widen pandas' console output so the printed previews stay readable.
for option, value in (("display.width", 100), ("display.max_rows", 100)):
    pd.set_option(option, value)

# Only the first 1000 training rows are read here. The original notes give
# 42000 rows for the full train.csv and 28000 rows for test.csv.
# Full run: train_df = pd.read_csv("./train.csv")
train_df = pd.read_csv("./train.csv", nrows=1000)
test_df = pd.read_csv("./test.csv")
train_num, test_num = len(train_df), len(test_df)

# Separate the target column from the feature columns.
train_y = train_df["label"]
train_x = train_df.drop("label", axis="columns")
test_x = test_df

# Stack the train features on top of the test features (train rows first).
all_df = pd.concat([train_x, test_x])

# Print the shape and first rows of each frame as a quick sanity check.
dfd = dict(train=train_df, test=test_df, all=all_df)
for name, df in dfd.items():
    print(name, df.shape)
    print(df.head())

train (1000, 785)
   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8    ...     \
0      1       0       0       0       0       0       0       0       0       0    ...      
1      0       0       0       0       0       0       0       0       0       0    ...      
2      1       0       0       0       0       0       0       0       0       0    ...      
3      4       0       0       0       0       0       0       0       0       0    ...      
4      0       0       0       0       0       0       0       0       0       0    ...      

   pixel774  pixel775  pixel776  pixel777  pixel778  pixel779  pixel780  pixel781  pixel782  \
0         0         0         0         0         0         0         0         0         0   
1         0         0         0         0         0         0         0         0         0   
2         0         0         0         0         0         0         0         0         0   
3         0         0         0      

In [2]:
# check data
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() adds a stray "None" line.
for name,df in dfd.items():
    print(name)
    df.info()
    #print(df.describe())   # doesn't show a lot of information

train
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 6.0 MB
None
test


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 6.0 MB


None
all
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 12.0 MB
None


In [3]:
# check null
# For every frame, print the fraction of missing values per column, keeping
# only the columns whose fraction is non-zero (largest first).
for name, df in dfd.items():
    print(name)
    null_mask = df.isnull()
    isnull_ratio = null_mask.sum() / null_mask.count()
    nonzero_ratio = isnull_ratio.loc[isnull_ratio != 0]
    print(nonzero_ratio.sort_values(ascending=False))

train
Series([], dtype: float64)
test
Series([], dtype: float64)
all
Series([], dtype: float64)


In [4]:
# check correlation (takes time)
# -> pixel436 has a high correlation with label, but it is unclear how to make use of that
# correlations = train_df.astype(float).corr()['label'].sort_values(ascending=False)
# print(correlations.head())
# print(correlations.tail())

In [5]:
# check skewness (takes time)
# -> pixel367 has high skewness, but log/boxcox transform doesn't help
# from scipy.stats import skew
# skewness = all_df.apply(lambda x: skew(x)).sort_values(ascending=False)
# print(skewness.head())
# print(skewness.tail())
# all_df['pixel367'].hist(bins=50)
# plt.show()

In [6]:
# grid search on svc parameter
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# all_df was built with the train features first, so positional slicing at
# train_num splits it back into the train and test feature sets.
train_x = all_df[:train_num]
test_x = all_df[train_num:]

C_list = np.logspace(-2, 0, 3)       # 0.01, 0.1, 1
gamma_list = np.logspace(-3, -1, 3)  # 0.001, 0.01, 0.1
# The polynomial degree is an integer-valued parameter. np.linspace(1, 3, 3)
# produced floats (1.0, 2.0, 3.0), which newer scikit-learn releases reject
# during parameter validation, so plain ints are used instead.
degree_list = [1, 2, 3]
tuned_parameters = [
    {'C': C_list, 'kernel': ['poly'], 'gamma': gamma_list, 'degree': degree_list},
    # {'C': C_list, 'kernel': ['sigmoid'], 'gamma': gamma_list},
    # {'C': C_list, 'kernel': ['linear']},
    # {'C': C_list, 'kernel': ['rbf'], 'gamma': gamma_list},
    ]

# Exhaustive search over the grid with 3-fold cross-validation.
svc = svm.SVC()
svm_cv = GridSearchCV(svc, tuned_parameters, cv=3)
svm_cv.fit(train_x, train_y)
print(svm_cv.best_score_, svm_cv.best_params_)

# One line per parameter combination: mean and std of the CV test score.
results = svm_cv.cv_results_
for mean, std, params in zip(results['mean_test_score'], results['std_test_score'], results['params']):
    # The "(+/-...)" group is now closed; the original string lacked the ")".
    print("{:0.3f} (+/-{:0.03f}) for {}".format(mean, std, params))

0.885 {'C': 0.01, 'degree': 2.0, 'gamma': 0.001, 'kernel': 'poly'}
0.867 (+/-0.011 for {'C': 0.01, 'degree': 1.0, 'gamma': 0.001, 'kernel': 'poly'}
0.867 (+/-0.011 for {'C': 0.01, 'degree': 1.0, 'gamma': 0.01, 'kernel': 'poly'}
0.867 (+/-0.011 for {'C': 0.01, 'degree': 1.0, 'gamma': 0.10000000000000001, 'kernel': 'poly'}
0.885 (+/-0.017 for {'C': 0.01, 'degree': 2.0, 'gamma': 0.001, 'kernel': 'poly'}
0.885 (+/-0.017 for {'C': 0.01, 'degree': 2.0, 'gamma': 0.01, 'kernel': 'poly'}
0.885 (+/-0.017 for {'C': 0.01, 'degree': 2.0, 'gamma': 0.10000000000000001, 'kernel': 'poly'}
0.855 (+/-0.020 for {'C': 0.01, 'degree': 3.0, 'gamma': 0.001, 'kernel': 'poly'}
0.855 (+/-0.020 for {'C': 0.01, 'degree': 3.0, 'gamma': 0.01, 'kernel': 'poly'}
0.855 (+/-0.020 for {'C': 0.01, 'degree': 3.0, 'gamma': 0.10000000000000001, 'kernel': 'poly'}
0.867 (+/-0.011 for {'C': 0.10000000000000001, 'degree': 1.0, 'gamma': 0.001, 'kernel': 'poly'}
0.867 (+/-0.011 for {'C': 0.10000000000000001, 'degree': 1.0, 'gamma'

In [7]:
# TODO set n_jobs
# cross validation
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_validate

# Keyword arguments for the tree-ensemble candidates. Each dict is meant to be
# unpacked into its estimator, e.g. RandomForestClassifier(**rf_params).

# Random Forest parameters (alternative left disabled: max_features=0.2)
rf_params = dict(
    n_estimators=1000,
    max_depth=6,
    min_samples_leaf=2,
    max_features='sqrt',
    verbose=0,
)

# Extra Trees parameters (alternative left disabled: max_features=0.5)
et_params = dict(
    n_estimators=1000,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)

# AdaBoost parameters
ada_params = dict(
    n_estimators=1000,
    learning_rate=0.75,
)

# Gradient Boosting parameters (alternative left disabled: max_features=0.2)
gb_params = dict(
    n_estimators=1000,
    max_depth=5,
    min_samples_leaf=2,
    verbose=0,
)

# Candidate models as (name, estimator) pairs. Only the SVC configured with
# the best grid-search parameters is active; uncomment others to compare.
models = []
models.append(('svm', SVC(**svm_cv.best_params_)))
# models.append(('random_forest', RandomForestClassifier(**rf_params)))
# models.append(('extra_trees', ExtraTreesClassifier(**et_params)))
# models.append(('ada_boost', AdaBoostClassifier(**ada_params)))
# models.append(('gradient_boosting', GradientBoostingClassifier(**gb_params)))
# models.append(('perceptron', Perceptron(max_iter=1000)))
# models.append(('sgd_classifier', SGDClassifier(max_iter=1000)))
# models.append(('logistic_regression', LogisticRegression()))
# models.append(('k-nearest_neighbors', KNeighborsClassifier(n_neighbors=3)))
# models.append(('gaussian_naive bayes', GaussianNB()))
# models.append(('linear_svc', LinearSVC()))
# models.append(('decision_tree', DecisionTreeClassifier()))

# 3-fold cross-validation; mean/std of the test and train scores per model.
k_fold = KFold(n_splits=3)
results = {'model': [], 'test_mean': [], 'test_std': [], 'train_mean': [], 'train_std': []}
for name,model in models:
    # Train scores are only returned when requested explicitly. Without
    # return_train_score=True, current scikit-learn omits 'train_score' and
    # the lookups below raise KeyError.
    scores = cross_validate(model, train_x, train_y, cv=k_fold, return_train_score=True)
    results['model'].append(name)
    results['test_mean'].append(np.mean(scores['test_score']))
    results['test_std'].append(np.std(scores['test_score']))
    results['train_mean'].append(np.mean(scores['train_score']))
    results['train_std'].append(np.std(scores['train_score']))

# Best test score first.
print(pd.DataFrame(results).sort_values(by='test_mean', ascending=False))

  model  test_mean  test_std  train_mean  train_std
0   svm   0.877006  0.006369         1.0        0.0


In [18]:
# learn and predict
# Fit each model on the training data, predict the test set, keep the
# predictions per model name and write one submission CSV per model.
test_id = np.arange(test_num) + 1   # ImageId values run from 1 to test_num
results = {}
for name, model in models:
    model.fit(train_x, train_y)
    output = model.predict(test_x)
    results[name] = output
    submit = pd.DataFrame({'ImageId': test_id, 'Label': output})
    submit_path = '{}_submit.csv'.format(name)
    submit.to_csv(submit_path, index=False)

In [None]:
# TODO check confusion matrix
# from sklearn.metrics import confusion_matrix, classification_report
# for name,model in models:
#     print(name)
#     predict_y = model.predict(train_x)
#     print(confusion_matrix(train_y, predict_y))
#     print(classification_report(train_y, predict_y))

In [19]:
# Shell escape: show the first lines of the submission file written for the 'svm' model.
!head svm_submit.csv

ImageId,Label
1,2
2,0
3,9
4,9
5,2
6,7
7,0
8,3
9,0
