In [2]:
# Copied almost entirely from Xu Yinan's script - uses a slightly different calculation method to achieve a higher score.


import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# the legacy sklearn.grid_search module was removed in 0.20.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV


# Fix the NumPy global seed so repeated runs are reproducible.
np.random.seed(42)

# Number of cross-validation folds; also used to read per-fold scores back
# out of ``cv_results_`` below.
N_FOLDS = 5

# --- Training data ---------------------------------------------------------
# Every column except 'id' and 'species' is used as a feature; 'species' is
# the target, encoded as integer class labels.
train = pd.read_csv('./data/train.csv')
x_train = train.drop(['id', 'species'], axis=1).values
le = LabelEncoder().fit(train['species'])
y_train = le.transform(train['species'])

# Standardize features; the statistics learned here are reused for the test set.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)

# --- Hyper-parameter search --------------------------------------------------
params = {'C': [1, 10, 50, 100, 500, 1000, 2000], 'tol': [0.001, 0.0001, 0.005]}
log_reg = LogisticRegression(solver='newton-cg', multi_class='multinomial')
# refit must be the boolean True (the original passed the string 'True',
# which only worked because a non-empty string is truthy).
clf = GridSearchCV(log_reg, params, scoring='neg_log_loss', refit=True,
                   n_jobs=1, cv=N_FOLDS)
clf.fit(x_train, y_train)

# --- Report ------------------------------------------------------------------
print("best params: " + str(clf.best_params_))
# ``grid_scores_`` was removed from scikit-learn; rebuild the same report from
# ``cv_results_``: one row per candidate, one column per fold.
results = clf.cv_results_
fold_matrix = np.column_stack(
    [results['split%d_test_score' % fold] for fold in range(N_FOLDS)]
)
# The loop variable is deliberately not called ``params`` so the grid
# definition above is not overwritten.
for candidate, mean_score, fold_scores in zip(
        results['params'], results['mean_test_score'], fold_matrix):
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, fold_scores.std(), candidate))
    print(fold_scores)

# --- Prediction --------------------------------------------------------------
test = pd.read_csv('./data/test.csv')
test_ids = test.pop('id')
# Apply the scaler fitted on the training data. Fitting a new scaler on the
# test set would standardize the features with different statistics than the
# ones the model was trained on.
x_test = scaler.transform(test.values)

# Class-probability predictions from the refitted best estimator
# (one column per class).
y_test = clf.predict_proba(x_test)

# One row per test id, one column per species label.
submission = pd.DataFrame(y_test, index=test_ids, columns=le.classes_)
submission.to_csv('submission.csv')

best params: {'C': 2000, 'tol': 0.0001}
-0.111 (+/-0.015) for {'C': 1, 'tol': 0.001}
[-0.11846263 -0.09579797 -0.13176928 -0.11477786 -0.09202927]
-0.111 (+/-0.015) for {'C': 1, 'tol': 0.0001}
[-0.11846378 -0.09579824 -0.1317605  -0.11477505 -0.09204022]
-0.111 (+/-0.015) for {'C': 1, 'tol': 0.005}
[-0.11843975 -0.09579344 -0.13184225 -0.11477493 -0.09211521]
-0.058 (+/-0.017) for {'C': 10, 'tol': 0.001}
[-0.06474214 -0.04249528 -0.08486901 -0.06024768 -0.03847028]
-0.058 (+/-0.017) for {'C': 10, 'tol': 0.0001}
[-0.06473591 -0.04260482 -0.08482931 -0.060182   -0.03845354]
-0.058 (+/-0.017) for {'C': 10, 'tol': 0.005}
[-0.06520049 -0.04120733 -0.08499392 -0.06012396 -0.03944937]
-0.047 (+/-0.019) for {'C': 50, 'tol': 0.001}
[-0.05220181 -0.02898761 -0.07823498 -0.04817143 -0.02621757]
-0.047 (+/-0.019) for {'C': 50, 'tol': 0.0001}
[-0.05199619 -0.02951524 -0.07817944 -0.04810708 -0.02546668]
-0.048 (+/-0.019) for {'C': 50, 'tol': 0.005}
[-0.05508765 -0.02810997 -0.07862918 -0.04793924 -