In [93]:
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np
import joblib
import os

In [9]:
def read_data(train_path, test_path, label_col="label"):
    """Load the training and test CSV files and split off the training labels.

    Parameters
    ----------
    train_path : str or path-like
        CSV file holding the training features plus the label column.
    test_path : str or path-like
        CSV file holding the test features; returned exactly as read.
    label_col : str, default "label"
        Name of the target column in the training file. Defaults to the
        column name this notebook has always used, so existing calls are
        unaffected.

    Returns
    -------
    x_train : pandas.DataFrame
        Training frame with ``label_col`` removed.
    y_train : pandas.Series
        The ``label_col`` column of the training frame.
    x_test : pandas.DataFrame
        Contents of ``test_path``.

    Raises
    ------
    KeyError
        If ``label_col`` is not a column of the training file.
    """
    train = pd.read_csv(train_path)
    x_test = pd.read_csv(test_path)
    y_train = train[label_col]
    x_train = train.drop(columns=[label_col])
    return x_train, y_train, x_test

In [30]:
def print_cv_res(res):
    """Print the mean test score of every searched candidate, then the winner.

    ``res`` must expose ``cv_results_`` (with ``"params"`` and
    ``"mean_test_score"`` entries of matching order) and ``best_params_``,
    as a fitted ``GridSearchCV`` does in this notebook.
    """
    results = res.cv_results_
    for candidate, mean_acc in zip(results["params"], results["mean_test_score"]):
        print(candidate, "acc: %.4f" % mean_acc)

    # Header and winning parameter set go on separate lines.
    print("best param", res.best_params_, sep="\n")

In [115]:
def save_prediction(model, x_test, save_path):
    """Predict on ``x_test`` with ``model`` and write the result as a CSV.

    The file has two columns: ``id`` (the 0-based row position of each
    prediction) and ``label`` (the value returned by ``model.predict``).
    No index column is written.
    """
    predictions = model.predict(x_test)
    # reset_index() turns the default RangeIndex into the leading id column.
    submission = pd.DataFrame(predictions).reset_index()
    submission.columns = ["id", "label"]
    submission.to_csv(save_path, index=False)

In [10]:
# Load the training features/labels and the unlabeled test features.
x_train, y_train, x_test = read_data(
    train_path="data/train.csv",
    test_path="data/test.csv",
)

## Random forest classifier
- with 10-fold cv
- tune parameters
    - n_estimators: [5, 10, 15, 20, 25, 30]
    - max_depth: [1, 3, 5, 7, 9]
    - max_samples: [0.1, 0.3, 0.5, 0.7]
    - min_samples_split: [2, 5, 8, 10, 15]
- best accuracy on val_set: 0.9097, submission accuracy: 0.96551
- best_param:
    - n_estimators=20, max_depth=5, max_samples=0.6, min_samples_split=5
    - n_estimators=20, max_depth=4, max_samples=0.4, min_samples_split=5

In [87]:
# Hyper-parameters explored by the random forest grid search.
# n_estimators and min_samples_leaf are not part of this search.
param_grid = dict(
    max_depth=[2, 4, 6, 8],
    min_samples_split=[2, 5, 8, 10],
    max_samples=[0.2, 0.4, 0.6, 0.8],
)

In [88]:
# Base random forest handed to the grid search; any key that also appears in
# param_grid is overridden per candidate during the search.
rf_settings = dict(
    n_estimators=20,
    max_depth=4,
    max_samples=0.4,
    max_features='sqrt',
    min_samples_split=5,
    bootstrap=True,
    random_state=0,  # fixed seed so repeated runs build the same forest
)
rf = RandomForestClassifier(**rf_settings)

In [89]:
# Exhaustive search over param_grid, scored by accuracy with 10-fold CV.
grid_clf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
).fit(x_train, y_train)  # fit() returns the fitted search object itself

print_cv_res(grid_clf)

{'max_depth': 2, 'max_samples': 0.2, 'min_samples_split': 2} acc: 0.8500
{'max_depth': 2, 'max_samples': 0.2, 'min_samples_split': 5} acc: 0.8500
{'max_depth': 2, 'max_samples': 0.2, 'min_samples_split': 8} acc: 0.8486
{'max_depth': 2, 'max_samples': 0.2, 'min_samples_split': 10} acc: 0.8139
{'max_depth': 2, 'max_samples': 0.2, 'min_samples_split': 15} acc: 0.7681
{'max_depth': 2, 'max_samples': 0.4, 'min_samples_split': 2} acc: 0.8972
{'max_depth': 2, 'max_samples': 0.4, 'min_samples_split': 5} acc: 0.8972
{'max_depth': 2, 'max_samples': 0.4, 'min_samples_split': 8} acc: 0.8972
{'max_depth': 2, 'max_samples': 0.4, 'min_samples_split': 10} acc: 0.8972
{'max_depth': 2, 'max_samples': 0.4, 'min_samples_split': 15} acc: 0.8972
{'max_depth': 2, 'max_samples': 0.6, 'min_samples_split': 2} acc: 0.8722
{'max_depth': 2, 'max_samples': 0.6, 'min_samples_split': 5} acc: 0.8722
{'max_depth': 2, 'max_samples': 0.6, 'min_samples_split': 8} acc: 0.8722
{'max_depth': 2, 'max_samples': 0.6, 'min_sampl

In [90]:
# Keep the best estimator found by the random forest grid search.
best_model = grid_clf.best_estimator_

# makedirs(..., exist_ok=True) replaces the exists()/mkdir() pair: a single
# call that is a no-op when the directory is already there.
os.makedirs("model", exist_ok=True)

# Persist the fitted model, then write its predictions for the test set.
joblib.dump(best_model, "model/random_forest.joblib")

save_prediction(best_model, x_test, "data/rf_submission.csv")

## Logistic regression
- best param: C = 0.2, best val_set acc: 0.9319 (10-fold CV mean), submission acc: 0.93103 (baseline)

In [95]:
from sklearn.linear_model import LogisticRegression

In [112]:
# Inverse regularization strengths tried for logistic regression.
param_grid = dict(C=[0.1, 0.15, 0.2, 0.25, 0.3])

In [113]:
# L2-penalized logistic regression; C is supplied by the grid search below.
lr_settings = dict(penalty='l2', tol=1e-4, random_state=0)
lr = LogisticRegression(**lr_settings)

# Same protocol as the random forest search: accuracy with 10-fold CV.
grid_clf = GridSearchCV(
    estimator=lr,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
).fit(x_train, y_train)  # fit() returns the fitted search object itself

print_cv_res(grid_clf)

{'C': 0.08} acc: 0.9306
{'C': 0.1} acc: 0.9306
{'C': 0.15} acc: 0.9194
{'C': 0.2} acc: 0.9319
{'C': 0.25} acc: 0.9208
{'C': 0.3} acc: 0.9083
best param
{'C': 0.2}


In [114]:
# Keep the best estimator found by the logistic regression grid search.
best_model = grid_clf.best_estimator_

# Create the output directory here too, so this cell does not depend on an
# earlier cell having created it.
os.makedirs("model", exist_ok=True)
joblib.dump(best_model, "model/logistic_regression.joblib")

save_prediction(best_model, x_test, "data/lr.csv")
