In [1]:
import optuna
from optuna.samplers import TPESampler
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
data = pd.read_csv("internship_train.csv")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,target
0,236,488,16,221,382,97,-4.472136,0.107472,0,132,...,13.340874,0.870542,1.962937,7.466666,11.547794,8.822916,9.046424,7.895535,11.010677,20.107472
1,386,206,357,232,1,198,7.81025,0.763713,1,143,...,12.484882,7.16868,2.885415,12.413973,10.260494,10.091351,9.270888,3.173994,13.921871,61.763713
2,429,49,481,111,111,146,8.602325,0.651162,1,430,...,14.030257,0.39497,8.160625,12.592059,8.937577,2.265191,11.255721,12.794841,12.080951,74.651162
3,414,350,481,370,208,158,8.306624,0.424645,1,340,...,2.789577,6.416708,10.549814,11.456437,6.468099,2.519049,0.258284,9.317696,5.383098,69.424645
4,318,359,20,218,317,301,8.124038,0.767304,1,212,...,1.88656,1.919999,2.268203,0.149421,4.105907,10.416291,6.816217,8.58696,4.512419,66.767304


In [3]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,90000.0,249.423944,144.336393,0.0,125.0,250.0,374.0,499.0
1,90000.0,250.236267,144.0222,0.0,126.0,251.0,375.0,499.0
2,90000.0,248.637289,144.107577,0.0,124.0,248.0,374.0,499.0
3,90000.0,249.7366,144.284945,0.0,125.0,250.0,375.0,499.0
4,90000.0,249.436178,143.941581,0.0,125.0,250.0,373.0,499.0
5,90000.0,249.656167,144.329168,0.0,124.0,250.0,374.0,499.0
6,90000.0,-0.011402,7.038171,-9.949874,-7.071068,0.0,7.0,9.949874
7,90000.0,0.498548,0.288682,1.4e-05,0.248932,0.497136,0.747513,0.999987
8,90000.0,0.499189,0.500002,0.0,0.0,0.0,1.0,1.0
9,90000.0,249.842033,144.612718,0.0,124.0,250.0,376.0,499.0


In [4]:
y, X = data['target'], data.drop('target', axis=1)
X = preprocessing.normalize(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [41]:
class Optimizer:
    def __init__(self, trials=50):
        self.trials = trials
        self.sampler = TPESampler(seed=34)
        
    def objective(self, trial):
        model = create_model(trial)
        model.fit(X_train, y_train)
        y_predict = model.predict(X_test)
        
        return np.sqrt(mean_squared_error(y_test, y_predict))
            
    def optimize(self):
        study = optuna.create_study(
            direction="minimize", 
            sampler=self.sampler
        )
        study.optimize(
            self.objective, 
            n_trials=self.trials
        )
        return study.best_params

In [None]:
def create_model(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 20, 50)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    model = RandomForestRegressor(
        n_estimators=n_estimators, 
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth, 
        n_jobs = 10,
        random_state=34
    )
    return model

optimizer = Optimizer(100)
optuna_params = optimizer.optimize()
optuna_params['random_state'] = 34
optuna_params

[32m[I 2020-12-01 00:20:34,786][0m A new study created in memory with name: no-name-25aac8bc-ef39-477b-af1b-cc9eb3b090da[0m
[32m[I 2020-12-01 00:33:18,592][0m Trial 0 finished with value: 3.281775873588055 and parameters: {'n_estimators': 517, 'max_depth': 46, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 3.281775873588055.[0m
[32m[I 2020-12-01 00:51:59,533][0m Trial 1 finished with value: 3.545072194026884 and parameters: {'n_estimators': 424, 'max_depth': 23, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 3.281775873588055.[0m
[32m[I 2020-12-01 01:18:35,927][0m Trial 2 finished with value: 3.307068886594109 and parameters: {'n_estimators': 641, 'max_depth': 45, 'min_samples_split': 10, 'min_samples_leaf': 7}. Best is trial 0 with value: 3.281775873588055.[0m
[32m[I 2020-12-01 01:30:15,202][0m Trial 3 finished with value: 3.2836584458555045 and parameters: {'n_estimators': 473, 'max_depth': 42, 'min_samples_split

In [5]:
model = RandomForestRegressor(
        n_estimators=992, 
        min_samples_split=2,
        min_samples_leaf=2,
        max_depth=50, 
        n_jobs = 100,
        random_state=600, verbose = 1
    )
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done 250 tasks      | elapsed:  5.0min
[Parallel(n_jobs=100)]: Done 600 tasks      | elapsed: 11.3min
[Parallel(n_jobs=100)]: Done 992 out of 992 | elapsed: 19.0min finished
[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done 250 tasks      | elapsed:    0.4s
[Parallel(n_jobs=100)]: Done 600 tasks      | elapsed:    1.0s
[Parallel(n_jobs=100)]: Done 992 out of 992 | elapsed:    1.4s finished


In [6]:
np.sqrt(mean_squared_error(y_test, y_predict))

3.214383202725109

In [7]:
pickle.dump(model, open("RFR.pickle.dat", "wb"))

In [8]:
data_test = pd.read_csv("internship_hidden_test.csv")
data_test = preprocessing.normalize(data_test)
predictions = model.predict(data_test)

[Parallel(n_jobs=100)]: Using backend ThreadingBackend with 100 concurrent workers.
[Parallel(n_jobs=100)]: Done 250 tasks      | elapsed:    2.6s
[Parallel(n_jobs=100)]: Done 600 tasks      | elapsed:    5.4s
[Parallel(n_jobs=100)]: Done 992 out of 992 | elapsed:   11.5s finished


In [10]:
prediction = pd.DataFrame(predictions, columns=['target']).to_csv('prediction.csv', index=False)