In [14]:
import optuna

from lightgbm import LGBMRegressor
from optuna.samplers import TPESampler
import numpy as np 
import pandas as pd
from datetime import datetime
from datetime import date
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
import plotly.express as px

from sklearn.metrics import mean_squared_error


In [15]:
data = pd.read_csv("internship_train.csv")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,target
0,236,488,16,221,382,97,-4.472136,0.107472,0,132,...,13.340874,0.870542,1.962937,7.466666,11.547794,8.822916,9.046424,7.895535,11.010677,20.107472
1,386,206,357,232,1,198,7.81025,0.763713,1,143,...,12.484882,7.16868,2.885415,12.413973,10.260494,10.091351,9.270888,3.173994,13.921871,61.763713
2,429,49,481,111,111,146,8.602325,0.651162,1,430,...,14.030257,0.39497,8.160625,12.592059,8.937577,2.265191,11.255721,12.794841,12.080951,74.651162
3,414,350,481,370,208,158,8.306624,0.424645,1,340,...,2.789577,6.416708,10.549814,11.456437,6.468099,2.519049,0.258284,9.317696,5.383098,69.424645
4,318,359,20,218,317,301,8.124038,0.767304,1,212,...,1.88656,1.919999,2.268203,0.149421,4.105907,10.416291,6.816217,8.58696,4.512419,66.767304


In [16]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,90000.0,249.423944,144.336393,0.0,125.0,250.0,374.0,499.0
1,90000.0,250.236267,144.0222,0.0,126.0,251.0,375.0,499.0
2,90000.0,248.637289,144.107577,0.0,124.0,248.0,374.0,499.0
3,90000.0,249.7366,144.284945,0.0,125.0,250.0,375.0,499.0
4,90000.0,249.436178,143.941581,0.0,125.0,250.0,373.0,499.0
5,90000.0,249.656167,144.329168,0.0,124.0,250.0,374.0,499.0
6,90000.0,-0.011402,7.038171,-9.949874,-7.071068,0.0,7.0,9.949874
7,90000.0,0.498548,0.288682,1.4e-05,0.248932,0.497136,0.747513,0.999987
8,90000.0,0.499189,0.500002,0.0,0.0,0.0,1.0,1.0
9,90000.0,249.842033,144.612718,0.0,124.0,250.0,376.0,499.0


In [17]:
y, X = data['target'], data.drop('target', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## LGBMRegressor with Optuna

In [18]:
class Optimizer:
    def __init__(self, trials=50):
        self.trials = trials
        self.sampler = TPESampler(seed=34)
        
    def objective(self, trial):
        model = create_model(trial)
        model.fit(X_train, y_train)
        y_predict = model.predict(X_test)
        
        return np.sqrt(mean_squared_error(y_test, y_predict))
            
    def optimize(self):
        study = optuna.create_study(
            direction="minimize", 
            sampler=self.sampler
        )
        study.optimize(
            self.objective, 
            n_trials=self.trials
        )
        return study.best_params

In [23]:
def create_model(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    num_leaves = trial.suggest_int("num_leaves", 20, 100) 
    min_split_gain = trial.suggest_uniform('min_split_gain', 0., 2)
    min_child_samples = trial.suggest_int("min_child_samples", 10, 40)
    reg_alpha = trial.suggest_uniform('reg_alpha', 0., 2)
    reg_lambda = trial.suggest_uniform('reg_lambda', 0., 2)
    random_state = trial.suggest_int("random_state", 10, 1000) 
    learning_rate = trial.suggest_uniform('learning_rate', 0.0000001, 1)
    
    model = LGBMRegressor(
        n_estimators=n_estimators, 
        reg_alpha = reg_alpha,
        min_split_gain = min_split_gain,
        min_child_samples = min_child_samples,
        reg_lambda = reg_lambda,
        random_state = random_state,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        n_jobs = 30,
    )
    return model

optimizer = Optimizer(100)
optuna_params = optimizer.optimize()
optuna_params

[32m[I 2020-12-11 15:36:18,766][0m A new study created in memory with name: no-name-36593ee7-b28a-4563-97f4-fbf9efa07af1[0m
[32m[I 2020-12-11 15:36:20,448][0m Trial 0 finished with value: 0.062437118526032356 and parameters: {'n_estimators': 517, 'num_leaves': 88, 'min_split_gain': 0.8955257090015836, 'min_child_samples': 21, 'reg_alpha': 1.8723170721698466, 'reg_lambda': 0.8531487634710668, 'random_state': 551, 'learning_rate': 0.4083401719561955}. Best is trial 0 with value: 0.062437118526032356.[0m
[32m[I 2020-12-11 15:36:23,564][0m Trial 1 finished with value: 0.18385601169305624 and parameters: {'n_estimators': 897, 'num_leaves': 40, 'min_split_gain': 1.296945569646381, 'min_child_samples': 40, 'reg_alpha': 1.750977167621787, 'reg_lambda': 0.775269475698364, 'random_state': 991, 'learning_rate': 0.8265113477681999}. Best is trial 0 with value: 0.062437118526032356.[0m
[32m[I 2020-12-11 15:36:25,635][0m Trial 2 finished with value: 0.0805672723201253 and parameters: {'n_

[32m[I 2020-12-11 15:37:11,904][0m Trial 21 finished with value: 0.01808164175761232 and parameters: {'n_estimators': 450, 'num_leaves': 48, 'min_split_gain': 0.02031934971943536, 'min_child_samples': 19, 'reg_alpha': 0.10418604515297064, 'reg_lambda': 1.711899637550728, 'random_state': 256, 'learning_rate': 0.09722376976808454}. Best is trial 21 with value: 0.01808164175761232.[0m
[32m[I 2020-12-11 15:37:15,326][0m Trial 22 finished with value: 0.01988257302904815 and parameters: {'n_estimators': 463, 'num_leaves': 47, 'min_split_gain': 0.23768230604632043, 'min_child_samples': 19, 'reg_alpha': 0.027155263287608575, 'reg_lambda': 1.9844819977818136, 'random_state': 19, 'learning_rate': 0.08504789505688197}. Best is trial 21 with value: 0.01808164175761232.[0m
[32m[I 2020-12-11 15:37:18,468][0m Trial 23 finished with value: 0.01587971637077264 and parameters: {'n_estimators': 479, 'num_leaves': 47, 'min_split_gain': 0.23306729498207232, 'min_child_samples': 19, 'reg_alpha': 0.0

[32m[I 2020-12-11 15:39:05,494][0m Trial 43 finished with value: 0.028904895294254405 and parameters: {'n_estimators': 496, 'num_leaves': 57, 'min_split_gain': 0.21037483267488136, 'min_child_samples': 18, 'reg_alpha': 0.33192563958024207, 'reg_lambda': 1.7456680335806352, 'random_state': 203, 'learning_rate': 0.17782464225743339}. Best is trial 24 with value: 0.015020189698270417.[0m
[32m[I 2020-12-11 15:39:10,046][0m Trial 44 finished with value: 0.018493661512390156 and parameters: {'n_estimators': 622, 'num_leaves': 62, 'min_split_gain': 0.3477409666937839, 'min_child_samples': 26, 'reg_alpha': 1.958059894404368, 'reg_lambda': 1.8634090976116737, 'random_state': 95, 'learning_rate': 0.05214323544320515}. Best is trial 24 with value: 0.015020189698270417.[0m
[32m[I 2020-12-11 15:39:13,566][0m Trial 45 finished with value: 0.027643762165739955 and parameters: {'n_estimators': 568, 'num_leaves': 34, 'min_split_gain': 0.1028664127889144, 'min_child_samples': 23, 'reg_alpha': 0.

[32m[I 2020-12-11 15:40:40,965][0m Trial 65 finished with value: 0.0730629292537712 and parameters: {'n_estimators': 672, 'num_leaves': 37, 'min_split_gain': 0.059354544465700376, 'min_child_samples': 19, 'reg_alpha': 0.3295908182397678, 'reg_lambda': 1.5232931599418638, 'random_state': 82, 'learning_rate': 0.5455069537131143}. Best is trial 24 with value: 0.015020189698270417.[0m
[32m[I 2020-12-11 15:40:45,573][0m Trial 66 finished with value: 0.017870651317080457 and parameters: {'n_estimators': 754, 'num_leaves': 53, 'min_split_gain': 0.5591793187642378, 'min_child_samples': 20, 'reg_alpha': 0.07567303155506751, 'reg_lambda': 1.3024255823267037, 'random_state': 147, 'learning_rate': 0.0351220410105038}. Best is trial 24 with value: 0.015020189698270417.[0m
[32m[I 2020-12-11 15:40:51,078][0m Trial 67 finished with value: 0.01772289184270343 and parameters: {'n_estimators': 812, 'num_leaves': 40, 'min_split_gain': 0.0014031130365750322, 'min_child_samples': 21, 'reg_alpha': 0.

[32m[I 2020-12-11 15:42:36,472][0m Trial 87 finished with value: 0.019537114414606854 and parameters: {'n_estimators': 610, 'num_leaves': 90, 'min_split_gain': 0.041072433825196694, 'min_child_samples': 14, 'reg_alpha': 0.7946508145985353, 'reg_lambda': 1.0847200954828093, 'random_state': 265, 'learning_rate': 0.11017866065325835}. Best is trial 76 with value: 0.014739289213477.[0m
[32m[I 2020-12-11 15:42:40,684][0m Trial 88 finished with value: 0.017255699703424734 and parameters: {'n_estimators': 581, 'num_leaves': 88, 'min_split_gain': 0.16225441927991868, 'min_child_samples': 15, 'reg_alpha': 1.0742832030404523, 'reg_lambda': 0.9802879838849297, 'random_state': 196, 'learning_rate': 0.07926481752421878}. Best is trial 76 with value: 0.014739289213477.[0m
[32m[I 2020-12-11 15:42:49,239][0m Trial 89 finished with value: 0.013600774763179209 and parameters: {'n_estimators': 536, 'num_leaves': 97, 'min_split_gain': 0.09930936858409187, 'min_child_samples': 10, 'reg_alpha': 0.55

{'n_estimators': 409,
 'num_leaves': 100,
 'min_split_gain': 0.00511731492494083,
 'min_child_samples': 11,
 'reg_alpha': 0.7367797493656292,
 'reg_lambda': 1.2307545324579827,
 'random_state': 354,
 'learning_rate': 0.02289880674412864}

In [26]:
model = LGBMRegressor(**optuna_params)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_predict))

0.012583749772719813

So, the best score of LGBMRegressor is 0.012583749772719813

## RandomForestRegressor

In [27]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor()

In [33]:
y_predict = model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_predict))

0.0038118534603215726

In [34]:
np.set_printoptions(suppress=True)
model.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.99990079, 0.00009916, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        ])

## Exploring data features

Due to feature importances, only '6' and '7' columns have influence on prediction. Let's explore them

In [36]:
exp_data = data[['6', '7', 'target']]
exp_data

Unnamed: 0,6,7,target
0,-4.472136,0.107472,20.107472
1,7.810250,0.763713,61.763713
2,8.602325,0.651162,74.651162
3,8.306624,0.424645,69.424645
4,8.124038,0.767304,66.767304
...,...,...,...
89995,4.898979,0.563878,24.563878
89996,-3.605551,0.861690,13.861690
89997,6.164414,0.633704,38.633704
89998,-3.605551,0.687309,13.687309


The fractional part of the "7" column coincides with the fractional part of target.

In [42]:
exp_data.sort_values(by='6').iloc[:10]

Unnamed: 0,6,7,target
34550,-9.949874,0.442097,99.442097
84890,-9.949874,0.775673,99.775673
84886,-9.949874,0.331428,99.331428
12380,-9.949874,0.04127,99.04127
37199,-9.949874,0.230843,99.230843
78327,-9.949874,0.365398,99.365398
7743,-9.949874,0.453355,99.453355
61802,-9.949874,0.777411,99.777411
50022,-9.949874,0.227702,99.227702
22803,-9.949874,0.769055,99.769055


In [44]:
exp_data.sort_values(by='6').iloc[-10:]

Unnamed: 0,6,7,target
34374,9.949874,0.659565,99.659565
49870,9.949874,0.71833,99.71833
26770,9.949874,0.684514,99.684514
72929,9.949874,0.443141,99.443141
32369,9.949874,0.063147,99.063147
55514,9.949874,0.148961,99.148961
22963,9.949874,0.704081,99.704081
72873,9.949874,0.085607,99.085607
65621,9.949874,0.020293,99.020293
53108,9.949874,0.193455,99.193455


In [46]:
fig = px.scatter(exp_data, x='6', y='target')
fig.show()

As we can see in plot, there is quadratic dependence between target values and '6' column. Let's make a prediction for hidden test dataset

In [47]:
def predict(X):
    return data_test['6'] ** 2 + data_test['7']

In [None]:
data_test = pd.read_csv("internship_hidden_test.csv")
predictions = predict(X_test)
pd.DataFrame(predictions, columns=['target']).to_csv('prediction.csv', index=False)