In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split

# Load the competition data; the `id` column becomes the row index.
train_data = pd.read_csv('../input/30daysml/train.csv', index_col='id')
test_data_full = pd.read_csv('../input/30daysml/test.csv', index_col='id')

# Drop rows with a missing target, then separate the label from the features.
# New frames are built instead of mutating with `inplace=True`, so `train_data`
# stays untouched and the cell gives the same result when it is re-run.
X = train_data.dropna(axis=0, subset=['target'])
y = X.target
X = X.drop(columns=['target'])

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Names of the categorical feature columns: cat0 ... cat9.
cat_cols = [f'cat{i}' for i in range(10)]

# Categorical columns that are string-typed and have fewer than 10 distinct
# values in the training split (cheap to one-hot encode).
low_cad_cols = [
    cname
    for cname in X_train_full.columns
    if cname in cat_cols
    and X_train_full[cname].nunique() < 10
    and X_train_full[cname].dtype == 'object'
]

# Remaining (non-categorical) columns stored as int64 / float64.
num_cols = [
    cname
    for cname in X_train_full.columns
    if cname not in cat_cols
    and X_train_full[cname].dtype in ['int64', 'float64']
]

my_cols = low_cad_cols + num_cols

X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = test_data_full[my_cols].copy()

# One-hot encode each split independently.
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)

# Give validation and test exactly the training columns, in the same order.
# Dummy columns that only exist in the other split are dropped; dummy columns
# missing from the other split are created as 0 ("category not present")
# rather than NaN, which the previous left-join `align` produced.
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

X_train.head()

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


# Baseline: XGBoost regressor with default hyperparameters, using the
# `gpu_hist` tree method and a fixed seed.
model_1 = XGBRegressor(random_state=0, tree_method='gpu_hist')

model_1.fit(X_train, y_train)
predict_1 = model_1.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE happens to be
# symmetric, but keeping the documented order avoids surprises if the metric
# is later swapped for an asymmetric one.
mae = mean_absolute_error(y_valid, predict_1)
print(mae)

In [None]:
# l_rate: 0.05, 20 : 0.5748327773119019
# l_rate: 0.02, 20: 0.5747938985183658
# l_rate: 0.03, 20: 0.5747059324335401 
# l_rate: 0.04, 5: 0.5746730885315657
# l_rate: 0.04, 20: 0.57454748443436 best
model_2 = XGBRegressor(n_estimators=700, learning_rate=0.04, tree_method='gpu_hist')

# Train with early stopping monitored on the validation split.
# NOTE(review): newer XGBoost releases expect `early_stopping_rounds` to be
# passed to the constructor instead of `fit()` -- confirm against the
# installed version before upgrading.
model_2.fit(
    X_train, y_train,
    early_stopping_rounds=20,
    eval_set=[(X_valid, y_valid)],
    verbose=False,
)

# 200: 0.5748975211334032
# 20: 0.5748327773119019
preds_2 = model_2.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred) in that order.
mae = mean_absolute_error(y_valid, preds_2)

# Best validation MAE recorded so far (see the experiment log above); the
# current run is reported relative to it.
new_val = 0.57454748443436
if mae < new_val:
    print(mae)
elif mae == new_val:
    print('same')
else:
    print(mae, 'too big')

In [None]:
# Predict test-set targets with model_2; these predictions are what the final
# cell writes to the submission file.
test_predict2 = model_2.predict(X_test)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

def get_score(n_est, learning_rate=0.04, cv=4):
    """Return the mean cross-validated MAE of an XGBoost regressor.

    The model is evaluated on the notebook-level ``X_train`` / ``y_train``.

    Parameters
    ----------
    n_est : int
        Number of boosting rounds (``n_estimators``).
    learning_rate : float, default 0.04
        Learning rate passed to ``XGBRegressor``.
    cv : int, default 4
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    float
        Mean absolute error averaged over the folds (lower is better).
    """
    my_line = Pipeline(steps=[
        ('model', XGBRegressor(n_estimators=n_est,
                               learning_rate=learning_rate,
                               tree_method='gpu_hist')),
    ])
    # The scorer returns the *negated* MAE (higher is better), so flip the
    # sign to get ordinary, positive MAE values.
    scores = -1 * cross_val_score(
        my_line,
        X_train, y_train,
        cv=cv,
        scoring='neg_mean_absolute_error'
    )
    return scores.mean()

# Candidate tree counts; only the first six (500 through 1000) are scored below.
values = [500, 600, 700, 800, 900, 1000, 1150, 1300, 1500]

# Map each evaluated n_estimators value to the score returned by get_score.
results = dict((n_est, get_score(n_est)) for n_est in values[:6])

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the cross-validated MAE for each evaluated n_estimators value.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(list(results.keys()), list(results.values()), marker='o')
ax.set(
    title='Cross-validated MAE vs. n_estimators',
    xlabel='n_estimators',
    ylabel='Mean absolute error',
)
plt.show()

In [None]:
# Pick the key of `results` (an n_estimators value) with the smallest score.
best = min(results.items(), key=lambda item: item[1])[0]
print(best)

In [None]:
# Predict on the test features with the baseline model.
test_predict1 = model_1.predict(X_test)

# No labels are loaded for the test split in this notebook, so no error metric
# can be computed here. This cell previously printed `mae`, which was only the
# validation MAE left over from an earlier cell; preview the predictions instead.
test_predict1[:5]

In [None]:
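# Disabled (commented-out) brute-force search over n_estimators, learning rate
# and early-stopping rounds; kept for reference only.
# lev = 0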
# gle = {}
# l_rate = [0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.5, 1, 2]
# def mae_calc(n_est, l_rate, stops):
#     reg = XGBRegressor(n_estimators=n_est, learning_rate=l_rate, tree_method='gpu_hist')
#     reg.fit(X_train, y_train,
#            early_stopping_rounds=stops,
#            eval_set=[(X_valid, y_valid)],
#            verbose=False)
#     pred = reg.predict(X_valid)
#     return mean_absolute_error(pred, y_valid)



# def optimum_sol(l_rate_list):
#     global lev
#     for n in range(20,10001,30):
#         for learn in l_rate_list:
#             for rd in range(5,1000,10):
#                 if rd > n:
#                     break
#                 num = mae_calc(n, learn, rd)
#                 gle[lev] = [n, learn, rd, num]
#                 lev += 1
                
# optimum_sol(l_rate)
# minimum = min(gle.keys(), key=(lambda k: gle[k][3]))
# print(minimum)
                

In [None]:
# Assemble the submission table: the test index as `id` next to model_2's predictions.
output = pd.DataFrame(
    data={
        'id': X_test.index,
        'target': test_predict2,
    }
)

# Write the table to disk without the DataFrame's own row index.
output.to_csv(path_or_buf='submission.csv', index=False)