# Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Reading and Analyzing the data

In [2]:
# Read the train, test and sample-submission CSVs from ../input/30-days-of-ml
# (in that order) into their respective DataFrames.
training_data, testing_data, sample_submission_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/30-days-of-ml/train.csv",
        "../input/30-days-of-ml/test.csv",
        "../input/30-days-of-ml/sample_submission.csv",
    )
)

# Report the number of training rows, then preview the first rows as the
# cell's displayed value.
n_training_rows = len(training_data)
print("Length of training data: ", n_training_rows)
training_data.head()

Length of training data:  300000


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


### Assigning each training row to a cross-validation fold (K-fold)

In [3]:
# Label every training row with the cross-validation fold (0-4) in which it
# is held out for validation.
#
# KFold.split yields *positional* row indices. They are therefore written
# into a positional NumPy array instead of going through label-based `.loc`,
# which would only be correct while the frame has a default 0..n-1 index.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
fold_ids = np.full(len(training_data), -1, dtype=np.int64)  # -1 = not assigned yet
for fold, (train_ind, valid_ind) in enumerate(kf.split(training_data)):
    fold_ids[valid_ind] = fold
# Assigning a plain array is positional, so row i receives fold_ids[i].
training_data["kfold"] = fold_ids
print("Length of training data: ",len(training_data))
training_data.head()

Length of training data:  300000


Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634,1
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,3
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,1
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253,3
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226,4


### Splitting the Categorical and Numerical Data

In [4]:
# Model inputs are every column except the row id, the regression target and
# the fold label.
features = training_data.drop(columns=["id", "target", "kfold"])
features_list = list(features.columns)

# Partition the inputs by dtype: object columns are treated as categorical,
# int/float columns as numerical.
categorical_columns = list(features.select_dtypes(include="object").columns)
numerical_columns = list(features.select_dtypes(include=["int", "float"]).columns)

# Combined columns = numerical columns + categorical columns
print("Combined Columns List:", features_list)
print("Number of Combined Columns:", len(features_list))
print("Catergorical Columns List:", categorical_columns)
print("Number of Categorical Columns:", len(categorical_columns))
print("Numerical Columns List: ", numerical_columns)
print("Number of Numerical Columns:", len(numerical_columns))

Combined Columns List: ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']
Number of Combined Columns: 24
Catergorical Columns List: ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']
Number of Categorical Columns: 10
Numerical Columns List:  ['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']
Number of Numerical Columns: 14


# Making and Training the Model

In [5]:
# Cross-validated training loop.
# For each fold: fit an XGBoost regressor on the remaining folds, score it on
# the held-out fold, and predict the test set. The per-fold test predictions
# are collected in `final_predictions`.
final_predictions = []  # one array of test-set predictions per fold
testing_data = testing_data[features_list]

# The hyper-parameters are the same for every fold, so they are defined once
# outside the loop instead of being rebuilt on each iteration.
xgb_params = {
    'random_state': 1,
    'n_jobs': 4,
    'booster': 'gbtree',
    'n_estimators': 10000,
    'learning_rate': 0.034682894846408095,
    'reg_lambda': 1.224383455634919,
    'reg_alpha': 36.043214512614476,
    'subsample': 0.9219010649982458,
    'colsample_bytree': 0.11247495917687526,
    'max_depth': 3,
    'min_child_weight': 6,
}

# Take the fold labels from the data rather than repeating the literal 5, so
# this loop stays in sync with however many folds were assigned.
fold_labels = sorted(int(label) for label in training_data["kfold"].unique())

for fold in fold_labels:
    # Rows labelled `fold` are held out for validation; all other rows train.
    train_rows = training_data[training_data.kfold != fold].reset_index(drop=True)
    valid_rows = training_data[training_data.kfold == fold].reset_index(drop=True)
    y_train = train_rows.target
    y_valid = valid_rows.target

    # Explicit copies: the categorical columns are overwritten below, and the
    # writes must not target a selection of another frame
    # (avoids SettingWithCopyWarning).
    x_train = train_rows[features_list].copy()
    x_valid = valid_rows[features_list].copy()
    x_test = testing_data.copy()

    # Encode the categorical columns. The encoder is fitted on the training
    # rows only and then applied unchanged to the validation and test rows.
    encoder = OrdinalEncoder()
    x_train[categorical_columns] = encoder.fit_transform(x_train[categorical_columns])
    x_valid[categorical_columns] = encoder.transform(x_valid[categorical_columns])
    x_test[categorical_columns] = encoder.transform(x_test[categorical_columns])

    # Train with early stopping monitored on the held-out fold.
    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` on
    # the estimator constructor rather than in fit() - confirm against the
    # installed version before upgrading.
    model = XGBRegressor(**xgb_params)
    model.fit(x_train, y_train, early_stopping_rounds=300, eval_set=[(x_valid, y_valid)], verbose=1000)

    # Score the held-out fold. RMSE is computed as sqrt(MSE) because the
    # `squared=False` argument is deprecated/removed in newer scikit-learn.
    validation_prediction = model.predict(x_valid)
    error = np.sqrt(mean_squared_error(y_valid, validation_prediction))
    print("RMSE of Fold "+ str(fold) + ":", error)

    # Keep this fold's test-set predictions for averaging later.
    test_predictions = model.predict(x_test)
    final_predictions.append(test_predictions)


[0]	validation_0-rmse:7.50777
[1000]	validation_0-rmse:0.72607
[2000]	validation_0-rmse:0.72180
[3000]	validation_0-rmse:0.72013
[4000]	validation_0-rmse:0.71930
[5000]	validation_0-rmse:0.71878
[6000]	validation_0-rmse:0.71850
[7000]	validation_0-rmse:0.71832
[8000]	validation_0-rmse:0.71822
[9000]	validation_0-rmse:0.71820
[9145]	validation_0-rmse:0.71820
MSE of Fold 0: 0.7181837140869914
[0]	validation_0-rmse:7.51067
[1000]	validation_0-rmse:0.72319
[2000]	validation_0-rmse:0.71898
[3000]	validation_0-rmse:0.71725
[4000]	validation_0-rmse:0.71632
[5000]	validation_0-rmse:0.71584
[6000]	validation_0-rmse:0.71561
[7000]	validation_0-rmse:0.71548
[8000]	validation_0-rmse:0.71544
[8853]	validation_0-rmse:0.71543
MSE of Fold 1: 0.7154173518460784
[0]	validation_0-rmse:7.51249
[1000]	validation_0-rmse:0.72856
[2000]	validation_0-rmse:0.72395
[3000]	validation_0-rmse:0.72216
[4000]	validation_0-rmse:0.72119
[5000]	validation_0-rmse:0.72063
[6000]	validation_0-rmse:0.72034
[7000]	validation

# Making the submission file

In [6]:
# Blend the folds: place each fold's test predictions in its own column and
# average across columns, giving one prediction per test row.
stacked_predictions = np.column_stack(final_predictions)
predictions = stacked_predictions.mean(axis=1)

# Use bracket assignment for the column: attribute-style assignment
# (`df.target = ...`) does not create a column if "target" is missing, so the
# predictions could silently be left out of the written file.
sample_submission_data["target"] = predictions
sample_submission_data.to_csv("submission.csv", index=False)