In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
pd.set_option("display.max_rows", 100)

In [2]:
train = pd.read_csv("SUPCOM_train.csv")
test = pd.read_csv("SUPCOM_test.csv")

In [3]:
train.head()

Unnamed: 0,BCT_CODBUR,CTR_MATFIS,CTR_CATEGO_X,FJU_CODFJU,CTR_CESSAT,ACT_CODACT,CTR_OBLDIR,CTR_OBLACP,CTR_OBLRES,CTR_OBLFOP,...,target,FAC_MNTPRI_F,FAC_MFODEC_F,FAC_MNTDCO_F,FAC_MNTTVA_F,FAC_MNTPRI_C,FAC_MFODEC_C,FAC_MNTDCO_C,FAC_MNTTVA_C,id
0,44,6210,C,99,0,3707,4,2,2,2,...,16.639203,,,,,,,,,train_id17437
1,401,14383,M,22,0,6105,4,2,2,2,...,17.827703,,,,,,,,,train_id5086
2,243,11555,M,23,0,5751,4,2,2,2,...,15.473503,25387000.0,0.0,0.0,4571280.0,,,,,train_id1443
3,72,6175,C,99,0,4402,4,2,2,2,...,0.0,,,,,,,,,train_id15469
4,42,1417,M,23,0,6302,4,2,2,2,...,14.286244,,,,,,,,,train_id14368


In [4]:
missing_count = train.isna().sum()

In [5]:
len(train)

21295

In [6]:
#finding variables almost completely filled with missing values
toDrop = []
for name in missing_count.index:
    if missing_count[name] >18000:
        toDrop.append(name)
toDrop

['SND_MNTPRD_E',
 'SND_MNTTVA_E',
 'SND_MNTDRC_E',
 'SND_MNTAVA_E',
 'SND_MNTTAX_E',
 'SND_MNTPAY_E',
 'SND_MNTAIR_E',
 'SND_MNTPRD_A',
 'SND_MNTTVA_A',
 'SND_MNTDRC_A',
 'SND_MNTAVA_A',
 'SND_MNTTAX_A',
 'SND_MNTPAY_A',
 'SND_MNTAIR_A',
 'FAC_MNTPRI_F',
 'FAC_MFODEC_F',
 'FAC_MNTDCO_F',
 'FAC_MNTTVA_F',
 'FAC_MNTPRI_C',
 'FAC_MFODEC_C',
 'FAC_MNTDCO_C',
 'FAC_MNTTVA_C']

In [7]:
train.drop(toDrop, axis = 1, inplace=True)

In [8]:
corr_res = train.corr(method = "kendall")["target"]

In [19]:
toKeep = corr_res.sort_values(ascending=False)[abs(corr_res) > 0.05].index[1:]

In [20]:
len(toKeep)

16

In [21]:
train_X = train[toKeep]
train_y = train["target"]

In [14]:
train_X.head()

Unnamed: 0,TVA_MNTPAY,CTR_OBLDLI,TVA_CHAF18,AX2_HONORA,CTR_OBLDIR,TVA_CAFSUS,ACT_CODACT,CTR_CESSAT,SND_MNTPRD_I,ADB_MNTORD,TVA_CHAFF6,TVA_DEDRSM,TVA_BASRSM,TVA_CRDINI,TVA_RESTIT,TVA_CRDFIN
0,35416705.0,1.0,755492735.0,2950000.0,4,610000.0,3707,0,192717832.0,831900.0,0.0,3984975.0,7969950.0,0.0,0.0,0.0
1,2130942.0,1.0,0.0,203353885.0,4,0.0,6105,0,,,0.0,0.0,0.0,0.0,0.0,0.0
2,256128.0,1.0,20373500.0,0.0,4,23461000.0,5751,0,,,0.0,0.0,0.0,0.0,0.0,0.0
3,318095.0,2.0,338481203.0,0.0,4,0.0,4402,0,,,61381586.0,0.0,0.0,26974794.0,0.0,17804671.0
4,0.0,1.0,0.0,0.0,4,0.0,6302,0,,,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
len(train_X.columns)

16

In [35]:
params = {"n_estimators": np.arange(100, 500, 10), "max_depth": np.arange(1,10, 2), "subsample": np.arange(0.5,1.0,0.1), "max_features": np.arange(10,16)}

rs = RandomizedSearchCV(GradientBoostingRegressor(random_state=123), params, n_iter = 100, scoring = "neg_mean_squared_error", n_jobs=-1, cv=3, random_state=123)


# rs.fit(train_X, train_y)
full_pipe = make_pipeline(SimpleImputer(), StandardScaler(), rs)

In [57]:
full_pipe.fit(train_X, train_y)

Pipeline(memory=None,
     steps=[('simpleimputer', SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean',
       verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('randomizedsearchcv', RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimat...=True,
          return_train_score='warn', scoring='neg_mean_squared_error',
          verbose=0))])

In [59]:
test1 = test[toKeep]
test1

Unnamed: 0,TVA_MNTPAY,CTR_OBLDLI,TVA_CHAF18,AX2_HONORA,CTR_OBLDIR,TVA_CAFSUS,ACT_CODACT,CTR_CESSAT,SND_MNTPRD_I,ADB_MNTORD,TVA_CHAFF6,TVA_DEDRSM,TVA_BASRSM,TVA_CRDINI,TVA_RESTIT,TVA_CRDFIN
0,6432909.0,1.0,1.937050e+09,0.000000e+00,4,0.000000e+00,3601,0,,3.408111e+07,2.869537e+08,0.0,0.0,0.000000e+00,0.0,0.000000e+00
1,0.0,1.0,0.000000e+00,3.738790e+07,4,0.000000e+00,3912,0,,,0.000000e+00,0.0,0.0,7.200000e+03,0.0,1.800000e+04
2,0.0,1.0,0.000000e+00,0.000000e+00,4,0.000000e+00,4221,0,,,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.000000e+00
3,266049.0,1.0,2.067658e+08,0.000000e+00,4,0.000000e+00,3203,0,,7.500000e+05,0.000000e+00,0.0,0.0,2.861370e+05,0.0,6.666490e+05
4,3554217.0,1.0,0.000000e+00,,4,0.000000e+00,6308,0,,,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.000000e+00
5,,1.0,,,4,,4212,0,,,,,,,,
6,17107532.0,1.0,3.097870e+08,0.000000e+00,4,0.000000e+00,5703,0,,,0.000000e+00,108000.0,216000.0,0.000000e+00,0.0,0.000000e+00
7,0.0,1.0,0.000000e+00,,6,0.000000e+00,6105,2,,,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.000000e+00
8,610452.0,1.0,0.000000e+00,1.100000e+06,4,0.000000e+00,6272,0,,,2.944016e+07,0.0,0.0,1.257529e+06,0.0,0.000000e+00
9,5572985.0,1.0,0.000000e+00,3.665594e+07,4,0.000000e+00,6302,0,,,0.000000e+00,0.0,0.0,0.000000e+00,0.0,0.000000e+00


In [60]:
pred = full_pipe.predict(test1)
pred

array([15.35943426, 14.19219538, 14.66021231, ..., 13.73069519,
       10.67659132, 13.68590521])

In [68]:
#getting submission file
pd.DataFrame({"id":test["id"].values, "target": pred}).to_csv("subm1.csv", index = False)
#it returns 5.96 score upon submission