In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
df_train = pd.read_csv("SUPCOM_Train.csv")
df_test = pd.read_csv("SUPCOM_Test.csv")

In [3]:
df_train.head()

Unnamed: 0,BCT_CODBUR,CTR_MATFIS,CTR_CATEGO_X,FJU_CODFJU,CTR_CESSAT,ACT_CODACT,CTR_OBLDIR,CTR_OBLACP,CTR_OBLRES,CTR_OBLFOP,...,target,FAC_MNTPRI_F,FAC_MFODEC_F,FAC_MNTDCO_F,FAC_MNTTVA_F,FAC_MNTPRI_C,FAC_MFODEC_C,FAC_MNTDCO_C,FAC_MNTTVA_C,id
0,44,6210,C,99,0,3707,4,2,2,2,...,16.639203,,,,,,,,,train_id17437
1,401,14383,M,22,0,6105,4,2,2,2,...,17.827703,,,,,,,,,train_id5086
2,243,11555,M,23,0,5751,4,2,2,2,...,15.473503,25387000.0,0.0,0.0,4571280.0,,,,,train_id1443
3,72,6175,C,99,0,4402,4,2,2,2,...,0.0,,,,,,,,,train_id15469
4,42,1417,M,23,0,6302,4,2,2,2,...,14.286244,,,,,,,,,train_id14368


In [4]:
df_test.head()

Unnamed: 0,BCT_CODBUR,CTR_MATFIS,CTR_CATEGO_X,FJU_CODFJU,CTR_CESSAT,ACT_CODACT,CTR_OBLDIR,CTR_OBLACP,CTR_OBLRES,CTR_OBLFOP,...,RES_ANNIMP,FAC_MNTPRI_F,FAC_MFODEC_F,FAC_MNTDCO_F,FAC_MNTTVA_F,FAC_MNTPRI_C,FAC_MFODEC_C,FAC_MNTDCO_C,FAC_MNTTVA_C,id
0,238,5016,M,22,0,3601,4,2,2,2,...,2017,,,,,,,,,test_id1120
1,173,3757,M,22,0,3912,4,2,2,2,...,2017,,,,,,,,,test_id1680
2,125,15482,C,99,0,4221,4,2,2,1,...,2018,,,,,,,,,test_id1063
3,9,11722,C,99,0,3203,4,2,2,2,...,2015,,,,,,,,,test_id3731
4,153,6557,P,99,0,6308,4,2,2,2,...,2014,,,,,,,,,test_id9766


In [5]:
# Get list of categorical variables
s = (df_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['CTR_CATEGO_X', 'id']


In [6]:
from sklearn.preprocessing import LabelEncoder

# Make copy to avoid changing original data 
label_df_train = df_train.copy()


# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in object_cols:
    label_df_train[col] = label_encoder.fit_transform(df_train[col])


In [7]:
label_df_train.head()

Unnamed: 0,BCT_CODBUR,CTR_MATFIS,CTR_CATEGO_X,FJU_CODFJU,CTR_CESSAT,ACT_CODACT,CTR_OBLDIR,CTR_OBLACP,CTR_OBLRES,CTR_OBLFOP,...,target,FAC_MNTPRI_F,FAC_MFODEC_F,FAC_MNTDCO_F,FAC_MNTTVA_F,FAC_MNTPRI_C,FAC_MFODEC_C,FAC_MNTDCO_C,FAC_MNTTVA_C,id
0,44,6210,0,99,0,3707,4,2,2,2,...,16.639203,,,,,,,,,7036
1,401,14383,1,22,0,6105,4,2,2,2,...,17.827703,,,,,,,,,16699
2,243,11555,1,23,0,5751,4,2,2,2,...,15.473503,25387000.0,0.0,0.0,4571280.0,,,,,4206
3,72,6175,0,99,0,4402,4,2,2,2,...,0.0,,,,,,,,,5181
4,42,1417,1,23,0,6302,4,2,2,2,...,14.286244,,,,,,,,,4147


In [8]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer(strategy='median')
imputed_df_train = pd.DataFrame(my_imputer.fit_transform(label_df_train))

# Imputation removed column names; put them back
imputed_df_train.columns = label_df_train.columns


In [9]:
X = imputed_df_train.drop('target',axis=1)
y = imputed_df_train['target']

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [12]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X_train,y_train)

RandomForestRegressor()

In [13]:
# Evaluation function
from sklearn.metrics import mean_squared_log_error,mean_absolute_error,r2_score
def rmsle(y_test,y_preds):
    return np.sqrt(mean_squared_log_error(y_test,y_preds))
# Create function to evaluate model on a few different levels
def show_scores(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_test)
    scores = {
        "Training MAE": mean_absolute_error(y_train,train_preds),
        "Valid MAE":mean_absolute_error(y_test,val_preds),
        "Training RMSLE": rmsle(y_train,train_preds),
        "Valid RMSLE": rmsle(y_test,val_preds),
        "Training R^2": r2_score(y_train,train_preds),
        "Valid R^2": r2_score(y_test,val_preds)
    }
    return scores

In [14]:
show_scores(model)

{'Training MAE': 1.5938378953725876,
 'Valid MAE': 4.359427569007461,
 'Training RMSLE': 0.6756463661739299,
 'Valid RMSLE': 1.1259037617569203,
 'Training R^2': 0.9035726992406755,
 'Valid R^2': 0.3160454483914824}

In [16]:
from sklearn.ensemble import ExtraTreesRegressor,AdaBoostRegressor,GradientBoostingRegressor
model_1 = ExtraTreesRegressor()
model_1.fit(X_train,y_train)
show_scores(model_1)

{'Training MAE': 8.620373596548573e-13,
 'Valid MAE': 4.404606537787039,
 'Training RMSLE': 2.5797376862381856e-12,
 'Valid RMSLE': 1.1241389040310508,
 'Training R^2': 1.0,
 'Valid R^2': 0.31439506211270174}

In [17]:
model_2 = AdaBoostRegressor()
model_2.fit(X_train,y_train)
show_scores(model_2)

{'Training MAE': 5.307324438478788,
 'Valid MAE': 5.401255638217685,
 'Training RMSLE': 1.1910224582571227,
 'Valid RMSLE': 1.222569353339566,
 'Training R^2': 0.18508778459612252,
 'Valid R^2': 0.17548048655683812}

In [18]:
model_3 = GradientBoostingRegressor()
model_3.fit(X_train,y_train)
show_scores(model_3)

ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

In [19]:
df_test.head()

Unnamed: 0,BCT_CODBUR,CTR_MATFIS,CTR_CATEGO_X,FJU_CODFJU,CTR_CESSAT,ACT_CODACT,CTR_OBLDIR,CTR_OBLACP,CTR_OBLRES,CTR_OBLFOP,...,RES_ANNIMP,FAC_MNTPRI_F,FAC_MFODEC_F,FAC_MNTDCO_F,FAC_MNTTVA_F,FAC_MNTPRI_C,FAC_MFODEC_C,FAC_MNTDCO_C,FAC_MNTTVA_C,id
0,238,5016,M,22,0,3601,4,2,2,2,...,2017,,,,,,,,,test_id1120
1,173,3757,M,22,0,3912,4,2,2,2,...,2017,,,,,,,,,test_id1680
2,125,15482,C,99,0,4221,4,2,2,1,...,2018,,,,,,,,,test_id1063
3,9,11722,C,99,0,3203,4,2,2,2,...,2015,,,,,,,,,test_id3731
4,153,6557,P,99,0,6308,4,2,2,2,...,2014,,,,,,,,,test_id9766


In [20]:
# Get list of categorical variables
s = (df_test.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

Categorical variables:
['CTR_CATEGO_X', 'id']


In [21]:
from sklearn.preprocessing import LabelEncoder

# Make copy to avoid changing original data 
label_df_test = df_test.copy()


# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in object_cols:
    label_df_test[col] = label_encoder.fit_transform(df_test[col])


In [22]:
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer(strategy='median')
imputed_df_test = pd.DataFrame(my_imputer.fit_transform(label_df_test))

# Imputation removed column names; put them back
imputed_df_test.columns = label_df_test.columns


In [23]:
imputed_df_test.shape,X_train.shape

((7517, 120), (17036, 120))

In [24]:
predictions = model_1.predict(imputed_df_test)
predictions

array([14.97692684, 13.99542393,  6.48621153, ..., 14.01722297,
        9.62243135, 11.68694434])

In [25]:
# Loading test predictions into csv
submission = pd.DataFrame({'Id':df_test["id"],'target': predictions});
submission.to_csv('submission_new_Fraud.csv', index=False)
submission

Unnamed: 0,Id,target
0,test_id1120,14.976927
1,test_id1680,13.995424
2,test_id1063,6.486212
3,test_id3731,14.480238
4,test_id9766,14.023932
...,...,...
7512,test_id1497,14.478915
7513,test_id10025,9.874051
7514,test_id1045,14.017223
7515,test_id10695,9.622431


In [26]:
from sklearn.model_selection import RandomizedSearchCV
# Different RandomForestRegressor hyperparameters
rf_grid = {
    "n_estimators":np.arange(10,200,10),
    "max_depth":[None,3,5,10,100],
    "min_samples_split":np.arange(2,20,2),
    "min_samples_leaf":np.arange(1,20,2),
    "max_features": [0.5,1,"sqrt","auto"],
    "max_samples":[10000]
}
# Instanciate RandomizedSearchCV
rs_model = RandomizedSearchCV(
ExtraTreesRegressor(n_jobs=-1,
                     random_state=42),
                    param_distributions = rf_grid,
                     n_iter=50,
                    cv=5,
                    verbose=True)
rs_model.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:  6.8min finished


RandomizedSearchCV(cv=5,
                   estimator=ExtraTreesRegressor(n_jobs=-1, random_state=42),
                   n_iter=50,
                   param_distributions={'max_depth': [None, 3, 5, 10, 100],
                                        'max_features': [0.5, 1, 'sqrt',
                                                         'auto'],
                                        'max_samples': [10000],
                                        'min_samples_leaf': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19]),
                                        'min_samples_split': array([ 2,  4,  6,  8, 10, 12, 14, 16, 18]),
                                        'n_estimators': array([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100, 110, 120, 130,
       140, 150, 160, 170, 180, 190])},
                   verbose=True)

In [27]:
rs_model.best_params_

{'n_estimators': 160,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'max_samples': 10000,
 'max_features': 'auto',
 'max_depth': None}

In [28]:
show_scores(rs_model)

{'Training MAE': 1.9221449818266538,
 'Valid MAE': 4.457544791354345,
 'Training RMSLE': 0.7595333549743811,
 'Valid RMSLE': 1.1498235825415344,
 'Training R^2': 0.8582076078912135,
 'Valid R^2': 0.3073758633109479}

In [29]:
ideal_model = ExtraTreesRegressor(
n_estimators=160,
min_samples_split=4,
min_samples_leaf=3,
max_samples=10000,
max_features='auto',
max_depth=None)
ideal_model.fit(X_train,y_train)
show_scores(ideal_model)

{'Training MAE': 1.9271558295851354,
 'Valid MAE': 4.457726647580672,
 'Training RMSLE': 0.7609863490732459,
 'Valid RMSLE': 1.1498741689282326,
 'Training R^2': 0.8575535634987181,
 'Valid R^2': 0.3079281562069187}

In [32]:
predictions1= model_1.predict(imputed_df_test)
predictions2= rs_model.predict(imputed_df_test)
predictions3= ideal_model.predict(imputed_df_test)

In [35]:
# Loading test predictions into csv
submission = pd.DataFrame({'Id':df_test["id"],'target': predictions3});
submission.to_csv('submission_new_Fraud_3.csv', index=False)
submission

Unnamed: 0,Id,target
0,test_id1120,15.549853
1,test_id1680,14.254450
2,test_id1063,10.107281
3,test_id3731,14.456434
4,test_id9766,13.528979
...,...,...
7512,test_id1497,14.165221
7513,test_id10025,10.547638
7514,test_id1045,15.780850
7515,test_id10695,12.246799
