# Porto Seguro Kaggle Competition

<img src="image.jpg">

In [None]:
#Importing Essential libraries
import warnings
warnings.filterwarnings('ignore')
from Modules import utils as u
from Modules import FeatureEngg as fe
from Modules import ModelBasedFeatureEngg as mbf
from Modules.Ensemble import Create_ensemble
import pandas as pd
import numpy as np
from tqdm import tqdm
from itertools import combinations
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import xgboost as xgb
from sklearn.model_selection import KFold
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from scipy import sparse as ssp
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
import pickle

In [None]:
# Read the raw competition files.
data_tr = pd.read_csv('train.csv')
data_te = pd.read_csv('test.csv')

# Set the row identifiers and the training labels aside, then strip them
# from the feature frames so only predictors remain.
id_tr, id_te = data_tr["id"], data_te["id"]
y_tr = data_tr["target"]
data_tr = data_tr.drop(columns=["id", "target"])
data_te = data_te.drop(columns="id")

# Remove every 'calc' feature from both frames.
# These features are dropped because they do not show any significant
# impact on the target variable.
calc_features = [col for col in data_tr.columns if 'calc' in col]
data_tr = data_tr.drop(columns=calc_features)
data_te = data_te.drop(columns=calc_features)

In [None]:
# Ensemble parameters: one hyper-parameter set per LightGBM base model.
# Every set defines the same keys so the three models differ only in values.
lgb_params = {
    'learning_rate': 0.01,
    'n_estimators': 900,
    'max_bin': 25,
    'subsample': 0.9,
    'subsample_freq': 25,
    'colsample_bytree': 0.8,
    'min_child_samples': 600,
    'random_state': 6,
    'scale_pos_weight': 1,
    'min_child_weight': 0.001,
    'num_leaves': 31,
    'subsample_for_bin': 200000,
}

lgb_params1 = {
    'learning_rate': 0.01,
    'n_estimators': 700,
    'max_bin': 15,
    'subsample': 0.8,
    'subsample_freq': 15,
    'colsample_bytree': 0.9,
    'min_child_samples': 800,
    'random_state': 6,
    'scale_pos_weight': 3,
    'min_child_weight': 0.001,
    'num_leaves': 25,
    'subsample_for_bin': 200000,
}

lgb_params2 = {
    'learning_rate': 0.02,
    'n_estimators': 900,
    'max_bin': 20,
    'subsample': 0.8,
    'subsample_freq': 10,
    'colsample_bytree': 0.8,
    'min_child_samples': 600,
    'random_state': 6,
    'scale_pos_weight': 3,
    # Set explicitly on this dict: it must not be left out of the third set.
    'min_child_weight': 0.001,
    'num_leaves': 25,
    'subsample_for_bin': 200000,
}


# Instantiate one LGBMClassifier per parameter set, in the same order as the
# parameter dictionaries.
lgb_model, lgb_model1, lgb_model2 = (
    LGBMClassifier(**param_set)
    for param_set in (lgb_params, lgb_params1, lgb_params2)
)

In [None]:
def _load_feature_pair(path):
    '''
    -> Loads a pickled (train_features, test_features) pair from `path`.
    -> The file is closed as soon as it has been read.
    '''
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # that were produced by this pipeline.
    with open(path, 'rb') as handle:
        return pickle.load(handle, encoding='iso-8859-1')


def _run_pipeline(data_tr, data_te, y_tr) :
    '''
    -> Runs feature engineering, model based feature generation and the
       LightGBM ensemble, then writes the test predictions to Ensemble.csv.
    -> Returns the training predictions produced by the ensemble so callers
       can score them.
    '''
    #Performing all the feature engineering tasks and getting the final features
    fe.feature_engineering(data_tr, data_te, y_tr)
    #Generating model based features
    #This can take up to 24 hours
    mbf.ModelBasedFeatures(data_tr, data_te, y_tr)

    #Loading the feature engineered data and the labels
    print('Loading Train Data...')
    X = pd.read_csv('final_train.csv')
    print('Loading Test Data...')
    X_te = pd.read_csv('final_test.csv')
    print('Loading Labels...')
    y = pd.read_csv('labels.csv')

    #Keeping the test ids for the submission file before dropping them
    test_id = X_te['id']
    X = X.drop(['Unnamed: 0', 'id'], axis = 1)
    X_te = X_te.drop(['Unnamed: 0', 'id'], axis = 1)
    y = y.drop(['Unnamed: 0'], axis = 1)

    #Loading the model based features
    car_tr_fea, car_te_fea = _load_feature_pair("car_features.pk")
    ind_tr_fea, ind_te_fea = _load_feature_pair("ind_features.pk")
    reg_tr_fea, reg_te_fea = _load_feature_pair("reg_features.pk")

    #Appending the model based features column-wise to the engineered features
    fea_tr = np.concatenate((car_tr_fea, ind_tr_fea, reg_tr_fea), axis = 1)
    fea_te = np.concatenate((car_te_fea, ind_te_fea, reg_te_fea), axis = 1)
    X = pd.concat(objs=[X, pd.DataFrame(fea_tr)], axis = 1)
    X_te = pd.concat(objs=[X_te, pd.DataFrame(fea_te)], axis = 1)

    #Training the ensemble and predicting
    lgb_stack = Create_ensemble(n_splits = 5, base_models = [lgb_model, lgb_model1, lgb_model2])
    lgb_train_pred, lgb_test_pred = lgb_stack.predict(X, y, X_te)

    #The predictions for the inputs will be stored in Ensemble.csv
    #The per-model test predictions are averaged row-wise
    sub = pd.DataFrame()
    sub['id'] = test_id
    sub['target'] = lgb_test_pred.mean(axis=1)
    sub.to_csv('Ensemble.csv', float_format='%.6f', index=False)

    return lgb_train_pred


def final_fun_1(data_tr, data_te, y_tr) :
    '''
    -> This function includes entire pipeline, from data preprocessing to making final predictions.
    -> It takes raw data as input and writes the final test predictions to Ensemble.csv.
    -> It returns nothing.
    '''
    _run_pipeline(data_tr, data_te, y_tr)


def final_fun_2(data_tr, data_te, y_tr) :
    '''
    -> Runs the same pipeline as final_fun_1 (Ensemble.csv is written as well)
       and prints the Gini score of the ensemble's training predictions
       against y_tr.
    '''
    train_pred = np.asarray(_run_pipeline(data_tr, data_te, y_tr))
    #NOTE(review): assumes Create_ensemble.predict returns out-of-fold training
    #predictions with one column per base model, in the same row order as
    #y_tr - confirm against Modules.Ensemble.
    if train_pred.ndim > 1 :
        #Averaging across models, mirroring how the test predictions are combined
        train_pred = train_pred.mean(axis=1)
    #Printing final Gini Score
    print( "\nFinal Gini : {}".format(u.eval_gini(y_tr, train_pred)))