In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

# 1. Identify which type of machine learning problem is it?

This is a bi-class classification problem as the Target variabe(dependent variable) values are either 0 or 1.

# 2. Please suggest models that the team may consider for the problem at hand. Suggest at least three models and discuss their pros and cons.

1. Logistic regression

Pros:-
It is a simple and efficient algorithm that is easy to implement and interpret.
It can handle both continuous and categorical independent variables.
It provides probability estimates for each class, which can be useful for decision-making.
It is robust to noise and outliers in the data.

Cons:
It assumes a linear relationship between the independent variables and the log-odds of the dependent variable.
It is sensitive to overfitting when the number of independent variables is large.
It cannot handle non-linear relationships between the independent variables and the dependent variable.

2. Random Forest 

Pros:
It is a powerful and versatile algorithm that can handle both classification and regression problems.
It can handle large datasets with high dimensionality.
It is less prone to overfitting than other decision tree algorithms.
It can handle missing data and outliers in the data.

Cons:
It can be computationally expensive for large datasets and may require a lot of memory.
It can be difficult to interpret and understand the relationships between the variables and the outcome.
It may not perform well when there are many irrelevant features in the data.

3. XG Boost 

Pros:
It is a fast and efficient algorithm that can handle large datasets with high dimensionality.
It can handle missing data and outliers in the data.
It can automatically handle feature selection and feature engineering.
It can produce accurate predictions and has been shown to perform well in machine learning competitions.

Cons:
It can be computationally expensive for large datasets and may require a lot of memory.
It can be prone to overfitting if the hyperparameters are not tuned properly.
It may not perform well when there are many irrelevant features in the data.

# 3. Use the train dataset along with the five new variables you engineered during the EDA. 

In [73]:
df= pd.read_csv('Final_preprocessed_data.csv')

In [74]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,...,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_PREV,Delayed_days,TOTAL_AMT_CREDIT_x,TOTAL_AMT_CREDIT_y,Utilization,MAX_DPD,AMT_CREDIT_y,AMT_DOWN_PAYMENT,ACTUAL_CREDIT
0,0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,...,1.0,1.0,-20.421053,865055.565,108131.945625,,0.0,179055.0,0.0,179055.0
1,1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,...,0.0,3.0,-7.16,1017400.5,254350.125,,0.0,348637.5,0.0,348637.5
2,2,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,...,0.0,3.0,-7.16,1017400.5,254350.125,,0.0,68053.5,6885.0,61168.5
3,3,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,...,0.0,3.0,-7.16,1017400.5,254350.125,,0.0,1035882.0,,
4,4,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,...,0.0,1.0,-7.666667,189037.8,94518.9,,0.0,20106.0,4860.0,15246.0


# No of columns

The total no of features is 129 which consists of 123 original features and 5 engineered features during the EDA.

In [75]:
df.columns

Index(['Unnamed: 0', 'SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE',
       'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT_x',
       ...
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'SK_ID_PREV', 'Delayed_days',
       'TOTAL_AMT_CREDIT_x', 'TOTAL_AMT_CREDIT_y', 'Utilization', 'MAX_DPD',
       'AMT_CREDIT_y', 'AMT_DOWN_PAYMENT', 'ACTUAL_CREDIT'],
      dtype='object', length=132)

In [76]:
df.drop(columns=['SK_ID_CURR'], inplace=True)

# Identify the data preparation steps required

1. Data cleaning: This involves identifying and handling missing or incorrect values, dealing with duplicates, and removing irrelevant data.
2. Data transformation: This involves transforming the data so that it can be used effectively by machine learning algorithms. This includes converting categorical variables into numerical variables using one-hot encoding or label encoding, scaling numerical variables, and normalizing or standardizing the data.
3.Handling class imbalance: This involves addressing class imbalance if present in the data, where the number of examples in one class is much greater than the other. Techniques such as oversampling, undersampling, or generating synthetic samples can be used to balance the classes.

# 5. Set up preprocessor and estimators using column transformers and pipeline.  While setting up preprocessing make your pipeline robust so that if a new category shows up in future, the pipeline does not break. 

For columnns with greater than 50% Null Values: Our approach is to drop them

For numerical Null value columns: We are trying median imputation as our primary approach

For categorical Null value features: We are imouting them with most frequent values as our primary batch

In [77]:
cat_vars = [var for var in df.columns if df[var].dtype == 'O']
cat_vars

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE',
 'FONDKAPREMONT_MODE',
 'HOUSETYPE_MODE',
 'WALLSMATERIAL_MODE',
 'EMERGENCYSTATE_MODE']

In [78]:
cat_vars_na50 = [i for i in df[cat_vars].columns if (df[i].isnull().sum() / len(df[cat_vars]) * 100) > 50]
cat_vars_na50

['FONDKAPREMONT_MODE', 'WALLSMATERIAL_MODE']

In [79]:
num_vars = [var for var in df.columns if df[var].dtype in ['int64', 'float64']]
num_vars

['Unnamed: 0',
 'TARGET',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT_x',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENT

In [80]:
num_vars_na50 = [i for i in df[num_vars].columns if (df[i].isnull().sum() / len(df[num_vars]) * 100) > 50]
num_vars_na50

['OWN_CAR_AGE',
 'EXT_SOURCE_1',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'Utilization',
 'AMT_DOWN_PAYMENT',
 'ACTUAL_CREDIT']

In [81]:
df.drop(columns=['OWN_CAR_AGE',
 'EXT_SOURCE_1',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'Utilization',
 'AMT_DOWN_PAYMENT',
 'ACTUAL_CREDIT'], inplace=True)

In [82]:
df.drop(columns=['FONDKAPREMONT_MODE', 'WALLSMATERIAL_MODE'], inplace=True)

In [97]:
df

Unnamed: 0.1,Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_PREV,Delayed_days,TOTAL_AMT_CREDIT_x,TOTAL_AMT_CREDIT_y,MAX_DPD,AMT_CREDIT_y
0,0,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,1.0,1.0,-20.421053,865055.565,108131.945625,0.0,179055.0
1,1,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,3.0,-7.160000,1017400.500,254350.125000,0.0,348637.5
2,2,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,3.0,-7.160000,1017400.500,254350.125000,0.0,68053.5
3,3,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,3.0,-7.160000,1017400.500,254350.125000,0.0,1035882.0
4,4,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,1.0,-7.666667,189037.800,94518.900000,0.0,20106.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430150,1430150,0,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,0.0,2.0,0.0,1.0,8.0,-8.067568,3801919.500,345629.045455,5.0,491580.0
1430151,1430151,0,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,0.0,2.0,0.0,1.0,8.0,-8.067568,3801919.500,345629.045455,5.0,254340.0
1430152,1430152,0,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,0.0,2.0,0.0,1.0,8.0,-8.067568,3801919.500,345629.045455,5.0,45000.0
1430153,1430153,0,Cash loans,F,N,N,0,157500.0,675000.0,49117.5,...,0.0,2.0,0.0,1.0,8.0,-8.067568,3801919.500,345629.045455,5.0,1067940.0


In [98]:
df.shape

(1430155, 94)

In [85]:
#X= df.drop(columns=['TARGET'])
#y=df['TARGET']

In [99]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.pipeline import Pipeline
from feature_engine.encoding import RareLabelEncoder
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier

In [100]:
def log_transform(X):
    return np.log(X)

In [101]:
from sklearn.base import TransformerMixin, BaseEstimator
class SparseToDenseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if issparse(X):
            return X.toarray()
        else:
            return X

In [102]:
# pipeline for left skewed numeric columns 
left_skew_pipeline  = Pipeline(steps=[('skewness', PowerTransformer(method='yeo-johnson')),('imputer', SimpleImputer(strategy='median'))])

# pipeline for right skewed numeric columns 
right_skew_pipeline  = Pipeline(steps=[('log', FunctionTransformer(log_transform)),('imputer', SimpleImputer(strategy='median'))])


# pipeline for numeric columns that need to be discretized 
num_pipe = Pipeline(steps = [("imp", SimpleImputer(strategy= "median", add_indicator= True)),
                              ("disc", KBinsDiscretizer(strategy= "equal_width", encode = "ordinal"))]) 


# pipeline for nominal categorical columns 
nom_cat_pipe = Pipeline(steps = [("imp", SimpleImputer(strategy= "constant", fill_value = "missing")), 
                                 ("ohe", OneHotEncoder(handle_unknown= 'ignore',sparse_output=False)),])  


# pipeline for ordinal categorical columns 
ord_cat_pipe = Pipeline(steps = [("imp", SimpleImputer(strategy= "most_frequent", add_indicator = True)), 
                                 ("ord", OrdinalEncoder())])  


numeric_pipeline= Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',sparse_output=False))
])
dense_transformer = Pipeline([
    ("sparse_to_dense", SparseToDenseTransformer())
])

In [103]:
nom_cat_vars = ['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'EMERGENCYSTATE_MODE']
ord_cat_vars = ['FLAG_OWN_REALTY', 'FLAG_OWN_CAR', 'NAME_CONTRACT_TYPE', 'CODE_GENDER']


In [104]:
left_skew_num=['DAYS_REGISTRATION','DAYS_LAST_PHONE_CHANGE']
right_skew_num = ['AMT_INCOME_TOTAL','AMT_CREDIT_x', 'AMT_ANNUITY']

In [105]:
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
remainder_features=[]

In [107]:
preprocessor = ColumnTransformer(transformers = [("nom", nom_cat_pipe, nom_cat_vars),
                                                 ("ord", ord_cat_pipe, ord_cat_vars), 
                                                 ('num', numeric_pipeline, numeric_features),
                                                 ('cat', categorical_pipeline, categorical_features),
                                                 #("rare", rare_cat_pipe, rare_cat_vars), 
                                                 #("norm", num_pipe, num_vars),
                                                 ("left_skew", left_skew_pipeline, left_skew_num),
                                                 ("right_skew", right_skew_pipeline, right_skew_num)], remainder = "passthrough")


preprocessor.set_output(transform = "pandas")





In [108]:
preprocessor.fit(df)
df_processed = preprocessor.transform(df)

In [113]:
df_processed

Unnamed: 0,nom__NAME_TYPE_SUITE_Children,nom__NAME_TYPE_SUITE_Family,nom__NAME_TYPE_SUITE_Group of people,nom__NAME_TYPE_SUITE_Other_A,nom__NAME_TYPE_SUITE_Other_B,"nom__NAME_TYPE_SUITE_Spouse, partner",nom__NAME_TYPE_SUITE_Unaccompanied,nom__NAME_TYPE_SUITE_missing,nom__NAME_INCOME_TYPE_Businessman,nom__NAME_INCOME_TYPE_Commercial associate,...,cat__HOUSETYPE_MODE_specific housing,cat__HOUSETYPE_MODE_terraced house,cat__EMERGENCYSTATE_MODE_No,cat__EMERGENCYSTATE_MODE_Yes,cat__EMERGENCYSTATE_MODE_missing,left_skew__DAYS_REGISTRATION,left_skew__DAYS_LAST_PHONE_CHANGE,right_skew__AMT_INCOME_TOTAL,right_skew__AMT_CREDIT_x,right_skew__AMT_ANNUITY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.169883,-0.307837,12.218495,12.915579,10.114579
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864,10.482864
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864,10.482864
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864,10.482864
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,-0.007064,0.030436,11.119883,11.813030,8.817298
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430150,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468,10.801971
1430151,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468,10.801971
1430152,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468,10.801971
1430153,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468,10.801971


In [115]:
df_processed.isnull().sum()

nom__NAME_TYPE_SUITE_Children           0
nom__NAME_TYPE_SUITE_Family             0
nom__NAME_TYPE_SUITE_Group of people    0
nom__NAME_TYPE_SUITE_Other_A            0
nom__NAME_TYPE_SUITE_Other_B            0
                                       ..
left_skew__DAYS_REGISTRATION            0
left_skew__DAYS_LAST_PHONE_CHANGE       0
right_skew__AMT_INCOME_TOTAL            0
right_skew__AMT_CREDIT_x                0
right_skew__AMT_ANNUITY                 0
Length: 342, dtype: int64