In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [35]:
!pip install scikit-learn
!pip install imblearn
!pip install xgboost



In [3]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

### 1. Identify which type of machine learning problem is it?

This is a bi-class classification problem as the Target variabe(dependent variable) values are either 0 or 1.

### 2. Please suggest models that the team may consider for the problem at hand. Suggest at least three models and discuss their pros and cons.

1. Logistic regression

Pros:-
It is a simple and efficient algorithm that is easy to implement and interpret.
It can handle both continuous and categorical independent variables.
It provides probability estimates for each class, which can be useful for decision-making.
It is robust to noise and outliers in the data.

Cons:
It assumes a linear relationship between the independent variables and the log-odds of the dependent variable.
It is sensitive to overfitting when the number of independent variables is large.
It cannot handle non-linear relationships between the independent variables and the dependent variable.

2. Random Forest 

Pros:
It is a powerful and versatile algorithm that can handle both classification and regression problems.
It can handle large datasets with high dimensionality.
It is less prone to overfitting than other decision tree algorithms.
It can handle missing data and outliers in the data.

Cons:
It can be computationally expensive for large datasets and may require a lot of memory.
It can be difficult to interpret and understand the relationships between the variables and the outcome.
It may not perform well when there are many irrelevant features in the data.

3. XG Boost 

Pros:
It is a fast and efficient algorithm that can handle large datasets with high dimensionality.
It can handle missing data and outliers in the data.
It can automatically handle feature selection and feature engineering.
It can produce accurate predictions and has been shown to perform well in machine learning competitions.

Cons:
It can be computationally expensive for large datasets and may require a lot of memory.
It can be prone to overfitting if the hyperparameters are not tuned properly.
It may not perform well when there are many irrelevant features in the data.

### 3. Use the train dataset along with the five new variables you engineered during the EDA. 

In [36]:
df= pd.read_csv('Final_preprocessed_data.csv')

In [37]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,...,AMT_REQ_CREDIT_BUREAU_YEAR,SK_ID_PREV,Delayed_days,TOTAL_AMT_CREDIT_x,TOTAL_AMT_CREDIT_y,Utilization,MAX_DPD,AMT_CREDIT_y,AMT_DOWN_PAYMENT,ACTUAL_CREDIT
0,0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,...,1.0,1.0,-20.421053,865055.565,108131.945625,,0.0,179055.0,0.0,179055.0
1,1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,...,0.0,3.0,-7.16,1017400.5,254350.125,,0.0,348637.5,0.0,348637.5
2,2,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,...,0.0,3.0,-7.16,1017400.5,254350.125,,0.0,68053.5,6885.0,61168.5
3,3,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,...,0.0,3.0,-7.16,1017400.5,254350.125,,0.0,1035882.0,,
4,4,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,...,0.0,1.0,-7.666667,189037.8,94518.9,,0.0,20106.0,4860.0,15246.0


In [38]:
print(df.shape)

(1430155, 132)


#### No of columns

The total no of features is 129 which consists of 123 original features and 5 engineered features during the EDA.

In [39]:
df.columns

Index(['Unnamed: 0', 'SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE',
       'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT_x',
       ...
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'SK_ID_PREV', 'Delayed_days',
       'TOTAL_AMT_CREDIT_x', 'TOTAL_AMT_CREDIT_y', 'Utilization', 'MAX_DPD',
       'AMT_CREDIT_y', 'AMT_DOWN_PAYMENT', 'ACTUAL_CREDIT'],
      dtype='object', length=132)

In [40]:
df = df.drop(columns=['Unnamed: 0', 'SK_ID_CURR'], axis = 1)

In [41]:
vars_na50 = [i for i in df.columns if (df[i].isnull().sum() / len(df.columns) * 100) > 50]
vars_na50

['AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'NAME_TYPE_SUITE',
 'OWN_CAR_AGE',
 'OCCUPATION_TYPE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_AVG',
 'BASEMENTAREA_AVG',
 'YEARS_BEGINEXPLUATATION_AVG',
 'YEARS_BUILD_AVG',
 'COMMONAREA_AVG',
 'ELEVATORS_AVG',
 'ENTRANCES_AVG',
 'FLOORSMAX_AVG',
 'FLOORSMIN_AVG',
 'LANDAREA_AVG',
 'LIVINGAPARTMENTS_AVG',
 'LIVINGAREA_AVG',
 'NONLIVINGAPARTMENTS_AVG',
 'NONLIVINGAREA_AVG',
 'APARTMENTS_MODE',
 'BASEMENTAREA_MODE',
 'YEARS_BEGINEXPLUATATION_MODE',
 'YEARS_BUILD_MODE',
 'COMMONAREA_MODE',
 'ELEVATORS_MODE',
 'ENTRANCES_MODE',
 'FLOORSMAX_MODE',
 'FLOORSMIN_MODE',
 'LANDAREA_MODE',
 'LIVINGAPARTMENTS_MODE',
 'LIVINGAREA_MODE',
 'NONLIVINGAPARTMENTS_MODE',
 'NONLIVINGAREA_MODE',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',

In [42]:
df = df.drop([col for col in vars_na50 if col in df.columns], axis=1)

### Identify the data preparation steps required

1. Data cleaning: This involves identifying and handling missing or incorrect values, dealing with duplicates, and removing irrelevant data.
2. Data transformation: This involves transforming the data so that it can be used effectively by machine learning algorithms. This includes converting categorical variables into numerical variables using one-hot encoding or label encoding, scaling numerical variables, and normalizing or standardizing the data.
3.Handling class imbalance: This involves addressing class imbalance if present in the data, where the number of examples in one class is much greater than the other. Techniques such as oversampling, undersampling, or generating synthetic samples can be used to balance the classes.

### 5. Set up preprocessor and estimators using column transformers and pipeline.  While setting up preprocessing make your pipeline robust so that if a new category shows up in future, the pipeline does not break. 

For columnns with greater than 50% Null Values: Our approach is to drop them

For numerical Null value columns: We are trying median imputation as our primary approach

For categorical Null value features: We are imouting them with most frequent values as our primary batch

In [43]:
cat_vars = [var for var in df.columns if df[var].dtype == 'O']
cat_vars

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE']

In [44]:
num_vars = [var for var in df.columns if df[var].dtype in ['int64', 'float64']]
num_vars

['TARGET',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT_x',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21']

In [45]:
df

Unnamed: 0,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT_x,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,...,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21
0,1,Cash loans,M,N,Y,0,202500.0,406597.5,Working,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
1,0,Cash loans,F,N,N,0,270000.0,1293502.5,State servant,Higher education,...,0,0,0,0,0,0,0,0,0,0
2,0,Cash loans,F,N,N,0,270000.0,1293502.5,State servant,Higher education,...,0,0,0,0,0,0,0,0,0,0
3,0,Cash loans,F,N,N,0,270000.0,1293502.5,State servant,Higher education,...,0,0,0,0,0,0,0,0,0,0
4,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,Working,Secondary / secondary special,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430150,0,Cash loans,F,N,N,0,157500.0,675000.0,Commercial associate,Higher education,...,0,0,0,0,0,0,0,0,0,0
1430151,0,Cash loans,F,N,N,0,157500.0,675000.0,Commercial associate,Higher education,...,0,0,0,0,0,0,0,0,0,0
1430152,0,Cash loans,F,N,N,0,157500.0,675000.0,Commercial associate,Higher education,...,0,0,0,0,0,0,0,0,0,0
1430153,0,Cash loans,F,N,N,0,157500.0,675000.0,Commercial associate,Higher education,...,0,0,0,0,0,0,0,0,0,0


In [46]:
df.shape

(1430155, 56)

In [47]:
#X= df.drop(columns=['TARGET'])
#y=df['TARGET']

In [48]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.pipeline import Pipeline
#from feature_engine.encoding import RareLabelEncoder
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier

In [49]:
def log_transform(X):
    return np.log(X)

In [50]:
from sklearn.base import TransformerMixin, BaseEstimator
class SparseToDenseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if issparse(X):
            return X.toarray()
        else:
            return X

In [51]:
# pipeline for left skewed numeric columns 
left_skew_pipeline  = Pipeline(steps=[('skewness', PowerTransformer(method='yeo-johnson')),('imputer', SimpleImputer(strategy='median'))])

# pipeline for right skewed numeric columns 
right_skew_pipeline  = Pipeline(steps=[('log', FunctionTransformer(log_transform)),('imputer', SimpleImputer(strategy='median'))])


# pipeline for numeric columns that need to be discretized 
num_pipe = Pipeline(steps = [("imp", SimpleImputer(strategy= "median", add_indicator= True)),
                              ("disc", KBinsDiscretizer(strategy= "equal_width", encode = "ordinal"))]) 


# pipeline for nominal categorical columns 
nom_cat_pipe = Pipeline(steps = [("imp", SimpleImputer(strategy= "constant", fill_value = "missing")), 
                                 ("ohe", OneHotEncoder(handle_unknown= 'ignore',sparse_output=False)),])  


# pipeline for ordinal categorical columns 
ord_cat_pipe = Pipeline(steps = [("imp", SimpleImputer(strategy= "most_frequent", add_indicator = True)), 
                                 ("ord", OrdinalEncoder())])  


numeric_pipeline= Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))])

# categorical_pipeline = Pipeline([
#     ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore',sparse_output=False))
# ])
# dense_transformer = Pipeline([
#     ("sparse_to_dense", SparseToDenseTransformer())
# ])

In [52]:
nom_cat_vars = ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE']
ord_cat_vars = ['FLAG_OWN_REALTY', 'FLAG_OWN_CAR', 'NAME_CONTRACT_TYPE', 'CODE_GENDER']


In [53]:
left_skew_num=['DAYS_REGISTRATION','DAYS_LAST_PHONE_CHANGE']
right_skew_num = ['AMT_INCOME_TOTAL','AMT_CREDIT_x']

In [54]:
num_rem = [col for col in num_vars if col not in left_skew_num and col not in right_skew_num]
num_rem

['TARGET',
 'CNT_CHILDREN',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_ID_PUBLISH',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_7',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_12',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21']

In [55]:
numeric_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = df.select_dtypes(include=['object']).columns.tolist()
remainder_features=[]

In [56]:
preprocessor = ColumnTransformer(transformers = [("nom", nom_cat_pipe, nom_cat_vars),
                                                 ("ord", ord_cat_pipe, ord_cat_vars), 
                                                 ('num', numeric_pipeline, num_rem),
                                                 #('cat', categorical_pipeline, categorical_features),
                                                 #("rare", rare_cat_pipe, rare_cat_vars), 
                                                 #("norm", num_pipe, num_vars),
                                                 ("left_skew", left_skew_pipeline, left_skew_num),
                                                 ("right_skew", right_skew_pipeline, right_skew_num)], remainder = "passthrough")


preprocessor.set_output(transform = "pandas")





In [57]:
preprocessor.fit(df)
df_processed = preprocessor.transform(df)

In [58]:
df_processed

Unnamed: 0,nom__NAME_INCOME_TYPE_Businessman,nom__NAME_INCOME_TYPE_Commercial associate,nom__NAME_INCOME_TYPE_Maternity leave,nom__NAME_INCOME_TYPE_Pensioner,nom__NAME_INCOME_TYPE_State servant,nom__NAME_INCOME_TYPE_Student,nom__NAME_INCOME_TYPE_Unemployed,nom__NAME_INCOME_TYPE_Working,nom__NAME_EDUCATION_TYPE_Academic degree,nom__NAME_EDUCATION_TYPE_Higher education,...,num__FLAG_DOCUMENT_16,num__FLAG_DOCUMENT_17,num__FLAG_DOCUMENT_18,num__FLAG_DOCUMENT_19,num__FLAG_DOCUMENT_20,num__FLAG_DOCUMENT_21,left_skew__DAYS_REGISTRATION,left_skew__DAYS_LAST_PHONE_CHANGE,right_skew__AMT_INCOME_TOTAL,right_skew__AMT_CREDIT_x
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.169883,-0.307837,12.218495,12.915579
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.007064,0.030436,11.119883,11.813030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430150,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468
1430151,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468
1430152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468
1430153,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468


In [59]:
df_processed.isnull().sum()

nom__NAME_INCOME_TYPE_Businessman             0
nom__NAME_INCOME_TYPE_Commercial associate    0
nom__NAME_INCOME_TYPE_Maternity leave         0
nom__NAME_INCOME_TYPE_Pensioner               0
nom__NAME_INCOME_TYPE_State servant           0
                                             ..
num__FLAG_DOCUMENT_21                         0
left_skew__DAYS_REGISTRATION                  0
left_skew__DAYS_LAST_PHONE_CHANGE             0
right_skew__AMT_INCOME_TOTAL                  0
right_skew__AMT_CREDIT_x                      0
Length: 140, dtype: int64

In [60]:
df_processed.columns = df_processed.columns.str.replace('nom__', '')
df_processed.columns = df_processed.columns.str.replace('ord__', '')
df_processed.columns = df_processed.columns.str.replace('num__', '')
df_processed.columns = df_processed.columns.str.replace('left_skew__', '')
df_processed.columns = df_processed.columns.str.replace('right_skew__', '')
df_processed.columns = df_processed.columns.str.replace('remainder__', '')

In [61]:
df_processed

Unnamed: 0,NAME_INCOME_TYPE_Businessman,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Maternity leave,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,NAME_INCOME_TYPE_Unemployed,NAME_INCOME_TYPE_Working,NAME_EDUCATION_TYPE_Academic degree,NAME_EDUCATION_TYPE_Higher education,...,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,DAYS_REGISTRATION,DAYS_LAST_PHONE_CHANGE,AMT_INCOME_TOTAL,AMT_CREDIT_x
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.169883,-0.307837,12.218495,12.915579
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.116672,0.015418,12.506177,14.072864
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.007064,0.030436,11.119883,11.813030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430150,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468
1430151,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468
1430152,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468
1430153,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.237219,0.063213,11.967181,13.422468


### Train - Test split

In [62]:
from sklearn.model_selection import train_test_split

In [63]:
df_processed = df_processed.sample(n=50000)

In [66]:
X= df_processed.drop(columns=['TARGET'])
y=df_processed['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


### 6. Choose three estimators to run. Set up relevant parameters and perform GridSearch CV for hyperparameter tuning. 

In [67]:
# logistic
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

logreg = LogisticRegression()
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
grid_search_log = GridSearchCV(logreg, param_grid=param_grid, cv=5, verbose=3)
grid_search_log.fit(X_train, y_train)

print("Best Hyperparameters: ", grid_search_log.best_params_)
print("Best Score: ", grid_search_log.best_score_)

# Best Hyperparameters:  {'C': 0.001, 'penalty': 'l2'}
# Best Score:  0.9139053928242348

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END .................C=0.001, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END .................C=0.001, penalty=l1;, score=nan total time=   0.0s
[CV 3/5] END .................C=0.001, penalty=l1;, score=nan total time=   0.0s
[CV 4/5] END .................C=0.001, penalty=l1;, score=nan total time=   0.0s
[CV 5/5] END .................C=0.001, penalty=l1;, score=nan total time=   0.0s
[CV 1/5] END ...............C=0.001, penalty=l2;, score=0.912 total time=   0.3s
[CV 2/5] END ...............C=0.001, penalty=l2;, score=0.912 total time=   0.4s
[CV 3/5] END ...............C=0.001, penalty=l2;, score=0.912 total time=   0.2s
[CV 4/5] END ...............C=0.001, penalty=l2;, score=0.912 total time=   0.2s
[CV 5/5] END ...............C=0.001, penalty=l2;, score=0.912 total time=   0.2s
[CV 1/5] END ..................C=0.01, penalty=l1;, score=nan total time=   0.0s
[CV 2/5] END ..................C=0.01, penalty=l

30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/karthikrpatil/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/karthikrpatil/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/karthikrpatil/opt/anaconda3/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'non

Best Hyperparameters:  {'C': 0.001, 'penalty': 'l2'}
Best Score:  0.9124285714285716


In [68]:
print("TEST SCORES LOGISTIC: ", grid_search_log.score(X_test, y_test))

TEST SCORES LOGISTIC:  0.9129333333333334


In [69]:
# randomforest
rf = RandomForestClassifier(random_state=42)

param_grid = {'n_estimators': [100, 300, 500],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 5, 10]}

grid_search_rf = GridSearchCV(rf, param_grid=param_grid, cv=5, verbose=3)

grid_search_rf.fit(X_train, y_train)

# Printing the best hyperparameters and best score
print("Best Hyperparameters: ", grid_search_rf.best_params_)
print("Best Score: ", grid_search_rf.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.912 total time=   1.7s
[CV 2/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.912 total time=   1.6s
[CV 3/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.912 total time=   1.6s
[CV 4/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.912 total time=   1.5s
[CV 5/5] END max_depth=5, min_samples_split=2, n_estimators=100;, score=0.912 total time=   1.5s
[CV 1/5] END max_depth=5, min_samples_split=2, n_estimators=300;, score=0.912 total time=   4.5s
[CV 2/5] END max_depth=5, min_samples_split=2, n_estimators=300;, score=0.912 total time=   4.4s
[CV 3/5] END max_depth=5, min_samples_split=2, n_estimators=300;, score=0.912 total time=   4.8s
[CV 4/5] END max_depth=5, min_samples_split=2, n_estimators=300;, score=0.912 total time=   4.6s
[CV 5/5] END max_depth=5, min_samples_split=2, n_estimators=300;,

[CV 5/5] END max_depth=10, min_samples_split=10, n_estimators=300;, score=0.912 total time=   7.6s
[CV 1/5] END max_depth=10, min_samples_split=10, n_estimators=500;, score=0.912 total time=  12.8s
[CV 2/5] END max_depth=10, min_samples_split=10, n_estimators=500;, score=0.912 total time=  12.9s
[CV 3/5] END max_depth=10, min_samples_split=10, n_estimators=500;, score=0.912 total time=  13.0s
[CV 4/5] END max_depth=10, min_samples_split=10, n_estimators=500;, score=0.912 total time=  13.5s
[CV 5/5] END max_depth=10, min_samples_split=10, n_estimators=500;, score=0.912 total time=  12.8s
[CV 1/5] END max_depth=15, min_samples_split=2, n_estimators=100;, score=0.913 total time=   3.6s
[CV 2/5] END max_depth=15, min_samples_split=2, n_estimators=100;, score=0.913 total time=   3.5s
[CV 3/5] END max_depth=15, min_samples_split=2, n_estimators=100;, score=0.913 total time=   3.5s
[CV 4/5] END max_depth=15, min_samples_split=2, n_estimators=100;, score=0.913 total time=   3.5s
[CV 5/5] END m

In [70]:
print("TEST SCORES RANDOM FOREST: ", grid_search_rf.score(X_test, y_test))

TEST SCORES RANDOM FOREST:  0.9131333333333334


In [71]:
# xgboost
import xgboost as xgb

xgb_model = xgb.XGBClassifier(random_state=42)

param_grid = {'max_depth': [3, 5, 7],
              'min_child_weight': [1, 3, 5],
              'learning_rate': [0.1, 0.01, 0.001]}

grid_search_xgb = GridSearchCV(xgb_model, param_grid=param_grid, cv=5, verbose=3)

grid_search_xgb.fit(X_train, y_train)

print("Best Hyperparameters: ", grid_search_xgb.best_params_)
print("Best Score: ", grid_search_xgb.best_score_)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=1;, score=0.912 total time=   3.2s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.1s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.1s
[CV 4/5] END learning_rate=0.1, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.1s
[CV 5/5] END learning_rate=0.1, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.3s
[CV 1/5] END learning_rate=0.1, max_depth=3, min_child_weight=3;, score=0.912 total time=   4.6s
[CV 2/5] END learning_rate=0.1, max_depth=3, min_child_weight=3;, score=0.912 total time=   4.9s
[CV 3/5] END learning_rate=0.1, max_depth=3, min_child_weight=3;, score=0.912 total time=   5.1s
[CV 4/5] END learning_rate=0.1, max_depth=3, min_child_weight=3;, score=0.912 total time=   4.3s
[CV 5/5] END learning_rate=0.1, max_depth=3, min_child_weight=3;,

[CV 5/5] END learning_rate=0.01, max_depth=7, min_child_weight=3;, score=0.912 total time=   9.7s
[CV 1/5] END learning_rate=0.01, max_depth=7, min_child_weight=5;, score=0.912 total time=   9.5s
[CV 2/5] END learning_rate=0.01, max_depth=7, min_child_weight=5;, score=0.912 total time=   9.7s
[CV 3/5] END learning_rate=0.01, max_depth=7, min_child_weight=5;, score=0.912 total time=  10.1s
[CV 4/5] END learning_rate=0.01, max_depth=7, min_child_weight=5;, score=0.912 total time=   9.5s
[CV 5/5] END learning_rate=0.01, max_depth=7, min_child_weight=5;, score=0.912 total time=   9.6s
[CV 1/5] END learning_rate=0.001, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.5s
[CV 2/5] END learning_rate=0.001, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.1s
[CV 3/5] END learning_rate=0.001, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.6s
[CV 4/5] END learning_rate=0.001, max_depth=3, min_child_weight=1;, score=0.912 total time=   4.6s
[CV 5/5] END lea

In [72]:
print("TEST SCORES XGBOOST: ", grid_search_xgb.score(X_test, y_test))

TEST SCORES XGBOOST:  0.9134666666666666


Summary

In [73]:
#Logistic
print("Scores for LOGISTIC")
print("Train Score LOGISTIC: ", grid_search_log.best_score_)
print("Test Score LOGISTIC: ", grid_search_log.score(X_test, y_test))
print("---"* 50)

#RandomForest
print("Scores for RANDOM FOREST")
print("Train Score RANDOM FOREST: ", grid_search_rf.best_score_)
print("Test Score RANDOM FOREST: ", grid_search_rf.score(X_test, y_test))
print("---"* 50)

#XGboost
print("Scores for XGBOOST")
print("Train Score XGBOOST: ", grid_search_xgb.best_score_)
print("Test Score XGBOOST: ", grid_search_xgb.score(X_test, y_test))
print("---"* 50)

Scores for LOGISTIC
Train Score LOGISTIC:  0.9124285714285716
Test Score LOGISTIC:  0.9129333333333334
------------------------------------------------------------------------------------------------------------------------------------------------------
Scores for RANDOM FOREST
Train Score RANDOM FOREST:  0.913
Test Score RANDOM FOREST:  0.9131333333333334
------------------------------------------------------------------------------------------------------------------------------------------------------
Scores for XGBOOST
Train Score XGBOOST:  0.9128000000000001
Test Score XGBOOST:  0.9134666666666666
------------------------------------------------------------------------------------------------------------------------------------------------------


### 7a.  Suggest the metric(s) your team wants to use to compare the models. Justify your logic based on business goals.If needed, define your own cost function and pass it under "scoring" parameter in GridsearchCV(). 

Metrics like Accuracy, F1 Score, Precision, Recall & Confusion matrix can be used to evaluate the various models we intend to perform. Since, the problem revolves around a supervised classification problem, models like Logistic regression, Random Forest, XG boost can be used in this business context.  

In [74]:
def custom_cost_function(y_true, y_pred):
    # Define the costs associated with different types of errors
    false_positive_cost = 100
    false_negative_cost = 1000
    
    # Compute the number of false positives and false negatives
    false_positives = np.sum(np.logical_and(y_true == 0, y_pred == 1))
    false_negatives = np.sum(np.logical_and(y_true == 1, y_pred == 0))
    
    # Compute the total cost
    total_cost = false_positive_cost * false_positives + false_negative_cost * false_negatives
    
    # Return the negative of the total cost, as GridSearchCV tries to maximize the score
    return -total_cost


###7b. Fit three models, and report the performance of the best Gridsearch CV identified model for each of these three. 

#### Logistic Regression

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [82]:
#Predicting on the best fit model
y_pred_logistic = grid_search_log.predict(X_test)



#Evaluation Metrics
logistic_accuracy = accuracy_score(y_test, y_pred_logistic)
logistic_f1 = f1_score(y_test, y_pred_logistic)
logistic_precision = precision_score(y_test, y_pred_logistic)
logistic_recall = recall_score(y_test, y_pred_logistic)
logistic_conf_matrix = confusion_matrix(y_test, y_pred_logistic)
print('Logistic Regression - Best Parameters:', grid_search_log.best_params_)
print('Accuracy:', logistic_accuracy)
print('F1 Score:', logistic_f1)
print('Precision:', logistic_precision)
print('Recall:', logistic_recall)
print('Confusion Matrix:\n', logistic_conf_matrix)

Logistic Regression - Best Parameters: {'C': 0.001, 'penalty': 'l2'}
Accuracy: 0.9129333333333334
F1 Score: 0.0
Precision: 0.0
Recall: 0.0
Confusion Matrix:
 [[13694     0]
 [ 1306     0]]


  _warn_prf(average, modifier, msg_start, len(result))


#### Randomn Forest

In [83]:
from sklearn.metrics import *

In [84]:
#Predicting on the best fit model
y_pred_rf = grid_search_rf.predict(X_test)

#Evaluation Metrics
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_conf_matrix = confusion_matrix(y_test, y_pred_rf)
print('Random Forest - Best Parameters:', grid_search_rf.best_params_)
print('Accuracy:', rf_accuracy)
print('F1 Score:', rf_f1)
print('Precision:', rf_precision)
print('Recall:', rf_recall)
print('Confusion Matrix:\n', rf_conf_matrix)


Random Forest - Best Parameters: {'max_depth': 15, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.9131333333333334
F1 Score: 0.004583651642475172
Precision: 1.0
Recall: 0.002297090352220521
Confusion Matrix:
 [[13694     0]
 [ 1303     3]]


#### XGBoost

In [85]:
#Predicting on the best fit model
y_pred_xgb = grid_search_xgb.predict(X_test)

#Evaluation Metrics
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_precision = precision_score(y_test, y_pred_xgb)
xgb_recall = recall_score(y_test, y_pred_xgb)
xgb_conf_matrix = confusion_matrix(y_test, y_pred_xgb)
print('XGBoost - Best Parameters:', grid_search_xgb.best_params_)
print('Accuracy:', xgb_accuracy)
print('F1 Score:', xgb_f1)
print('Precision:', xgb_precision)
print('Recall:', xgb_recall)
print('Confusion Matrix:\n', xgb_conf_matrix)


XGBoost - Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1}
Accuracy: 0.9134666666666666
F1 Score: 0.015174506828528072
Precision: 0.8333333333333334
Recall: 0.007656967840735069
Confusion Matrix:
 [[13692     2]
 [ 1296    10]]


In [95]:
y_pred_logistic_df=pd.DataFrame(y_pred_logistic)
y_pred_logistic_df.to_csv('logistic_predictions.csv')

In [96]:
y_pred_rf_df=pd.DataFrame(y_pred_rf)
y_pred_rf_df.to_csv('Random_forest_predictions.csv')

In [97]:
y_pred_xgb_df=pd.DataFrame(y_pred_xgb)
y_pred_xgb_df.to_csv('XG_boost_predictions.csv')

In [99]:
X_test.to_csv('Test_X.csv')

In [101]:
y_test.to_csv('Test_Y.csv')

In [86]:
import pickle

In [105]:

f = open("grid_search_log.pkl","wb")
pickle.dump(grid_search_log, f)

In [106]:
f1 = open("grid_search_rf.pkl","wb")
pickle.dump(grid_search_rf, f1)

In [107]:
f1 = open("grid_search_xgb.pkl","wb")
pickle.dump(grid_search_xgb, f1)