In [1]:
import pandas as pd
import numpy as np
import io
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, log_loss, roc_auc_score
!pip install category_encoders
from category_encoders import OneHotEncoder



You are using pip version 18.1, however version 19.2.2 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [2]:
df = pd.read_csv('Demo_Lending_Club.csv')

In [3]:
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [4]:
# Custom function for initial checks
def DF_initial_observations(df):
    '''Gives basic details of columns in a dataframe : Data types, distinct values, NAs and sample'''
    if isinstance(df, pd.DataFrame):
        total_na= df.isna().sum().sum()
        print('Dimensions : %d rows, %d columns' % (df.shape[0],df.shape[1]))
        print("Total NA values : %d" % (total_na))
        print('%38s %10s     %10s %10s' % ('Column name', ' Data Type', '# Distinct', ' NA values'))
        col_name = df.columns
        dtyp = df.dtypes
        uniq = df.nunique()
        na_val = df.isna().sum()
        for i in range(len(df.columns)):
            print('%38s %10s :   %10d  %10d' % (col_name[i],dtyp[i],uniq[i],na_val[i]))
    else:
        print('Expected a DataFrame but got a %15s ' % (type(df)))

In [5]:
DF_initial_observations(df)

Dimensions : 10000 rows, 24 columns
Total NA values : 15560
                           Column name  Data Type     # Distinct  NA values
                                    Id      int64 :        10000           0
                                is_bad      int64 :            2           0
                            emp_length     object :           14           0
                        home_ownership     object :            5           0
                            annual_inc    float64 :         1901           1
                   verification_status     object :            3           0
                            pymnt_plan     object :            2           0
                           purpose_cat     object :           27           0
                              zip_code     object :          720           0
                            addr_state     object :           50           0
                        debt_to_income    float64 :         2585           0
                 

In [16]:
class Modeller:
    ''' <desc> '''
    def __init__(self):
        self.num_cols = [] 
        self.cat_cols = [] 
        self.model = []
        self.params = []
        self.cat_encoder = []
    
    def basic_preprocessing(self, df):
        # In attributes like mths_since_last_delinq and mths_since_last_record, absence of data speaks for the non-occurence 
        # of the acivity (indicating the same by a negative value, -99)
        df['mths_since_last_delinq'] = df['mths_since_last_delinq'].fillna(-99)
        df['mths_since_last_record'] = df['mths_since_last_record'].fillna(-99)

        # coerce emp_length to numeric
        df['emp_length'] = pd.to_numeric(df['emp_length'], errors='coerce')

        # Extract zip code and convert attribute to numeric
        df['zip_code'] = df['zip_code'].str.replace('zip_code_','')
        df['zip_code'] = df['zip_code'].str.replace('xx','')
        df['zip_code'] = pd.to_numeric(df['zip_code'], downcast='integer')
        
        self.num_cols = list(df.select_dtypes(exclude=['object']).columns)
        self.cat_cols = list(df.select_dtypes(include=['object']).columns)

        n = pd.DataFrame(df.isna().sum())
        miss_cols = list(n[n[0]>0].index)
        miss_num = [*set(miss_cols).intersection(set(self.num_cols))]
        miss_cat = [*set(miss_cols).intersection(set(self.cat_cols))]

        for i in miss_num:
            df[i] = df[i].fillna(df[i].median())
        for j in miss_cat:
            df[j] = df[j].fillna(df[j].mode()[0])
        
        return df
    
    def fit_encoder(self, df):
        ohe = OneHotEncoder()
        df_cat = ohe.fit_transform(df[self.cat_cols])
        #df_cat = pd.get_dummies(df[self.cat_cols])
        self.cat_encoder = ohe
        df = pd.concat([df_cat, df[self.num_cols]], axis=1)
        return df
    
    def transform_encoder(self, df):
        ohe = self.cat_encoder
        df_cat = ohe.transform(df[self.cat_cols])
        df = pd.concat([df_cat, df[self.num_cols]], axis=1)
        return df
    
    def fit(self, X, y):
        X = self.basic_preprocessing(X)
        X = self.fit_encoder(X)
        if self.params == []:
            print('Hyperparameter tuning is due, please try after tuning the parameters')
        else:
            xg = XGBClassifier(**self.params)
            xg.fit(X,y)
            self.model = xg

    def tune_parameters(self, X, y):
        params = {'reg_alpha':[10,50,100],
                  'reg_lambda':[10,50,100],
                  'learning_rate':[0.01,0.1,0.3],'n_estimators':[10,20,50]}
        xg = XGBClassifier(booster='dart', colsample_bylevel=1, n_jobs=-1, 
                           objective='binary:logistic',eval_metric='logloss', sub_sample=0.8)
        grid = GridSearchCV(estimator=xg, param_grid=params, scoring='roc_auc', cv=3, n_jobs=4)
        
        X = self.basic_preprocessing(X)
        X = self.fit_encoder(X)
        if X.isna().sum().sum()==0:
            grid.fit(X, y)
            self.params = grid.best_params_
        else:
            print('pre-processing is not done')
        
        return grid.cv_results_
    
    def predict(self, X):
        if self.model == []:
            print('fit the model before predict')
        else:
            X = self.basic_preprocessing(X)
            X = self.transform_encoder(X)
            pred = self.model.predict(X)
        return pred

    def predict_proba(self, X):
        if self.model == []:
            print('fit the model before predict')
        else:
            X = self.basic_preprocessing(X)
            X = self.transform_encoder(X)
            pred_prob = self.model.predict_proba(X)
        return pred_prob

    def evaluate(self, X, y):
        pred = self.predict_proba(X)[1]
        f1 = f1_score(y, pred)
        ll = log_loss(y, pred)

        dt = {'f1_score': f1, 'log_loss': ll}
        return dt

In [11]:
pred_attr = list(set(df.columns)-set(['id','is_bad']))

In [21]:
ml = Modeller()

In [None]:
ml.tune_parameters(X=df[pred_attr], y=df['is_bad'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

In [None]:
ml.fit(X=df[pred_attr], y=df['is_bad'])

In [None]:
y_pred = ml.predict(X=df[pred_attr])

In [None]:
ml.evaluate(X=df[pred_attr], y=y)