In [None]:
# Fetch the ds_cup repository. Presumably only needed when the notebook runs outside a
# local checkout (the commented-out 'ds_cup/orig_data' path in the loading cell points
# into this clone) — TODO confirm.
!git clone https://github.com/KushajveerSingh/ds_cup

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, recall_score,precision_score
# Machine epsilon of the default float type: the gap between 1.0 and the next
# representable float. This is NOT the smallest positive float. It is used later as a
# "just above zero" bin threshold for credit_past_due_amount.
smallestF =  np.finfo('float').eps # machine epsilon (numpy), will be used later

In [None]:
# Directory with the raw CSV splits, relative to the notebook's working directory.
path = Path('../data_orig')
# Alternative location, presumably for a run against the cloned ds_cup repository:
# path = Path('ds_cup/orig_data')
# Read the three splits in one pass; the order of the targets matches the order of the files.
orig_train_df, orig_valid_df, orig_test_df = map(
    pd.read_csv,
    (path/'train.csv', path/'valid.csv', path/'test.csv'),
)

In [None]:
# Normalise the one column whose name carries a stray space so that every split exposes
# 'auto_open_36_month_num'. rename() leaves a frame untouched if the old name is absent.
orig_train_df = orig_train_df.rename(columns={'auto_open_ 36_month_num': 'auto_open_36_month_num'})
orig_valid_df = orig_valid_df.rename(columns={'auto_open_ 36_month_num': 'auto_open_36_month_num'})
orig_test_df = orig_test_df.rename(columns={'auto_open_ 36_month_num': 'auto_open_36_month_num'})

In [None]:
def print_size(df, name):
    """Print how many rows of ``df`` fall into each ``Default_ind`` class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Default_ind`` column with 0/1 labels.
    name : str
        Label used in the header line, e.g. ``'train'``.

    Notes
    -----
    A class that does not occur in ``df`` is reported as 0 rather than raising
    ``KeyError``.
    """
    counts = df['Default_ind'].value_counts()
    print(f'{name} dataset')
    # value_counts() omits labels with no rows, hence .get(label, 0).
    print(f'Num 0 values = {counts.get(0, 0)}')
    print(f'Num 1 values = {counts.get(1, 0)}')
    print()

In [None]:
# Class balance of each raw split, reported in train / valid / test order.
for _frame, _label in (
    (orig_train_df, 'train'),
    (orig_valid_df, 'valid'),
    (orig_test_df, 'test'),
):
    print_size(_frame, _label)

In [None]:
# Per-column count of missing values for each raw split, each followed by a '--' separator.
for _split in (orig_train_df, orig_valid_df, orig_test_df):
    print(_split.isnull().sum())
    print('--')

In [None]:
def createThreshDict(colName,Arr):
    """Bundle a column name and its threshold array into one record.

    Returns a dict with the keys ``'colName'`` and ``'ThreshArr'``, in that order;
    the values are stored as passed, without copying.
    """
    return {'colName': colName, 'ThreshArr': Arr}
def binColumn(col,threshArr):
    """Map every value of ``col`` to an integer bin code.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to bin. Must contain at least one non-null value.
    threshArr : array-like of float
        Upper edges of the bins. They are sorted here, so any order is accepted.
        Assumed to be free of NaN.

    Returns
    -------
    pandas.Series
        int64 codes on the same index as ``col``:
        ``0`` for missing values, ``k`` (1-based) for a value that is ``<=`` the
        k-th smallest threshold and ``>`` the previous one, and
        ``len(threshArr) + 1`` for values above every threshold.

    Raises
    ------
    AssertionError
        If every value in ``col`` is null (this includes an empty column).
    """
    assert not col.isnull().all(), "All values are null?"
    thresholds = np.sort(np.asarray(threshArr, dtype=float))
    values = np.asarray(col, dtype=float)
    # side='left' counts the thresholds strictly below each value, so a value equal
    # to a threshold falls into that threshold's bin (the "val <= x" rule).
    codes = np.searchsorted(thresholds, values, side='left') + 1
    # Missing values get their own bin 0 (searchsorted would place NaN last).
    codes[np.isnan(values)] = 0
    return pd.Series(codes.astype('int64'), index=col.keys())

In [None]:
def trainDataBinClean(DF):
    """Bin the numeric columns of the training frame and one-hot encode ``States``.

    Bin thresholds are learnt from ``DF`` itself and returned, so that the same cut
    points can be replayed on other splits with ``binDataFrameWithThresh``.

    Parameters
    ----------
    DF : pandas.DataFrame
        Raw training data. Must contain every column named in this function,
        including ``States`` and ``Default_ind``.

    Returns
    -------
    DF_Cleaned : pandas.DataFrame
        Frame with ``DF``'s column layout minus ``States``, preceded by one
        ``is<STATE>`` indicator column per state seen in ``DF``. Binned columns
        hold the codes produced by ``binColumn``; count/flag columns and
        ``Default_ind`` are copied unchanged. Any column of ``DF`` that this
        function does not handle stays in the output but is left unfilled.
    DFThresh : pandas.DataFrame
        One row per binned column, with the fields ``colName`` and ``ThreshArr``,
        in the order in which the columns are binned below.
    """
    quantile_levels = [0.2, 0.4, 0.6, 0.8]
    DF_Cleaned = pd.DataFrame(columns=DF.columns)
    # Threshold records are collected in a list and turned into a frame once at the
    # end; DataFrame.append, used for this before, no longer exists in pandas >= 2.0.
    thresh_records = []

    def _quantile_thresholds(series):
        """Return the 20/40/60/80 % quantiles of ``series`` as an array."""
        return series.quantile(quantile_levels).to_numpy()

    def _record_and_bin(col_name, thresh):
        """Remember ``thresh`` for ``col_name`` and write the binned column."""
        thresh_records.append(createThreshDict(col_name, thresh))
        DF_Cleaned[col_name] = binColumn(DF[col_name], thresh)

    def _copy_through(col_names):
        """Copy the given columns from ``DF`` without binning."""
        for col_name in col_names:
            DF_Cleaned[col_name] = DF[col_name].copy()

    #--'tot_credit_debt'--
    _record_and_bin('tot_credit_debt', _quantile_thresholds(DF['tot_credit_debt']))
    #--'avg_card_debt'--
    # Quantiles are taken only over values below the cutoff, and the cutoff itself is
    # the last edge, so values above it get a bin of their own. Presumably 99999.0 is
    # a sentinel and 13167.82502 the largest genuine value — TODO confirm.
    cutoff_avg_card_debt = (13167.825020 + 99999.0)/2
    below_cutoff = DF.loc[DF['avg_card_debt'] < cutoff_avg_card_debt, 'avg_card_debt']
    _record_and_bin('avg_card_debt',
                    np.append(_quantile_thresholds(below_cutoff), [cutoff_avg_card_debt]))
    #--'credit_age', 'credit_good_age', 'card_age'--
    for _col in ('credit_age', 'credit_good_age', 'card_age'):
        _record_and_bin(_col, _quantile_thresholds(DF[_col]))
    #--past-due counters are kept as they are--
    _copy_through(('non_mtg_acc_past_due_12_months_num',
                   'non_mtg_acc_past_due_6_months_num',
                   'mortgages_past_due_6_months_num'))
    #--'credit_past_due_amount'--
    # Quantiles are taken over the strictly positive amounts; the extra first edge
    # just above zero gives the zero amounts a bin of their own.
    positive_past_due = DF.loc[DF['credit_past_due_amount'] > 0, 'credit_past_due_amount']
    _record_and_bin('credit_past_due_amount',
                    np.insert(_quantile_thresholds(positive_past_due), 0, 0+smallestF))
    #--'inq_12_month_num', 'card_inq_24_month_num'--
    for _col in ('inq_12_month_num', 'card_inq_24_month_num'):
        _record_and_bin(_col, _quantile_thresholds(DF[_col]))
    #--'card_open_36_month_num', 'auto_open_36_month_num'--
    _copy_through(('card_open_36_month_num', 'auto_open_36_month_num'))
    #--utilisation columns--
    for _col in ('uti_card', 'uti_50plus_pct', 'uti_max_credit_line', 'uti_card_50plus_pct'):
        _record_and_bin(_col, _quantile_thresholds(DF[_col]))
    #--'ind_acc_XYZ'--
    _copy_through(('ind_acc_XYZ',))
    #--'rep_income'--
    _record_and_bin('rep_income', _quantile_thresholds(DF['rep_income']))
    #--'States'--
    _col = 'States'
    # The encoder's default (sparse) output is densified explicitly because the keyword
    # that requested dense output was renamed between scikit-learn releases.
    enc = OneHotEncoder(handle_unknown='ignore')
    SM = enc.fit_transform(DF[_col].to_numpy().reshape(-1,1)).toarray()
    for ind, cat in enumerate(enc.categories_[0]):
        indicator = 'is'+str(cat) # Inserted column names are isAK, isAL, isDC..etc.
        DF_Cleaned.drop(columns=[indicator],inplace=True,errors='ignore')
        # 1 where the row belongs to that state, 0 otherwise. Every indicator is
        # inserted at position 0, so the last category ends up leftmost.
        DF_Cleaned.insert(0, indicator, SM[:,ind])
    DF_Cleaned.drop(columns=[_col],inplace=True) #Remove States Column
    #--'Default_ind'--
    _copy_through(('Default_ind',))
    #--
    DFThresh = pd.DataFrame(thresh_records,
                            columns=list(createThreshDict("",[]).keys()))
    return DF_Cleaned,DFThresh

In [None]:
def binDataFrameWithThresh(DF,ThreshDF):
    """Bin the columns of ``DF`` with thresholds that were computed elsewhere.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame to transform; it is not modified.
    ThreshDF : pandas.DataFrame
        One row per column to bin, with the fields ``colName`` and ``ThreshArr``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``DF`` in which each column listed in ``ThreshDF`` is replaced by
        its ``binColumn`` codes; all other columns are unchanged.
    """
    binned = DF.copy()
    threshold_rows = ThreshDF[['colName', 'ThreshArr']].itertuples(index=False, name=None)
    for column_name, cut_points in threshold_rows:
        # Always bin from the untouched input, never from the partially binned copy.
        binned[column_name] = binColumn(DF[column_name], cut_points)
    return binned
def oneHotEncodeStates(DF):
    """Replace the ``States`` column of ``DF`` with ``is<STATE>`` indicator columns.

    Parameters
    ----------
    DF : pandas.DataFrame
        Frame containing a ``States`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``DF`` without ``States`` and with one 0/1 column per state that
        occurs in ``DF``, placed in front of the other columns.

    Notes
    -----
    The encoder is fitted on ``DF`` itself, so the set of indicator columns depends
    on which states are present in this particular frame. NOTE(review): a split that
    lacks a state seen in training will come out with fewer columns — confirm that
    all splits cover the same states.
    """
    _col = 'States'
    DF_Fixed = DF.copy()
    # The encoder's default (sparse) output is densified explicitly because the keyword
    # that requested dense output was renamed between scikit-learn releases.
    enc = OneHotEncoder(handle_unknown='ignore')
    SM = enc.fit_transform(DF[_col].to_numpy().reshape(-1,1)).toarray()
    for ind, cat in enumerate(enc.categories_[0]):
        indicator = 'is'+str(cat) # Inserted column names are isAK, isAL, isDC..etc.
        DF_Fixed.drop(columns=[indicator],inplace=True,errors='ignore')
        # 1 where the row belongs to that state, 0 otherwise. Every indicator is
        # inserted at position 0, so the last category ends up leftmost.
        DF_Fixed.insert(0, indicator, SM[:,ind])
    DF_Fixed.drop(columns=[_col],inplace=True) #Remove States Column
    return DF_Fixed

In [None]:
# Learn the bin thresholds on the training split; df_thresh carries them so the same
# cut points can be applied to the other splits.
df_train,df_thresh = trainDataBinClean(orig_train_df)

In [None]:
# Validation and test splits: bin with the thresholds learnt on the training split,
# then swap the States column for its indicator columns.
df_valid = oneHotEncodeStates(binDataFrameWithThresh(orig_valid_df, df_thresh))
df_test = oneHotEncodeStates(binDataFrameWithThresh(orig_test_df, df_thresh))

In [None]:
# Fixed seed so the shuffles, and with them the cross-validation folds, are reproducible.
SHUFFLE_SEED = 42

df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED) #Shuffle!
# Features are every column but the last, the target is the last column.
# Assumes Default_ind is the last column of the prepared frames — TODO confirm.
df_trainX = df_train.iloc[:,:-1]
df_trainY = df_train.iloc[:,-1]
#
# Shuffle the validation split itself, not a re-sample of df_train: the metrics must be
# measured on rows the model has not been fitted on. reindex() forces the training
# column layout; an indicator column missing from this split is filled with 0, and a
# column unknown to the training frame is dropped.
df_valid = df_valid.reindex(columns=df_train.columns, fill_value=0).sample(frac=1, random_state=SHUFFLE_SEED)
df_validX = df_valid.iloc[:,:-1]
df_validY = df_valid.iloc[:,-1]
#
# Same treatment for the test split.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0).sample(frac=1, random_state=SHUFFLE_SEED)
df_testX = df_test.iloc[:,:-1]
df_testY = df_test.iloc[:,-1]

# --- Dataframes Ready ---

In [None]:
# Class-weighted logistic regression, scored with 10-fold cross-validation on the training split.
model = LogisticRegression(solver='liblinear', class_weight='balanced')
scores = cross_validate(
    model,
    df_trainX,
    df_trainY,
    cv=10,
    scoring=('accuracy', 'recall', 'precision'),
)

In [None]:
# Turn the dict returned by cross_validate into a table (one column per entry) and display it.
cv_scores = pd.DataFrame(data=scores)
cv_scores

In [None]:
# Mean of each metric over the cross-validation folds.
print('Average Cross Validation Results')
for _caption, _metric in (
    ('Accuracy =', 'test_accuracy'),
    ('Recall =', 'test_recall'),
    ('Precision =', 'test_precision'),
):
    print(_caption, np.average(cv_scores[_metric]))

In [None]:
# Manual check: fit a fresh model on the training split and score its predictions for df_validX.

In [None]:
# Same estimator settings as the cross-validated `model`, fitted once on the whole training split.
model2 = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
)
# fit() returns the fitted estimator, so `hist` refers to model2 itself.
hist = model2.fit(X=df_trainX, y=df_trainY)
df_validYPred = model2.predict(X=df_validX)

In [None]:
# Compare the predictions with the true labels of df_validY.
accScore = accuracy_score(y_true=df_validY, y_pred=df_validYPred)
recScore = recall_score(y_true=df_validY, y_pred=df_validYPred)
preScore = precision_score(y_true=df_validY, y_pred=df_validYPred)

In [None]:
# Report the three metrics in the same order as the cross-validation summary.
for _caption, _value in (
    ('Accuracy =', accScore),
    ('Recall =', recScore),
    ('Precision =', preScore),
):
    print(_caption, _value)