In [172]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score
from sklearn.metrics import precision_recall_curve, f1_score,fbeta_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [210]:
# Load the raw loan table. Three columns get an explicit dtype
# (presumably to avoid mixed-type inference on them -- TODO confirm).
DF = pd.read_csv(
    "data.csv",
    dtype={
        "desc": str,
        "next_paymnt_d": object,
        "verification_status_joint": str,
    },
)
#DF.columns

In [211]:
# One-hot encode the three categorical features.
Loan_grade = pd.get_dummies(DF['grade'])
Home_type = pd.get_dummies(DF['home_ownership'])
term = pd.get_dummies(DF['term'])

# Combine the dummies and keep only the levels used by the models
# (note that ' 60 months' is selected with its leading space).
Dummy = pd.concat([Loan_grade,Home_type,term],axis=1)
DummyUse = Dummy[['E','F','G','RENT','MORTGAGE',' 60 months']]

# Numerical columns, plus the loan id and the target (default_ind).
NumericalUse = DF[['id','loan_amnt','int_rate','annual_inc','default_ind']]

# Feature/target frame.
# The modelling code in this notebook selects features with
# `iloc[:, 1:len(COLS)-1]`, i.e. it skips column 0 and the last column.
# 'id' therefore has to be column 0 and 'default_ind' the last column;
# with the plain concat order the 'E' dummy was skipped instead and the
# raw loan id was fed to the models as a feature.
Features = pd.concat([DummyUse,NumericalUse],axis=1)
Features = Features[['id'] + [col for col in Features.columns if col != 'id']]
COLS = Features.columns
assert COLS[0] == 'id' and COLS[-1] == 'default_ind', "unexpected Features column order"
#sns.pairplot(Features)

In [159]:
# Alternative view (disabled): same plot for the loan amount.
#ax = sns.kdeplot(data=DF, x="loan_amnt", hue="default_ind", multiple="stack")

# Stacked density of annual income, split by the default indicator,
# with the x-axis restricted to 10,000 - 200,000.
ax = sns.kdeplot(
    data=Features,
    x='annual_inc',
    hue="default_ind",
    multiple="stack",
)
ax.set_xlim(left=10000, right=200000)


In [212]:
# Feature space: every column except the last one.
# Target space: the last column, kept as a one-column DataFrame.
X = Features.iloc[:, :len(COLS)-1]
y = Features.iloc[:, len(COLS)-1:]

# Hold out 20% of the rows for the final test.
X_x, X_test, Y_y, y_test = train_test_split(
    X, y, test_size=0.20, random_state=5
)
# Split the remaining 80% again: 25% of it becomes the validation set
# (i.e. 20% of all rows), the rest is the training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_x, Y_y, test_size=.25, random_state=57
)


# Precision and Recall Using KNN
1. Uses only the first 20,000 data points; fitting KNN on more would be too slow
2. Find the K value that gives the best cross-validated accuracy
3. Precision and recall are still zero


In [213]:
def choose_K(X, y, ks=None, cv=10):
    """Cross-validate KNN for a range of k and report the best one.

    Parameters
    ----------
    X, y : feature and target data passed straight to `cross_val_score`.
    ks   : iterable of candidate neighbour counts; defaults to 1..69.
    cv   : number of cross-validation folds (default 10).

    Returns
    -------
    (K_scores, ks, Max_Score, K_value)
        K_scores  - mean cross-validated accuracy for each candidate k
        ks        - the list of candidate k values
        Max_Score - the highest mean accuracy
        K_value   - the k that achieved Max_Score (first one on ties)

    Raises
    ------
    ValueError if `ks` is empty (nothing to take the maximum of).
    """
    ks = list(range(1, 70)) if ks is None else list(ks)

    K_scores = []
    for k in ks:
        knn = KNeighborsClassifier(n_neighbors=k)
        scores = cross_val_score(knn, X, y, cv=cv, scoring='accuracy')
        K_scores.append(scores.mean())

    Max_Score = max(K_scores)
    # list.index() returns the *position* of the best score; map it back to
    # the actual k (the candidates start at 1, so position != k).
    K_value = ks[K_scores.index(Max_Score)]

    return K_scores, ks, Max_Score, K_value

def plot_K(K1,K0,K2,K3):
    """Plot accuracy against k and print the best score and its k.

    K1 is drawn on the x-axis (labelled as the value of K) and K0 on the
    y-axis (labelled as cross-validated accuracy). K2 and K3 are only
    printed: K2 after the 'KFold Cross Validation' caption and K3 after
    the 'When K is' caption.
    """
    plt.plot(K1, K0)
    axis_labels = (
        (plt.xlabel, 'Value of K for KNN'),
        (plt.ylabel, 'Cross-Validated Accuracy'),
    )
    for set_label, text in axis_labels:
        set_label(text)

    for caption, value in (('KFold Cross Validation: ', K2), ('When K is: ', K3)):
        print(caption, value)


def fit_predict_scoreKNN(X, Y, x, y, k, n_rows=20000):
    """Fit KNN on (X, Y), predict on x and score the predictions against y.

    Parameters
    ----------
    X, Y   : frames used for fitting.
    x, y   : frames used for predicting and scoring.
    k      : number of neighbours.
    n_rows : only the first `n_rows` rows of each frame are used
             (default 20000, to keep KNN tractable).

    Features are taken as columns 1 .. len(COLS)-2 of X / x, i.e. column 0
    is skipped (verify that column 0 is the one meant to be excluded,
    e.g. an identifier).

    Returns
    -------
    (FB, Recall, Precision) : macro F-beta (beta=0.70), recall and precision
    of the predictions on `x`, measured against `y`.
    """
    def _features(frame):
        return frame.iloc[:n_rows, 1:len(COLS)-1]

    # instantiate and fit on the (truncated) training data
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(_features(X), Y.iloc[:n_rows])

    # predict on the (truncated) evaluation data
    y_predict = knn.predict(_features(x))

    # Score against the labels that belong to `x`. The previous version
    # compared these predictions with the *training* labels `Y` and passed
    # (y_pred, y_true) to fbeta_score, whose signature is (y_true, y_pred).
    y_true = y.iloc[:n_rows]
    FB = fbeta_score(y_true, y_predict, average='macro', beta=0.70)
    Recall = recall_score(y_true, y_predict)
    Precision = precision_score(y_true, y_predict)

    return FB, Recall, Precision
    
   


In [214]:
# KNN driver cell.
# The k search is slow, so it is left disabled here:
#K = choose_K(X.iloc[:20000,1:],y.iloc[:20000])
#plot_K(K[1],K[0],K[2],K[3])

# Fit on the training split and score on the validation split with k = 67.
fit_predict_scoreKNN(X_train, y_train, X_val, y_val, k=67)

#  Defining Functions for use in Logistic Regression Analysis 
1. Fit the models, predict with each of them, and score each of them 
2. All 'models' are just different versions of logistic regression 
3. The cash flow function is for business analysis 

In [215]:
def fit_predict_scoreLOGREG(X,Y,x,y):
    """Fit three logistic regressions on (X, Y) and score them on (x, y).

    The three models are: unweighted (`lr`), class_weight='balanced'
    (`lr_balanced`) and class_weight={1:10, 0:1} (`lr_Xx`), all with the
    liblinear solver. Features are columns 1 .. len(COLS)-2 of X / x, i.e.
    column 0 is skipped (verify that column 0 is the one meant to be
    excluded, e.g. an identifier).

    Side effects
    ------------
    * Draws a confusion-matrix heatmap for the weighted model at a 0.50
      probability threshold.
    * Publishes the fitted `lr` and `lr_Xx` as module-level names. Other
      cells in this notebook (`hard_soft`, the log-loss cell) read those
      names as globals; without this they are undefined on a fresh kernel.
      The globals always hold the models from the most recent call.

    Returns
    -------
    (F1B_score, F1Xx_score, FBb, FBW, FBbase,
     RecallB, PrecisionB, RecallW, PrecisionW)
        F1 for balanced / weighted, macro F-beta (beta=0.70) for
        balanced / weighted / unweighted, then recall and precision for
        the unweighted model followed by recall and precision for the
        weighted model.
    """
    global lr, lr_Xx

    def _features(frame):
        return frame.iloc[:,1:len(COLS)-1]

    X_fit, x_eval = _features(X), _features(x)

    # instantiate and fit the three variants
    lr = LogisticRegression(solver='liblinear')
    lr_balanced = LogisticRegression(class_weight = 'balanced',solver = 'liblinear')
    lr_Xx = LogisticRegression(class_weight= {1:10, 0:1}, solver='liblinear')
    for model in (lr, lr_balanced, lr_Xx):
        model.fit(X_fit, Y)

    # predict once per model and reuse the predictions for every metric
    y_predictBase = lr.predict(x_eval)
    y_predictBal = lr_balanced.predict(x_eval)
    y_predictW = lr_Xx.predict(x_eval)

    # scoring; sklearn metrics take (y_true, y_pred) in that order -- the
    # F-beta calls previously had the two swapped, which exchanges the
    # roles of precision and recall when beta != 1
    F1B_score = f1_score(y, y_predictBal)
    F1Xx_score = f1_score(y, y_predictW)
    FBb = fbeta_score(y, y_predictBal, average='macro', beta=0.70)
    FBW = fbeta_score(y, y_predictW, average='macro', beta=0.70)
    FBbase = fbeta_score(y, y_predictBase, average='macro', beta=.70)
    RecallB = recall_score(y,y_predictBase)
    PrecisionB = precision_score(y,y_predictBase)
    RecallW = recall_score(y,y_predictW)
    PrecisionW = precision_score(y,y_predictW)

    # Predict class 1 if probability of being in class 1 is at least 0.50
    y_predict = (lr_Xx.predict_proba(x_eval)[:,1] >= .50)
    default_confusion = confusion_matrix(y, y_predict)
    plt.figure(dpi=80)
    sns.heatmap(default_confusion, cmap=plt.cm.Blues, annot=True, square=True, fmt='d',
            xticklabels=['NoDefault', 'Default'],
            yticklabels=['NoDefault', 'Default'])
    plt.xlabel('prediction')
    plt.ylabel('actual')

    return F1B_score,F1Xx_score,FBb,FBW,FBbase,RecallB,PrecisionB,RecallW,PrecisionW

def format_scores(A,B,C,D,E,F,G,H,I):
    """Print the nine logistic-regression scores in labelled groups.

    The arguments are positional and follow the tuple returned by
    `fit_predict_scoreLOGREG`:
        A, B    - F1 of the balanced / explicitly weighted model
        C, D, E - F-beta of the balanced / weighted / unweighted model
        F, G    - recall and precision of the unweighted model
        H, I    - recall and precision of the weighted model

    Returns None; the function only prints.

    NOTE(review): a second `format_scores` with tree labels is defined
    further down in this notebook and replaces this one once that cell
    has run.
    """
    # F1 Scores
    print('Balanced class weights Logistic Regression Test F1:', A)
    # The weighted model in this notebook is built with class_weight
    # {1:10, 0:1}; the label previously claimed 20:1.
    print('10:1 class weights Logistic Regression Test F1: ', B)
    print("")

    # FBeta Scores
    print("FBeta Score Stock: ",E )
    print('FBeta Score Balanced: ', C)
    print('FBeta Score weights: ',D)
    print("")

    #Recall
    print('Recall Score No weights: ',F)
    print('Recall Score weighted: ',H)
    print("")

    #Precision
    print('Precision Score No weights: ',G)
    print('Precision Score weighted: ',I)
    print("")
    
def make_confusion_matrix(model, threshold):
    """Draw a confusion-matrix heatmap for `model` on the test split.

    A row is classified as positive when the model's probability for
    class 1 is greater than or equal to `threshold`. The data comes from
    the module-level `X_test` / `y_test` (not from arguments); features
    are columns 1 .. len(COLS)-2 of `X_test`. Returns None.
    """
    class_names = ['NoDefault', 'Default']

    positive_proba = model.predict_proba(X_test.iloc[:,1:len(COLS)-1])[:,1]
    y_predict = positive_proba >= threshold
    default_confusion = confusion_matrix(y_test, y_predict)

    plt.figure(dpi=80)
    sns.heatmap(
        default_confusion,
        cmap=plt.cm.Blues,
        annot=True,
        square=True,
        fmt='d',
        xticklabels=class_names,
        yticklabels=list(class_names),
    )
    plt.xlabel('prediction')
    plt.ylabel('actual')

def hard_soft(X_test):
    """Return hard and soft classifications for the given frame.

    Reads the module-level models `lr_Xx` (weighted logistic regression)
    and `lr`; both must already exist as fitted globals when this is
    called. Features are columns 1 .. len(COLS)-2 of `X_test`.

    Returns a 3-tuple:
        lr_Xx.predict(...), lr_Xx.predict_proba(...), lr.predict_proba(...)
    """
    features = X_test.iloc[:,1:len(COLS)-1]
    return (
        lr_Xx.predict(features),
        lr_Xx.predict_proba(features),
        lr.predict_proba(features),
    )

def convert_to_hard(SCXx, threshold=.39):
    """Turn soft classifications into hard 0/1 labels.

    Parameters
    ----------
    SCXx      : array-like with a `.tolist()` method whose rows hold the
                class probabilities; element 1 of each row is compared
                with the threshold.
    threshold : a row becomes 1 when its element 1 is strictly greater
                than this value (default .39), otherwise 0.

    Returns
    -------
    list of int (0 or 1), one per row of `SCXx`.
    """
    return [1 if row[1] > threshold else 0 for row in SCXx.tolist()]

def make_cashflow_df(HCXx,X,df):
    """Join hard predictions with the loan amounts needed for cash-flow analysis.

    Parameters
    ----------
    HCXx : 1-D sequence/array of hard predictions, in the same row order as X.
    X    : frame the predictions were made on; only its 'id' column is used.
    df   : full loan table; must contain 'id', 'loan_amnt',
           'total_rec_prncp', 'total_rec_int' and 'default_ind'.

    Returns
    -------
    DataFrame with columns id, Predict, loan_amnt, total_rec_prncp,
    total_rec_int, default_ind (inner-joined on 'id'), with every row that
    contains a missing value dropped. Neither input is modified.
    """
    # `columns` must be an ordered container: recent pandas versions reject a set.
    PREDICT = pd.DataFrame(HCXx, columns=['Predict'])

    # Drop X's original index so ids and predictions line up positionally.
    ID = X[['id']].reset_index(drop=True)
    PredId = pd.concat([ID,PREDICT],axis=1)

    # Use the frame that was passed in (previously the global DF was read
    # and the `df` argument was silently ignored).
    MONEYLOST = df[['id','loan_amnt','total_rec_prncp','total_rec_int','default_ind']]
    MONEY = pd.merge(left=PredId, right=MONEYLOST, on='id')
    return MONEY.dropna()

def make_mask(df):
    """Split `df` by the value of its 'default_ind' column.

    Returns a 2-tuple: first the rows where default_ind == 0, then the
    rows where default_ind == 1. Original indices are preserved.
    """
    by_flag = {flag: df[df['default_ind'] == flag] for flag in (0, 1)}
    return by_flag[0], by_flag[1]

def cash_flow(CASH, CASH2, n_rows=100, interest_rate=.08):
    """Total up money made, walked away from, saved and lost.

    Parameters
    ----------
    CASH   : frame with 'Predict', 'default_ind', 'loan_amnt',
             'total_rec_prncp' and 'total_rec_int'. Only its rows with
             default_ind == 1 contribute (to Saved / Lost).
    CASH2  : frame with 'Predict', 'default_ind' and 'loan_amnt'. Only its
             rows with default_ind == 0 contribute (to Made / Walked).
    n_rows : only the first `n_rows` rows of each frame are considered
             (default 100, as before); pass None to use every row. Frames
             shorter than `n_rows` are simply used in full.
    interest_rate : flat rate added on top of loan_amnt for non-defaulted
             loans (default .08).

    NOTE(review): `make_mask` returns (default_ind == 0, default_ind == 1),
    which is the reverse of what this function expects for (CASH, CASH2).
    Passing its result through unchanged yields all-zero totals.

    Returns
    -------
    (Made_total, Walked_total, Saved_Total, Lost_Total)
        Made   - loan_amnt + interest for non-defaults predicted 0
        Walked - loan_amnt + interest for non-defaults predicted 1
        Saved  - loan_amnt for defaults predicted 1
        Lost   - loan_amnt minus (principal + interest received) for
                 defaults predicted 0
    """
    # The former `RANGE = RecallSpace.shape` line referenced an undefined
    # name and its result was never used, so it has been removed.
    defaults = CASH if n_rows is None else CASH.iloc[:n_rows]
    performing = CASH2 if n_rows is None else CASH2.iloc[:n_rows]

    # actual defaults: missed (predicted 0) vs. caught (predicted 1)
    defaulted = defaults['default_ind'] == 1.0
    missed = defaults[defaulted & (defaults['Predict'] == 0.0)]
    caught = defaults[defaulted & (defaults['Predict'] == 1.0)]
    Lost_Total = (missed['loan_amnt']
                  - (missed['total_rec_prncp'] + missed['total_rec_int'])).sum()
    Saved_Total = caught['loan_amnt'].sum()

    # actual non-defaults: funded (predicted 0) vs. turned away (predicted 1)
    repaid = performing['default_ind'] == 0.0
    loan_value = performing['loan_amnt'] + performing['loan_amnt'] * interest_rate
    Made_total = loan_value[repaid & (performing['Predict'] == 0.0)].sum()
    Walked_total = loan_value[repaid & (performing['Predict'] == 1.0)].sum()

    return Made_total, Walked_total, Saved_Total, Lost_Total

In [216]:
# Fit on the training split and score on the validation split, then
# fit on train+validation and score on the held-out test split.
VAL = fit_predict_scoreLOGREG(X_train, y_train, X_val, y_val)
TEST = fit_predict_scoreLOGREG(X_x, Y_y, X_test, y_test)

# format_scores only prints (it returns None); the nine tuple entries are
# passed through in order.
print("-----VALIDATION-----")
VALIDATE = format_scores(*VAL)
print("-----TEST-----")
TESTED = format_scores(*TEST)

# Calling Cash Flow Functions 
1. Hard and soft classifications
2. Make a cash flow data frame
3. Separate the actual positive class from the actual negative class 

In [39]:
# Hard/soft classifications on the test split (element 0 holds the hard
# predictions of the weighted model).
HCSC = hard_soft(X_test=X_test)
# Attach those predictions to the loan amounts from the full table.
MNY = make_cashflow_df(HCXx=HCSC[0], X=X_test, df=DF)
# Csh[0]: rows with default_ind == 0, Csh[1]: rows with default_ind == 1.
Csh = make_mask(df=MNY)

# Decision Trees 


In [183]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import fbeta_score, make_scorer


def fit_predict_scoreDT(X,Y,x,y,random_state=0):
    """Fit three depth-10 decision trees on (X, Y) and score them on (x, y).

    The three trees are: unweighted (`dt`), class_weight='balanced' (`dtb`)
    and class_weight={1:20, 0:1} (`wdt`). Features are columns
    1 .. len(COLS)-2 of X / x, i.e. column 0 is skipped (verify that
    column 0 is the one meant to be excluded, e.g. an identifier).

    Parameters
    ----------
    X, Y : frames used for fitting.
    x, y : frames used for predicting and scoring.
    random_state : seed handed to every tree so repeated runs give the
        same scores (default 0); pass None for unseeded trees.

    Returns
    -------
    (F1B_score, F1Xx_score, FBb, FBW, FBbase,
     RecallB, PrecisionB, RecallW, PrecisionW)
        F1 for balanced / weighted, macro F-beta (beta=0.70) for
        balanced / weighted / unweighted, then recall and precision for
        the unweighted tree followed by recall and precision for the
        weighted tree.
    """
    def _features(frame):
        return frame.iloc[:,1:len(COLS)-1]

    X_fit, x_eval = _features(X), _features(x)

    weights = {1:20, 0:1}
    dtb = DecisionTreeClassifier(class_weight = 'balanced',max_depth=10,random_state=random_state)
    wdt = DecisionTreeClassifier(class_weight=weights,max_depth=10,random_state=random_state)
    dt = DecisionTreeClassifier(max_depth=10,random_state=random_state)
    for model in (dtb, wdt, dt):
        model.fit(X_fit, Y)

    # predict once per tree and reuse the predictions for every metric
    y_predictBal = dtb.predict(x_eval)
    y_predictW = wdt.predict(x_eval)
    y_predictBase = dt.predict(x_eval)

    # sklearn metrics take (y_true, y_pred) in that order -- the F-beta
    # calls previously had the two swapped, which exchanges the roles of
    # precision and recall when beta != 1
    F1B_score = f1_score(y, y_predictBal)
    F1Xx_score = f1_score(y, y_predictW)
    FBb = fbeta_score(y, y_predictBal, average='macro', beta=0.70)
    FBW = fbeta_score(y, y_predictW, average='macro', beta=0.70)
    FBbase = fbeta_score(y, y_predictBase, average = 'macro', beta=.70)
    RecallB = recall_score(y,y_predictBase)
    PrecisionB = precision_score(y,y_predictBase)
    RecallW = recall_score(y,y_predictW)
    PrecisionW = precision_score(y,y_predictW)

    return F1B_score,F1Xx_score,FBb,FBW,FBbase,RecallB,PrecisionB,RecallW,PrecisionW

def format_scores(A,B,C,D,E,F,G,H,I):
    """Print the nine decision-tree scores in four labelled groups.

    Positional arguments, in order: A, B are the F1 scores (balanced,
    weighted); C, D, E are F-beta scores (balanced, weighted, stock);
    F, G are recall and precision without weights; H, I are recall and
    precision with weights. Each group is followed by an empty line.
    Returns None.

    NOTE(review): this has the same name as the logistic-regression
    formatter defined earlier in the notebook and replaces it once this
    cell has run.
    """
    groups = (
        # F1 scores
        (('Balanced class weights Trees Test F1:', A),
         ('20:1 class weights Trees F1: ', B)),
        # F-beta scores
        (("FBeta Score Stock: ", E),
         ('FBeta Score Balanced: ', C),
         ('FBeta Score weights: ', D)),
        # recall
        (('Recall Score No weights: ', F),
         ('Recall Score weighted: ', H)),
        # precision
        (('Precision Score No weights: ', G),
         ('Precision Score weighted: ', I)),
    )
    for group in groups:
        for label, value in group:
            print(label, value)
        print("")
                                 
                                
# NOTE(review): this fit is repeated verbatim as the first statement of the
# next cell, so the trees are trained twice; this call looks redundant.
VALTREE = fit_predict_scoreDT(X_train,y_train,X_val,y_val)


In [217]:
# Fit on the training split and score on the validation split, then
# fit on train+validation and score on the held-out test split.
VALTREE = fit_predict_scoreDT(X_train, y_train, X_val, y_val)
TESTTREE = fit_predict_scoreDT(X_x, Y_y, X_test, y_test)

# format_scores only prints (it returns None); the nine tuple entries are
# passed through in order.
print("-----VALIDATION-----")
VALIDATE = format_scores(*VALTREE)
print("-----TEST-----")
TESTED = format_scores(*TESTTREE)

# Exploring log loss 
1. Try calculating it 'by hand' 
2. Use the built-in `log_loss` from scikit-learn 



In [220]:
# lets try and interpret log loss 

from sklearn.metrics import log_loss

def LOGLOSS(model,X_test,y_test):
    """Build a per-row table for inspecting a model's log loss by hand.

    Despite its name this does not compute a loss value; it lines up, row
    by row, the predicted probability of the positive class, the hard
    prediction and the true label.

    Parameters
    ----------
    model  : fitted classifier exposing `predict` and `predict_proba`.
    X_test : frame to predict on; features are columns 1 .. len(COLS)-2.
    y_test : frame of true labels in the same row order as `X_test`.
             It is not modified.

    Returns
    -------
    DataFrame with the columns 'Prob in Pos Class', 'Predicted Hard' and
    the column(s) of `y_test`, on a fresh 0..n-1 index.
    """
    features = X_test.iloc[:,1:len(COLS)-1]
    HardClassificationXx = model.predict(features)
    SoftClassificationXx = model.predict_proba(features)

    # element 1 of each probability row is the positive class
    ProbPos = [row[1] for row in SoftClassificationXx]

    # `columns` must be an ordered container: recent pandas versions reject a set.
    PosProb = pd.DataFrame(ProbPos, columns=["Prob in Pos Class"])
    HardPred = pd.DataFrame(HardClassificationXx, columns=['Predicted Hard'])

    # Work on a re-indexed copy; resetting the index in place used to
    # mutate the caller's (global) y_test as a side effect.
    labels = y_test.reset_index(drop=True)

    LogLoss = pd.concat([PosProb,HardPred,labels],axis=1)
    return LogLoss

# NOTE(review): `lr` and `lr_Xx` are read here as module-level fitted models;
# they must already exist as globals before this cell runs.

# Per-row inspection tables for the weighted and the stock model.
LLWeights = LOGLOSS(lr_Xx,X_test,y_test)
LLStock = LOGLOSS(lr,X_test,y_test)
# Built-in log loss on the test split (features are columns 1 .. len(COLS)-2).
LL1 = log_loss(y_test,lr.predict_proba(X_test.iloc[:,1:len(COLS)-1]))
LL2 = log_loss(y_test,lr_Xx.predict_proba(X_test.iloc[:,1:len(COLS)-1]))
print("Log Loss Stock Logistic Regression: ", LL1)
print("Log Loss Weighted Logistic Regression: ",LL2)

# Last expression of the cell: displays the stock model's table.
LLStock
