In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import numpy as np
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
from sklearn import linear_model, metrics
import matplotlib.pyplot as plt

In [22]:
# Read the cleaned dataset, treating -9 as a missing-value marker, and keep
# only the rows that have no missing values at all.
df = pd.read_csv('../data/final_cleaned.csv', na_values=-9).dropna()
df.shape

(9861, 26)

In [25]:
# Columns used by the model; the last entry, 'y', is the target and every
# other entry is an input feature (later cells rely on this ordering via
# usecols[:-1]).
usecols = ['ExternalRiskEstimate', 'NumSatisfactoryTrades', 'MaxDelq2PublicRecLast12M',
           'MaxDelqEver', 'MSinceMostRecentInqexcl7days', 'NumInqLast6M',
           'NumInqLast6Mexcl7days', 'NumRevolvingTradesWBalance',
           'NumBank2NatlTradesWHighUtilization', 'y']

new_df = df[usecols]

X = new_df.drop('y', axis=1)
Y = new_df['y']

# Candidate values for the inverse regularisation strength C.
C = [0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100, 300, 1000, 3000, 10000, 30000, 100000, 300000]
best_acc = 0
best_c = None

# A fixed seed keeps the split -- and therefore the selected C and the printed
# accuracy -- identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0)


def _l1_logistic(c):
    """Return an unfitted L1-penalised logistic regression with strength ``c``."""
    return linear_model.LogisticRegression(penalty='l1', C=c,
                                           intercept_scaling=1,
                                           solver='liblinear',
                                           max_iter=1000)


for c in C:
    clf = _l1_logistic(c)
    clf.fit(X_train, y_train)
    # Predict on the same DataFrame layout the model was fitted on.
    y_pred = clf.predict(X_test)
    # accuracy_score expects (y_true, y_pred).
    acc = metrics.accuracy_score(y_test, y_pred)

    # Keep this C only if it is strictly more accurate than the best so far,
    # so ties are resolved in favour of the smaller (earlier) C.
    if acc > best_acc:
        best_acc = acc
        best_c = c

# NOTE(review): C is chosen using the same held-out split that best_acc is
# measured on, so best_acc is an optimistic estimate; consider a separate
# validation split or cross-validation for the selection step.
print(best_acc)

# Refit with the winning C; this is the model the GUI cell uses for predictions.
clf = _l1_logistic(best_c)
clf.fit(X_train, y_train)

0.7214111922141119


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [26]:
# Raw dataset used to derive the per-feature cutoffs that colour the GUI fields.
# read_csv already returns a DataFrame, so no extra wrapping is needed.
# NOTE(review): unlike the cleaned-file load, no na_values is passed here, so
# any sentinel codes present in the raw file take part in the quantiles below
# -- confirm this is intended.
df = pd.read_csv('../data/heloc_dataset_v1.csv')

# Features whose cutoffs (and, in the GUI, entered values) are sign-flipped so
# that the same "smaller is greener" comparison can be used for every feature.
# The GUI code describes these as the features with positive coefficients.
pos_coeffs = [usecols[0], usecols[1], usecols[2], usecols[4], usecols[6]]

# feature name -> [lower cutoff, upper cutoff]
data_cutoffs = {}

for i in usecols[:-1]:  # skip the target column 'y'
    # 40th and 60th percentile of the feature, in that order.
    vals = df[i].quantile([.4, .6]).values
    if i in pos_coeffs:
        # Negating reverses the order, so the two cutoffs are swapped to keep
        # the pair ascending.
        data_cutoffs[i] = [-1 * vals[1], -1 * vals[0]]
    else:
        data_cutoffs[i] = [vals[0], vals[1]]

In [28]:
# %%python
from tkinter import Tk, Label, Button, Entry, IntVar, StringVar, END, W, E

class Calculator:
    """Tkinter form with one integer field per model feature plus a Submit button.

    On Submit every field is coloured green / yellow / red according to the
    cutoffs in ``data_cutoffs`` and the fitted model's prediction is shown
    below the button.

    The class relies on module-level names defined in other cells of the
    notebook: ``usecols``, ``pos_coeffs``, ``data_cutoffs`` and ``clf``.
    A field left blank is treated as 0, matching how ``validate`` treats a
    cleared field.
    """

    # Background colours for the three bands.
    COLOR_GREEN = "#baef56"
    COLOR_YELLOW = "#fffd9b"
    COLOR_RED = "#f7c0c0"

    def __init__(self, master):
        """Build the widgets inside ``master`` (the Tk root or a container)."""
        self.master = master
        master.title("Sub Prime Mortgage")
        vcmd = master.register(self.validate) # we have to wrap the command

        # LISTS TO CONTAIN REFERENCES TO WIDGETS
        self.param_values = list()
        self.param_value_types = list()
        self.param_labels = list()
        self.param_entries = list()

        # CREATING WIDGETS (ENTRIES AND LABELS)
        for var in usecols[:-1]:  # one row per feature; the target 'y' is skipped
            param_value = 0
            param_label = Label(master, text=var)
            # NOTE(review): this IntVar is stored but never passed to the Entry
            # as textvariable, so the entries start out blank rather than "0".
            param = IntVar()
            param.set(param_value)
            # '%P' makes Tk pass the prospective field text to validate().
            param_entry = Entry(master, validate="key", validatecommand=(vcmd, '%P'))

            self.param_values.append(param_value)
            self.param_labels.append(param_label)
            self.param_value_types.append(param)
            self.param_entries.append(param_entry)

        self.result_label_text = StringVar()
        self.result_label = Label(master, textvariable=self.result_label_text)
        self.submit_button = Button(master, text="Submit", command=lambda: self.update("submit"))

        # LAYOUT
        for row, (label, entry) in enumerate(zip(self.param_labels, self.param_entries)):
            label.grid(row=row, column=0, sticky=W)
            entry.grid(row=row, column=1, columnspan=2, sticky=E)
        self.submit_button.grid(row=len(self.param_entries), column=1)
        self.result_label.grid(row=len(self.param_entries) + 1, column=1)

    def validate(self, new_text):
        """Key-validation callback: accept only text that is empty or an integer.

        Stores the parsed value (0 for an empty field) in ``self.entered_number``
        and returns True to accept the edit or False to reject it.
        """
        if not new_text: # the field is being cleared
            self.entered_number = 0
            return True
        try:
            self.entered_number = int(new_text)
            return True
        except ValueError:
            return False

    @staticmethod
    def _entry_number(text):
        """Convert a field's text to float, treating a blank field as 0.0."""
        # Without this guard float("") raises ValueError and Submit does
        # nothing whenever a field is left empty.
        if not text.strip():
            return 0.0
        return float(text)

    @staticmethod
    def _band_color(value, cutoffs, flip_sign=False):
        """Return the background colour for ``value`` given ``[low, high]`` cutoffs.

        With ``flip_sign`` the value is negated first, which mirrors how the
        cutoffs of the ``pos_coeffs`` features were negated when they were
        built. A plain negation is used so that a negative input is not
        ranked as if it were its absolute value.
        """
        if flip_sign:
            value = -value
        if value < cutoffs[0]:
            return Calculator.COLOR_GREEN
        if value < cutoffs[1]:
            return Calculator.COLOR_YELLOW
        return Calculator.COLOR_RED

    def update_background_color(self):
        """Colour every entry field according to its feature's cutoffs."""
        # zip stops at the shorter sequence, so the trailing 'y' in usecols
        # is never paired with an entry.
        for param, entry in zip(usecols, self.param_entries):
            entry_value = self._entry_number(entry.get())
            # case where coefficients are positive -> compare with flipped sign
            color = self._band_color(entry_value, data_cutoffs[param], param in pos_coeffs)
            entry.config({"background": color})

    def update_result_label(self):
        """Run the fitted model on the entered values and display its prediction."""
        # A single-row 2-D list, in the same feature order as usecols[:-1].
        entry_vals = [[self._entry_number(entry.get()) for entry in self.param_entries]]
        print(entry_vals)
        prediction = clf.predict(entry_vals)[0]
        print(prediction)
        self.result_label_text.set("Result: " + str(prediction))

    def update(self, method):
        """Button dispatcher; ``"submit"`` recolours the fields and refreshes the result."""
        if method == "submit":
            self.update_background_color()
            self.update_result_label()
                
# Create the top-level window, build the form inside it, and hand control to
# Tk's event loop; mainloop() returns only once the window has been closed.
root = Tk()
my_gui = Calculator(master=root)
root.mainloop()

[[11.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
0
[[2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
0
[[2.0, 1.0, 1.0, 122222.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
0
[[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0]]
0
[[0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 777.0, 1.0, 1.0]]
1
