# With preprocessed data

In [2]:
# Data handling
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

In [3]:
df = pd.read_csv("../data/preprocessed_data.csv")
df

Unnamed: 0,ID,LIMIT_BAL,AGE,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,...,PAY_6_-2,PAY_6_-1,PAY_6_0,PAY_6_2,PAY_6_3,PAY_6_4,PAY_6_5,PAY_6_6,PAY_6_7,PAY_6_8
0,1,20000.0,24,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,2,120000.0,26,2682.0,1725.0,2682.0,3272.0,3455.0,3261.0,0.0,...,0,0,0,1,0,0,0,0,0,0
2,3,90000.0,34,29239.0,14027.0,13559.0,14331.0,14948.0,15549.0,1518.0,...,0,0,1,0,0,0,0,0,0,0
3,4,50000.0,37,46990.0,48233.0,49291.0,28314.0,28959.0,29547.0,2000.0,...,0,0,1,0,0,0,0,0,0,0
4,5,50000.0,57,8617.0,5670.0,35835.0,20940.0,19146.0,19131.0,2000.0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,39,188948.0,192815.0,208365.0,88004.0,31237.0,15980.0,8500.0,...,0,0,1,0,0,0,0,0,0,0
29996,29997,150000.0,43,1683.0,1828.0,3502.0,8979.0,5190.0,0.0,1837.0,...,0,0,1,0,0,0,0,0,0,0
29997,29998,30000.0,37,3565.0,3356.0,2758.0,20878.0,20582.0,19357.0,0.0,...,0,0,1,0,0,0,0,0,0,0
29998,29999,80000.0,41,-1645.0,78379.0,76304.0,52774.0,11855.0,48944.0,85900.0,...,0,1,0,0,0,0,0,0,0,0


In [4]:
X = df.drop(["default.payment.next.month"], axis=1)
y = df["default.payment.next.month"]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Without scaled values

In [6]:
models = [LogisticRegression(), SVC(), LinearSVC(), RandomForestClassifier(), KNeighborsClassifier(), GaussianNB(), Perceptron(), SGDClassifier(), DecisionTreeClassifier()]

In [7]:
for model in models:
    print("Model:", model)
    classifier = model
    classifier.fit(X_train, y_train)
    print(classifier.score(X_test, y_test))

Model: LogisticRegression()
0.7824444444444445
Model: SVC()
0.7804444444444445
Model: LinearSVC()
0.2867777777777778
Model: RandomForestClassifier()
0.8143333333333334
Model: KNeighborsClassifier()
0.7502222222222222
Model: GaussianNB()
0.3658888888888889
Model: Perceptron()
0.7801111111111111
Model: SGDClassifier()
0.7781111111111111
Model: DecisionTreeClassifier()
0.7214444444444444


### With scaled values

In [8]:
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

In [9]:
for model in models:
    print("Model:", model)
    classifier = model
    classifier.fit(X_train, y_train)
    print(classifier.score(X_test, y_test))

Model: LogisticRegression()
0.8195555555555556
Model: SVC()
0.8215555555555556
Model: LinearSVC()
0.8203333333333334
Model: RandomForestClassifier()
0.8172222222222222
Model: KNeighborsClassifier()
0.7945555555555556
Model: GaussianNB()
0.21955555555555556
Model: Perceptron()
0.6976666666666667
Model: SGDClassifier()
0.8171111111111111
Model: DecisionTreeClassifier()
0.7223333333333334
