# Credit Score Detection

## Importing Libraries 

In [5]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn import svm
from sklearn.model_selection import cross_val_score

## Get Dataset 

In [6]:
data = pd.read_csv("credit_customers.csv")
print(data)

    checking_status  duration                  credit_history  \
0                <0       6.0  critical/other existing credit   
1          0<=X<200      48.0                   existing paid   
2       no checking      12.0  critical/other existing credit   
3                <0      42.0                   existing paid   
4                <0      24.0              delayed previously   
..              ...       ...                             ...   
995     no checking      12.0                   existing paid   
996              <0      30.0                   existing paid   
997     no checking      12.0                   existing paid   
998              <0      45.0                   existing paid   
999        0<=X<200      45.0  critical/other existing credit   

                 purpose  credit_amount    savings_status  employment  \
0               radio/tv         1169.0  no known savings         >=7   
1               radio/tv         5951.0              <100      1<=X<4   


## One-hot-encoding

In [7]:
features_to_encode = ["checking_status", "credit_history", "purpose", "savings_status", "other_payment_plans", 
                      "employment", "personal_status", "other_parties", "property_magnitude","housing", "job"]

one_hot_encoded = pd.get_dummies(data[features_to_encode]) 
data = pd.concat([one_hot_encoded, data], axis=1)
data = data.drop(features_to_encode, axis=1)

data['own_telephone']  = data['own_telephone'].replace({"none": 0, "yes": 1})
data['foreign_worker']  = data['foreign_worker'].replace({"no": 0, "yes": 1})
data['class']  = data['class'].replace({"bad": 0, "good": 1})
df = pd.DataFrame(data)
print(len(df.columns.tolist()))

60


## Scaling Data using MinMax

In [8]:
df = pd.DataFrame(data)

columns_to_scale = ['num_dependents', 'existing_credits', 'age', 'residence_since',
                    'installment_commitment', 'credit_amount', 'duration']


scaler = StandardScaler()
scaled = scaler.fit_transform(df[columns_to_scale])
scaled_columns = pd.DataFrame(scaled, columns=columns_to_scale, index=df.index)
df[columns_to_scale] = scaled_columns
print(df)

     checking_status_0<=X<200  checking_status_<0  checking_status_>=200  \
0                           0                   1                      0   
1                           1                   0                      0   
2                           0                   0                      0   
3                           0                   1                      0   
4                           0                   1                      0   
..                        ...                 ...                    ...   
995                         0                   0                      0   
996                         0                   1                      0   
997                         0                   0                      0   
998                         0                   1                      0   
999                         1                   0                      0   

     checking_status_no checking  credit_history_all paid  \
0                         

## Data Splitting and Bias

In [9]:
X = df.drop("class", axis = 1)
y = data["class"]

X = np.array(X)
y = np.array(y)

ones_column = np.ones((X.shape[0], 1))
X = np.hstack((ones_column, X))

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(700, 60) (300, 60) (700,) (300,)


## SVM

In [10]:
# try different kernels
for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    clf = svm.SVC(kernel=kernel)
    clf.fit(X_train, y_train)
    # print(clf.support_vectors_)

    accuracy_train = clf.score(X_train, y_train)
    accuracy_test  = clf.score(X_test, y_test)

    print(f"{kernel=},\t {accuracy_train=:.3f},\t {accuracy_test=:.3f}")

kernel='linear',	 accuracy_train=0.786,	 accuracy_test=0.767
kernel='poly',	 accuracy_train=0.921,	 accuracy_test=0.773
kernel='rbf',	 accuracy_train=0.870,	 accuracy_test=0.770
kernel='sigmoid',	 accuracy_train=0.700,	 accuracy_test=0.727


In [12]:
# k-fold cross validation with different C values
for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    for kfold in range(2, 10):
        best_mean_for_c = (0, "")
        for c in [0.0000001, 0.00001, 0.001, 0.01, 0.1, 1, 10, 100, 10000, 1000000]:
            clf = svm.SVC(kernel="rbf", C=c)
            clf.fit(X_train, y_train)  # train on train set
            scores = cross_val_score(clf, X_test, y_test, cv=kfold)  # test on test set
            if scores.mean() > best_mean_for_c[0]:
                best_mean_for_c = (scores.mean(), f"{kernel=} {kfold=} {c=},\t acc mean:{scores.mean():.3f},\t acc std:{scores.std():.3f},\t {scores=}")
        print(best_mean_for_c[1])


kfold=2 c=100,	 mean:0.757,	 std:0.030,	 scores=array([0.78666667, 0.72666667])
kfold=3 c=1,	 mean:0.737,	 std:0.045,	 scores=array([0.79, 0.74, 0.68])
kfold=4 c=1,	 mean:0.733,	 std:0.034,	 scores=array([0.77333333, 0.76      , 0.70666667, 0.69333333])
kfold=5 c=1,	 mean:0.723,	 std:0.036,	 scores=array([0.76666667, 0.7       , 0.7       , 0.76666667, 0.68333333])
kfold=6 c=10,	 mean:0.730,	 std:0.044,	 scores=array([0.7 , 0.78, 0.8 , 0.72, 0.68, 0.7 ])
kfold=7 c=10,	 mean:0.733,	 std:0.017,	 scores=array([0.74418605, 0.72093023, 0.74418605, 0.74418605, 0.74418605,
       0.69767442, 0.73809524])
kfold=8 c=10,	 mean:0.737,	 std:0.045,	 scores=array([0.71052632, 0.71052632, 0.84210526, 0.71052632, 0.7027027 ,
       0.7027027 , 0.75675676, 0.75675676])
kfold=9 c=10,	 mean:0.746,	 std:0.030,	 scores=array([0.73529412, 0.76470588, 0.79411765, 0.72727273, 0.72727273,
       0.75757576, 0.72727273, 0.6969697 , 0.78787879])
