### General Concept

Given a dataset with several features related to credit score, create a model that predicts the credit score according to the features

### 1. Process Data

In [1]:
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [2]:
# put data into dataframe
df = pd.read_csv("creditscores.csv")

df.head()

Unnamed: 0,Age,Gender,Income,Education,Marital Status,Number of Children,Home Ownership,Credit Score
0,25,Female,50000,Bachelor's Degree,Single,0,Rented,High
1,30,Male,100000,Master's Degree,Married,2,Owned,High
2,35,Female,75000,Doctorate,Married,1,Owned,High
3,40,Male,125000,High School Diploma,Single,0,Owned,High
4,45,Female,100000,Bachelor's Degree,Married,3,Owned,High


In [3]:
pure_df = df.copy()

# create label encoder to make strings into categorized numericals
le = LabelEncoder()

# list of columns needed to be encoded
le_cols = ["Gender", "Education", "Marital Status", "Home Ownership", "Credit Score"]

# encode every listed column
for column in df.columns:
    if column in le_cols:
        le.fit(df[column])
        df[column] = le.transform(df[column])


df.head()

Unnamed: 0,Age,Gender,Income,Education,Marital Status,Number of Children,Home Ownership,Credit Score
0,25,0,50000,1,1,0,1,1
1,30,1,100000,4,0,2,0,1
2,35,0,75000,2,0,1,0,1
3,40,1,125000,3,1,0,0,1
4,45,0,100000,1,0,3,0,1


In [4]:
# split data
X_train, X_test, y_train, y_test = train_test_split(df.drop(["Credit Score"], axis=1), 
                                                    df["Credit Score"], 
                                                    test_size=0.2, 
                                                    random_state=53)

# normalize features
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

### 2. Create Model

In [5]:
clf = tree.DecisionTreeClassifier()

### 3. Train Model

In [6]:
clf = clf.fit(X_train, y_train)

tree.plot_tree(clf)

[Text(0.444444,0.916667,'x[6] <= 0.347\ngini = 0.473\nsamples = 131\nvalue = [32, 89, 10]'),
 Text(0.222222,0.75,'x[0] <= -0.974\ngini = 0.045\nsamples = 87\nvalue = [2, 85, 0]'),
 Text(0.333333,0.833333,'True  '),
 Text(0.111111,0.583333,'gini = 0.0\nsamples = 2\nvalue = [2, 0, 0]'),
 Text(0.333333,0.583333,'gini = 0.0\nsamples = 85\nvalue = [0, 85, 0]'),
 Text(0.666667,0.75,'x[2] <= -1.435\ngini = 0.475\nsamples = 44\nvalue = [30.0, 4.0, 10.0]'),
 Text(0.555556,0.833333,'  False'),
 Text(0.555556,0.583333,'gini = 0.0\nsamples = 10\nvalue = [0, 0, 10]'),
 Text(0.777778,0.583333,'x[2] <= -0.107\ngini = 0.208\nsamples = 34\nvalue = [30, 4, 0]'),
 Text(0.666667,0.416667,'x[0] <= -1.445\ngini = 0.062\nsamples = 31\nvalue = [30, 1, 0]'),
 Text(0.555556,0.25,'x[2] <= -0.992\ngini = 0.198\nsamples = 9\nvalue = [8, 1, 0]'),
 Text(0.444444,0.0833333,'gini = 0.0\nsamples = 1\nvalue = [0, 1, 0]'),
 Text(0.666667,0.0833333,'gini = 0.0\nsamples = 8\nvalue = [8, 0, 0]'),
 Text(0.777778,0.25,'gini =

### 4. Evaluate Model

Average = 0, High = 1, Low = 2

In [7]:
accuracy_score(y_test, clf.predict(X_test))

1.0

In [8]:
clf.predict_proba(X_test[:1])

array([[0., 1., 0.]])

In [9]:
def predict(pred_x):
    pred_x = pd.DataFrame([{
        "Age": pred_x[0],
        "Gender": pred_x[1],
        "Income": pred_x[2],
        "Education": pred_x[3],
        "Marital Status": pred_x[4],
        "Number of Children": pred_x[5],
        "Home Ownership": pred_x[6]
    }])

    for column in pred_x.columns:
        if column in le_cols:
            le.fit(pure_df[column])
            pred_x[column] = le.transform(pred_x[column])

    pred_x = scaler.transform(pred_x)

    probs = clf.predict_proba(pred_x)[0]
        
    print(f"Credit Score Rating: Low ({probs[2]*100:.2f}%) | Average ({probs[0]*100:.2f}%) | High ({probs[1]*100:.2f}%)")

Predictions: Age, Gender, Income, Degree, Marital Status, No. of Children, Home Ownership

In [10]:
prediction = [40, "Male", 200000, "High School Diploma", "Single", 0, "Owned"]

predict(prediction)

Credit Score Rating: Low (0.00%) | Average (0.00%) | High (100.00%)
