In [663]:
# Import all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame, Series

# BROAD PREPROCESSING

I want to manually encode binary columns, assigning the Y and N (or equivalent) values to 1 and 0, respectively. For string columns such as "EDUCATION", however, I will use one hot encoding to minimize potential errors.

Null values will also need to be handled effectively. I will replace all null values in binary or ordinal columns with the column's mode, since that means that there is essentially a 50% chance to get the entry "right," but I will use the column's median for numerical columns.

In [664]:
# Read in the Credit Card dataset
credit_base = pd.read_csv("Credit_card.csv")
labels = pd.read_csv("Credit_card_label.csv")

# Merge Credit_card.csv and Credit_card_label.csv so that there's an additional column representing whether the entry was accepted or denied
# for a credit card
credit_df = pd.merge(credit_base, labels, on = 'Ind_ID')

credit_df.head(10)

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members,label
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2,1
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2,1
3,5009749,F,Y,N,0,,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
4,5009752,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
5,5009753,,Y,N,0,315000.0,Pensioner,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
6,5009754,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2,1
7,5009894,F,N,N,0,180000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-22134.0,365243,1,0,0,0,,2,1
8,5010864,M,Y,Y,1,450000.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-18173.0,-678,1,0,1,1,Core staff,3,1
9,5010868,M,Y,Y,1,450000.0,Pensioner,Secondary / secondary special,Married,House / apartment,-18173.0,-678,1,0,1,1,Core staff,3,1


In [665]:
# Define the function that standardizes the given column, with type hinting and a default argument when appropriate
def std_gender(col: Series = credit_df['GENDER']) -> Series:
    m = {'M' : 1, 'F' : 2}
    col = col.map(m)
    col = col.fillna(col.mode().iloc[0]) # I'm replacing all null values with the mode since the responses in this column are binary; the
    # most commonly occurring value makes sense to substitute null with as we proverbially have a 50-50 shot of getting the answer right
    return col

credit_df['GENDER'] = std_gender()

# Define the function that standardizes the given column, with type hinting and a default argument when appropriate
def std_car(col: Series = credit_df['Car_Owner']) -> Series:
    m = {'Y' : 1, 'N' : 2}
    col = col.map(m)
    col = col.fillna(col.mode().iloc[0]) # I'm replacing all null values with the mode since the responses in this column are binary; the
    # most commonly occurring value makes sense to substitute null with as we proverbially have a 50-50 shot of getting the answer right
    return col

credit_df['Car_Owner'] = std_car()

# Define the function that standardizes the given column, with type hinting and a default argument when appropriate
def std_property(col: Series = credit_df['Propert_Owner']) -> Series:
    m = {'Y' : 1, 'N' : 2}
    col = col.map(m)
    col = col.fillna(col.mode().iloc[0]) # I'm replacing all null values with the mode since the responses in this column are binary; the
    # most commonly occurring value makes sense to substitute null with as we proverbially have a 50-50 shot of getting the answer right
    return col

credit_df['Propert_Owner'] = std_property()

# One hot encode all categorical string columns
def encode(col: Series, df: DataFrame = credit_df) -> DataFrame:
    temp = pd.get_dummies(df[col])
    df = df.drop(columns = [col])
    df = df.join(temp)
    return df


cols = ['Type_Income', 'EDUCATION', 'Marital_status', 'Housing_type', 'Type_Occupation']

for col in cols:
    credit_df = encode(col, credit_df)

# For miscellaneous null cleaning
def null_sub(df: DataFrame = credit_df) -> DataFrame:
    for col in df.columns:
        if df[col].dtype in ['int64', 'float64']: # Numerical columns
            median = df[col].median()
            df[col] = df[col].fillna(median)
        elif df[col].dtype == 'object': # Categorical columns
            mode = df[col].mode().iloc[0]
            df[col] = df[col].fillna(mode)
    return df

credit_df = null_sub()

credit_df.head(10)

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Birthday_count,Employed_days,Mobile_phone,Work_Phone,...,Laborers,Low-skill Laborers,Managers,Medicine staff,Private service staff,Realty agents,Sales staff,Secretaries,Security staff,Waiters/barmen staff
0,5008827,1.0,1,1,0,180000.0,-18772.0,365243,1,0,...,False,False,False,False,False,False,False,False,False,False
1,5009744,2.0,1,2,0,315000.0,-13557.0,-586,1,1,...,False,False,False,False,False,False,False,False,False,False
2,5009746,2.0,1,2,0,315000.0,-15661.5,-586,1,1,...,False,False,False,False,False,False,False,False,False,False
3,5009749,2.0,1,2,0,166500.0,-13557.0,-586,1,1,...,False,False,False,False,False,False,False,False,False,False
4,5009752,2.0,1,2,0,315000.0,-13557.0,-586,1,1,...,False,False,False,False,False,False,False,False,False,False
5,5009753,2.0,1,2,0,315000.0,-13557.0,-586,1,1,...,False,False,False,False,False,False,False,False,False,False
6,5009754,2.0,1,2,0,315000.0,-13557.0,-586,1,1,...,False,False,False,False,False,False,False,False,False,False
7,5009894,2.0,2,2,0,180000.0,-22134.0,365243,1,0,...,False,False,False,False,False,False,False,False,False,False
8,5010864,1.0,1,1,1,450000.0,-18173.0,-678,1,0,...,False,False,False,False,False,False,False,False,False,False
9,5010868,1.0,1,1,1,450000.0,-18173.0,-678,1,0,...,False,False,False,False,False,False,False,False,False,False


# UNIVARIATE LINEAR REGRESSION

In [666]:
# X is what we are using to predict, while y is what we are predicting. Univariate linear regression only involves one independent variable,
# so I chose the 'Annual_income' column since it logically seems like the predominant factor deciding the label, other than residence.
X = credit_df[['Annual_income']]
y = credit_df['label']

# Conduct the train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42) # Set to 42 for congruence with our class 
# notes; I like 02 as well, though :)

# Create and fit the linear regression model
lReg = LinearRegression()
lReg.fit(X_train, y_train)

# Predict on the test set
lPred = lReg.predict(X_test)

# Check accuracy; abhorrent
r2 = lReg.score(X_test, y_test)
print(r2)

-0.01587415490612476


### TAKEAWAYS

An R-squared score of -0.016 is genuienly terrible; it seems as though annual_income alone is not enough to predict whether someone will receive a credit card. With so many different variables at play, it lines up that there are other considerations that need to be made when predicting 'label.' Regardless, perhaps the most damning reason for univariate linear regression's failure stems not from "univariate," but from "linear." We are trying to predict a binary outcome-- whether someone will be approved for a credit card or not, yes or no-- which linear regression is not suitable for, since it is liable to produce predictions outside our range of acceptable answers, which is [0,1].

# K-NEAREST NEIGHBOR (KNN)

In [667]:
# First, we need to establish our train-test split as well as our variables

X = credit_df.drop('label', axis = 1)
y = credit_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

# Find the optimal K value
def optimalNeighbors(X_train, X_test, y_train, y_test) -> int:
    maxScore = 0
    maxK = 1

    for currK in range(1, 100):
        knn = KNeighborsClassifier(n_neighbors = currK)
        knn.fit(X_train, y_train)
        knnPred = knn.predict(X_test)
        currScore = knn.score(X_test, y_test)
        if (currScore > maxScore):
            maxScore = currScore
            maxK = currK

    return maxK

# Establish K
k = optimalNeighbors(X_train, X_test, y_train, y_test)

# Train the KNN model using the optimal number of neighbors found above
knn = KNeighborsClassifier(n_neighbors = k)
knn.fit(X_train, y_train)

# Predict
kPred = knn.predict(X_test)
print(knn.score(X_test, y_test))

0.9075268817204301


### TAKEAWAYS

A score of 90.75 is pretty damn good, all things considered; finding the optimal K value seems to have affected our accuracy in a pretty major way. KNN does work far better for predicting deterministic outcomes than univariate linear regression, so this outcome makes sense.

# LOGISTIC REGRESSION

In [668]:
X = credit_df.drop('label', axis = 1)
y = credit_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

regression = LogisticRegression(random_state = 42).fit(X_train, y_train)
y_predicted = regression.predict(X_test)

regression.score(X_test, y_test)

0.9032258064516129

### TAKEAWAYS

A score of 90.3 is still good, albeit slightly less than KNN. Still, logistic regression is far superior to linear regression for this use case, which, of course, is to be expected, as logistic regression is far more suited to predicting binary or otherwise deterministic outcomes.

# NORMALIZATION

In [669]:
X = credit_df.drop('label', axis = 1)
y = credit_df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
# Use StandardScaler() so that the mean is 0 and the standard deviation is 1, making the data more centralized
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.fit_transform(X_test)

# Use univariate linear regression on the newly normalized data
lReg = LinearRegression()
lReg.fit(X_train, y_train)

# Predict on the test set
lPred = lReg.predict(X_test)

# Check accuracy
r2 = lReg.score(X_test, y_test)
print(r2)

-4.779045750960979e+18


In [670]:
# Use KNN using the newly normalized data

knn = KNeighborsClassifier(n_neighbors = k)
knn.fit(X_train, y_train)

# Predict
kPred = knn.predict(X_test)
print(knn.score(X_test, y_test))

0.896774193548387


In [671]:
# Use logistic regression using the newly normalized data

regression = LogisticRegression(random_state = 42).fit(X_train, y_train)
y_predicted = regression.predict(X_test)

regression.score(X_test, y_test)

0.9064516129032258

### TAKEAWAYS

Interestingly, it seems as though normalizing the data made univariate linear regression and KNN perform slightly worse, while logistic regression performed marginally better. I am unsure what this is indicative of-- do you have any thoughts?

# WHAT METRIC IS MOST CONDUCIVE?

For our purposes, there's three primary metrics that measure our dataset's performance: accuracy, precision, and recall. Let's go through each one.

Accuracy is simply the number of correct predictions divided by the number of total predictions. This seems pretty accurate on paper; however, this can be skewed if one prediction heavily outweighs all others, which is more likely if not liable to occur in situations we are predicting a binary outcome. To demonstrate:

In [672]:
credit_df['label'].value_counts()

label
0    1373
1     175
Name: count, dtype: int64

Above, you can see that far, far more people were denied a credit card than those who were accepted. In these cases, accuracy can prove to be misleading.

What about precision, then? Well, precision is the number of true positive predictions (i.e., values which were predicted to be positive and actually were) divided by the total number of positives, both true and false. This can prove useful in cases where we wish to minimize the number of false credit card approvals, which would seem applicable in times where credit card fraud is a genuine concern.

It's still worth discussing, recall, however, which is the number of true positives divided by the total actual positives (which not only includes true positives, but also values the model flagged as negative, which were actually positive-- false negatives!). This metric can aid us in minimizing the number of rejections of valid applicants, which would seem applicable in times where the bank has a need to establish new credit lines.

### TAKEAWAYS

With the above in mind, precision and recall seem like the most conducive metrics for our purposes, with the former helping us identify false positives and the latter helping us identify false negatives. For a successful bank, preventing fraud is definitely high on the priority list; as such, precision would be the most helpful metric for the institution, as well as us, to look at when measuring the dataset's performance.