In [20]:
from pathlib import Path
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler

import warnings
warnings.filterwarnings('ignore')

In [34]:
# Locations of the cleaned feature table and the status labels.
cleaned_csv = Path('../Resources/CleanedDataset.csv')
status_csv = Path('../Resources/Status.csv')

# Read the features and discard the leftover 'Unnamed: 0' column from the CSV.
df_data = pd.read_csv(cleaned_csv).drop(columns='Unnamed: 0')
df_status = pd.read_csv(status_csv)

# Index-aligned left join: every status row is kept and the feature columns are appended.
df = df_status.join(df_data, how='left')
df

Unnamed: 0,STATUS,AMT_INCOME_TOTAL,AGE,YEARS_EXPERIENCE,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,...,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff
0,0,112500.0,58,3,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,1,0
1,1,270000.0,52,8,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,0,270000.0,52,8,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0,270000.0,52,8,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,1,270000.0,46,2,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17038,1,315000.0,47,6,0,1,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
17039,0,157500.0,33,3,1,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
17040,0,157500.0,33,3,1,0,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
17041,1,283500.0,49,1,1,0,1,0,0,1,...,0,0,0,0,0,0,1,0,0,0


## Scaling

This section will examine some of the different preprocessing scalers from scikit-learn found in the documentation here: `https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#plot-all-scaling-standard-scaler-section`

StandardScaler:
* \+ Will rescale the `AMT_INCOME_TOTAL`, `AGE`, and `YEARS_EXPERIENCE` columns to zero mean and unit variance, so their range is set by each column's own spread
* \+ Binary columns (either 1 or 0) are centred and rescaled as well; each still takes only two distinct values, but they are no longer exactly 1 and 0
* \- The range may be difficult to understand for exploratory purposes

MaxAbsScaler:
* \+ Similar to a MinMax Scaler: each feature is divided by its maximum absolute value, which places values between \[-1,1\]
* \+ The range is easier to understand as every feature is on the same scale
* \- As each feature is on the same scale, values between 0 and 1 will probably be very small values whose meaning may get lost

RobustScaler:
* \+ The scale is based off percentiles rather than variance and will therefore not be influenced by outliers
* \+ Basically just the standard scaler, but with a range that is easier to understand

### No Scaler PCA and KMeans clustering

In [42]:
# Baseline: project the *unscaled* features onto their first two principal components.
#
# A later cell adds a `PCA_k4_cluster` column to `df`. Dropping it here (if present)
# keeps this cell idempotent: re-running it must not feed the previous run's cluster
# labels back into the PCA.
# NOTE(review): `STATUS` is still part of the PCA input - confirm that is intended.
pca_features = df.drop(columns=['PCA_k4_cluster'], errors='ignore')

# Fixed seed (same value as the KMeans cells) so the projection is reproducible
# whenever scikit-learn selects a randomized SVD solver.
pca = PCA(n_components=2, random_state=129)
loan_pca = pca.fit_transform(pca_features)

# Fraction of the total variance captured by each retained component.
var_ratio = pca.explained_variance_ratio_
print(loan_pca[:5], var_ratio)

[[-8.28769807e+04  1.59292302e+01]
 [ 7.46230193e+04  1.11381479e+01]
 [ 7.46230193e+04  1.11380279e+01]
 [ 7.46230193e+04  1.11380279e+01]
 [ 7.46230192e+04  3.43533355e+00]] [9.99999988e-01 8.92090824e-09]


In [43]:
# Wrap the two principal components in a labelled frame for clustering and plotting.
loan_pca_df = pd.DataFrame(loan_pca, columns=['PCA1', 'PCA2'])

# Candidate cluster counts (1 through 10) for the elbow analysis.
k = list(range(1, 11))

# Fit one seeded KMeans per candidate and record its inertia
# (the `inertia_` attribute of the fitted model).
inertia = []
for j in k:
    k_model = KMeans(n_clusters=j, random_state=129).fit(loan_pca_df)
    inertia.append(k_model.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests the cluster count.
pca_elbow_data = {'k': k, 'inertia': inertia}
elbow_df = pd.DataFrame(pca_elbow_data)
elbow_df.hvplot.line(x='k', y='inertia', title='Elbow Curve', xticks=k)

In [44]:
# Combined share of the variance captured by the two retained PCA components,
# expressed as a whole-number percentage.
combined_variance_pct = round((var_ratio[0] + var_ratio[1]) * 100)
print(f"The Variance ratio of the first two PCA variables is: {combined_variance_pct}%")

The Variance ratio of the first two PCA variables is: 100%


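The variance ratios for the first two PCA components are 0.999999988 and 8.92090824e-09, which together account for around 100% of the total variance. This is either perfect or terrible. Because the data is unscaled, the first component is dominated by `AMT_INCOME_TOTAL`, whose values are orders of magnitude larger than those of every other feature.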

The best number of clusters is 4 according to the elbow curve.

In [45]:
# Fit the final 4-cluster KMeans (seeded) on the PCA components; `fit` returns the estimator.
model = KMeans(n_clusters=4, random_state=129).fit(loan_pca_df)

# Cluster label for every row of the PCA frame.
k_4 = model.predict(loan_pca_df)

# New frame with the labels attached, leaving `loan_pca_df` itself untouched.
loan_pca_predictions_df = loan_pca_df.assign(loan_segments=k_4)

In [46]:
# Scatter of the observations in principal-component space, coloured by KMeans segment.
# A title is set so the figure stands on its own, matching the titled elbow plot.
loan_pca_predictions_df.hvplot.scatter(
    x='PCA1',
    y='PCA2',
    by='loan_segments',
    title='KMeans Segments on Unscaled PCA Components'
)

This graph shows a fairly clean spread of the data. There are no extreme outliers.

Zooming in shows some clear vertical lines.

In [50]:
# Attach the KMeans labels to the full dataset under `PCA_k4_cluster`
# (overwrites the column if it already exists).
df = df.assign(PCA_k4_cluster=k_4)

In [51]:
# Write the clustered dataset (without the index) to the current working directory,
# then preview the first rows.
clustered_csv = Path('clustered.csv')
df.to_csv(clustered_csv, index=False)
df.head()

Unnamed: 0,STATUS,AMT_INCOME_TOTAL,AGE,YEARS_EXPERIENCE,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,...,OCCUPATION_TYPE_Low-skill Laborers,OCCUPATION_TYPE_Managers,OCCUPATION_TYPE_Medicine staff,OCCUPATION_TYPE_Private service staff,OCCUPATION_TYPE_Realty agents,OCCUPATION_TYPE_Sales staff,OCCUPATION_TYPE_Secretaries,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_Waiters/barmen staff,PCA_k4_cluster
0,0,112500.0,58,3,0,1,0,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,270000.0,52,8,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
2,0,270000.0,52,8,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
3,0,270000.0,52,8,1,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,1
4,1,270000.0,46,2,0,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1


In [55]:
# Scatter of STATUS against AGE, coloured by the cluster label in `PCA_k4_cluster`.
# A title is set so the figure stands on its own, matching the titled elbow plot.
df.hvplot.scatter(
    x='AGE',
    y='STATUS',
    by='PCA_k4_cluster',
    title='STATUS vs AGE by PCA KMeans Cluster'
)