**Importing data and train-test split**

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import matplotlib.pyplot as plt
# Load the raw dataset; the path is relative to the kernel's working directory.
df = pd.read_csv(
    "cc_data.csv",
    encoding="UTF-8",
)

In [None]:
from sklearn.model_selection import train_test_split

# All columns except the label, one-hot encoded. Kept under its original name
# in case later cells still refer to it.
df_X1 = pd.get_dummies(df.drop(columns=['REALTYPE']))

# Feature matrix: drop the label ('REALTYPE') and the row identifier ('ID')
# *before* one-hot encoding. Encoding first would expand a non-numeric 'ID'
# into 'ID_<value>' dummy columns that a later `!= 'ID'` filter cannot remove.
# One get_dummies pass is sufficient; a second pass over its output is a no-op.
# errors='ignore' keeps the original tolerance for a frame without an 'ID' column.
df_X = pd.get_dummies(df.drop(columns=['REALTYPE', 'ID'], errors='ignore'))

# Label vector: the 'REALTYPE' column, copied so later edits cannot touch `df`.
df_y = df['REALTYPE'].copy()

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# The split is stratified on the label: the classes are heavily imbalanced
# (the confusion matrix later in this notebook shows 264 rows in one class
# versus 35,841 in the other), so an unstratified split can leave the two
# partitions with noticeably different class ratios.
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=1,
    stratify=df_y,
)

print("Number of training instances: ", len(X_train), "\nNumber of test instances: ", len(X_test))

Number of training instances:  28884 
Number of test instances:  7221


In [None]:
import imblearn

**Near Miss Undersampling**

In [None]:
from imblearn.under_sampling import NearMiss

# Balance the training split with NearMiss (default settings); the held-out
# test split is not resampled.
undersample = NearMiss()
X_nm, y_nm = undersample.fit_resample(
    X_train,
    y_train,
)

**K-Means Clustering**

In [None]:
from imblearn.under_sampling import NearMiss
from sklearn.cluster import KMeans

# Undersample the *whole* dataset (not just the training split) with NearMiss.
# NOTE: this overwrites the X_nm / y_nm produced from the training split in the
# previous section; every cell below uses these full-dataset versions.
undersample = NearMiss()
X_nm, y_nm = undersample.fit_resample(df_X, df_y)

# Two clusters, to be compared against the two label classes further down.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto' in
# 1.4); leaving it implicit makes the result depend on the installed version
# and raises a FutureWarning on 1.2/1.3.
# NOTE(review): features are clustered unscaled, so large-magnitude columns may
# dominate the Euclidean distance -- confirm this is intended.
kmeans_nm = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_nm)
kmeans_nm.labels_


array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0,

In [None]:
# Baseline: K-Means with two clusters on the full, un-resampled feature matrix.
# n_init is pinned because scikit-learn changed its default (10 -> 'auto' in
# 1.4); an implicit value makes the fit depend on the installed version.
kmeans_1 = KMeans(n_clusters=2, n_init=10, random_state=0).fit(df_X)

# Cluster sizes. Series.value_counts() replaces the deprecated top-level
# pd.value_counts(); the counts returned are the same.
pd.Series(kmeans_1.labels_).value_counts()

1    30036
0     6069
dtype: int64

In [None]:
# Silhouette score (Euclidean distance) as a measure of how well the clusters
# are separated: score1 is for the full data, score2 for the undersampled data.
from sklearn.metrics import silhouette_score

score1 = silhouette_score(X=df_X, labels=kmeans_1.labels_, metric='euclidean')
score2 = silhouette_score(X=X_nm, labels=kmeans_nm.labels_, metric='euclidean')
print(score1, score2)

0.7484033688427337 0.7884076559030081


In [None]:
# Compare cluster assignments with the actual labels using the Rand index:
# score3 on the full (imbalanced) data, score4 on the undersampled data, where
# resampling is used to work around the class imbalance.
# NOTE(review): rand_score is not adjusted for chance, which matters when one
# class dominates -- consider whether adjusted_rand_score is the better fit.
from sklearn.metrics import rand_score

score3 = rand_score(labels_true=df_y, labels_pred=kmeans_1.labels_)
score4 = rand_score(labels_true=y_nm, labels_pred=kmeans_nm.labels_)
print(score3, score4)

0.7141097317132095 0.4990512333965844


In [None]:
# Confusion matrix (rows: actual label, columns: cluster id) to count false
# positives and false negatives on the full data.
# NOTE(review): K-Means cluster ids are arbitrary, so "positive" here only
# means "assigned to cluster 1" -- confirm the id/label alignment before
# reading the cells as FP/FN.
from sklearn.metrics import confusion_matrix

confusion_matrix(df_y, kmeans_1.labels_)
# Observation from the original analysis: the false-positive count is high.

array([[ 6022, 29819],
       [   47,   217]])

In [None]:
# Same confusion matrix for the NearMiss-undersampled data, where both classes
# have equal weight (rows: actual label, columns: cluster id).
confusion_matrix(y_nm, kmeans_nm.labels_)
# Observation from the original analysis: the false-negative count is high.

array([[217,  47],
       [217,  47]])

In [None]:
# Davies-Bouldin score as a second check on clustering quality:
# score5 is for the full data, score6 for the undersampled data.
from sklearn.metrics import davies_bouldin_score

score5 = davies_bouldin_score(X=df_X, labels=kmeans_1.labels_)
score6 = davies_bouldin_score(X=X_nm, labels=kmeans_nm.labels_)
print(score5, score6)

0.35665999644976126 0.29449170174270206


In [None]:
# Two-cluster K-Means on the full feature matrix (same settings as kmeans_1).
# n_init is pinned because scikit-learn changed its default (10 -> 'auto' in
# 1.4); an implicit value makes the fit depend on the installed version.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(df_X)