In [32]:
# Import the modules
import pandas as pd
from pathlib import Path
import hvplot.pandas

## Import the Pandas DataFrame

In [33]:
# Load the credit card default data from the CSV file into a Pandas DataFrame
ccinfo_df = pd.read_csv(Path("../Resources/cc_info_default.csv"))

In [34]:
# Review the last five rows of the DataFrame to confirm the file loaded as expected
ccinfo_df.tail()

Unnamed: 0,limit_bal,education,marriage,age,bill_amt,pay_amt,default
4994,20000,secondary,yes,36,110994,7293,0
4995,180000,other,yes,34,35240,22066,0
4996,200000,secondary,yes,45,691806,21443,1
4997,310000,post-grad,yes,44,1548067,72000,0
4998,160000,primary,no,40,4440,3725,0


In [35]:
# Review the column names, non-null counts, and dtypes of the DataFrame
ccinfo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   limit_bal  4999 non-null   int64 
 1   education  4999 non-null   object
 2   marriage   4999 non-null   object
 3   age        4999 non-null   int64 
 4   bill_amt   4999 non-null   int64 
 5   pay_amt    4999 non-null   int64 
 6   default    4999 non-null   int64 
dtypes: int64(5), object(2)
memory usage: 273.5+ KB


## Transform "education" column with get_dummies

In [36]:
# Count how many rows fall into each category of the education column
value_counts = ccinfo_df.education.value_counts()
value_counts

secondary    2267
primary      1862
post-grad     822
other          48
Name: education, dtype: int64

In [37]:
# One-hot encode the education column: one indicator column per category
education_dummies = pd.get_dummies(ccinfo_df['education'])

# Display the encoded columns
education_dummies

Unnamed: 0,other,post-grad,primary,secondary
0,0,0,0,1
1,0,0,0,1
2,0,0,0,1
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
4994,0,0,0,1
4995,1,0,0,0
4996,0,0,0,1
4997,0,1,0,0


In [38]:
# Append the education_dummies columns to ccinfo_df (column-wise), then
# drop the original education column that the dummies replace
ccinfo_df = pd.concat([ccinfo_df, education_dummies], axis=1).drop(columns=['education'])

# Display the DataFrame
ccinfo_df

Unnamed: 0,limit_bal,marriage,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary
0,20000,yes,24,7704,689,1,0,0,0,1
1,120000,no,26,17077,5000,1,0,0,0,1
2,90000,no,34,101653,11018,0,0,0,0,1
3,50000,yes,37,231334,8388,0,0,0,0,1
4,50000,yes,57,109339,59049,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
4994,20000,yes,36,110994,7293,0,0,0,0,1
4995,180000,yes,34,35240,22066,0,1,0,0,0
4996,200000,yes,45,691806,21443,1,0,0,0,1
4997,310000,yes,44,1548067,72000,0,0,1,0,0


## Transform "marriage" column with encoding function

In [39]:
# Encoding the marriage column using a custom function
def encode_marriage(marriage):
    """Encode a marriage value as a binary flag.

    Returns 1 when ``marriage`` equals the string 'yes' exactly, and 0 for
    every other value.
    """
    return 1 if marriage == 'yes' else 0

# Call the encode_marriage function on the marriage column and store the
# result in a new marriage_dummy column (applied once; a second identical
# apply would only repeat the same work)
ccinfo_df['marriage_dummy'] = ccinfo_df['marriage'].apply(encode_marriage)

# Drop the original marriage column now that it has been encoded
ccinfo_df = ccinfo_df.drop(columns=['marriage'])

# Review the DataFrame
ccinfo_df

Unnamed: 0,limit_bal,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary,marriage_dummy
0,20000,24,7704,689,1,0,0,0,1,1
1,120000,26,17077,5000,1,0,0,0,1,0
2,90000,34,101653,11018,0,0,0,0,1,0
3,50000,37,231334,8388,0,0,0,0,1,1
4,50000,57,109339,59049,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
4994,20000,36,110994,7293,0,0,0,0,1,1
4995,180000,34,35240,22066,0,1,0,0,0,1
4996,200000,45,691806,21443,1,0,0,0,1,1
4997,310000,44,1548067,72000,0,0,1,0,0,1


## Apply the Standard Scaler to "limit_bal", "bill_amt", "pay_amt"

In [40]:
# Import the module
from sklearn.preprocessing import StandardScaler

In [41]:
# Standardize the limit_bal, bill_amt, and pay_amt columns with StandardScaler
ccinfo_data_scaled = StandardScaler().fit_transform(
    ccinfo_df.loc[:, ['limit_bal', 'bill_amt', 'pay_amt']]
)

# Review the scaled array
ccinfo_data_scaled

array([[-1.1173411 , -0.66070266, -0.5427793 ],
       [-0.3499424 , -0.63637003, -0.46399421],
       [-0.58016201, -0.41680786, -0.35401308],
       ...,
       [ 0.26397655,  1.1152494 , -0.16349243],
       [ 1.10811512,  3.33813208,  0.76045505],
       [-0.04298292, -0.66917611, -0.4872953 ]])

In [42]:
# Create a DataFrame of the scaled data.
# The column labels are passed as a flat list: a nested list ([[...]]) would
# build MultiIndex columns, and selecting a single label would then return a
# one-column DataFrame instead of a Series.
# The index is taken from ccinfo_df so the scaled values line up row-for-row
# when they are written back below.
ccinfo_data_scaled = pd.DataFrame(
    ccinfo_data_scaled,
    columns=['limit_bal', 'bill_amt', 'pay_amt'],
    index=ccinfo_df.index,
)

# Replace the original data with the columns of information from the scaled data
ccinfo_df[['limit_bal', 'bill_amt', 'pay_amt']] = ccinfo_data_scaled[['limit_bal', 'bill_amt', 'pay_amt']]

# Review the DataFrame
ccinfo_df

Unnamed: 0,limit_bal,age,bill_amt,pay_amt,default,other,post-grad,primary,secondary,marriage_dummy
0,-1.117341,24,-0.660703,-0.542779,1,0,0,0,1,1
1,-0.349942,26,-0.636370,-0.463994,1,0,0,0,1,0
2,-0.580162,34,-0.416808,-0.354013,0,0,0,0,1,0
3,-0.887121,37,-0.080152,-0.402077,0,0,0,0,1,1
4,-0.887121,57,-0.396855,0.523771,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
4994,-1.117341,36,-0.392558,-0.422089,0,0,0,0,1,1
4995,0.110497,34,-0.589218,-0.152107,0,1,0,0,0,1
4996,0.263977,45,1.115249,-0.163492,1,0,0,0,1,1
4997,1.108115,44,3.338132,0.760455,0,0,1,0,0,1


## Elbow Method to find k

In [None]:
# Import the KMeans module from SKLearn


In [None]:
# Create a list to store inertia values and the values of k


In [None]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the ccinfo_df DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance


In [None]:
# Define a DataFrame to hold the values for k and the corresponding inertia


# Review the DataFrame


In [None]:
# Plot the DataFrame


## K-means algorithm to cluster the data

In [None]:
# Define the model with 3 clusters

# Fit the model

# Make predictions

# Create a copy of the preprocessed data

# Add a class column with the labels


In [None]:
# Plot the clusters
