In [2]:
#Apply PCA on CC General data set
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the dataset
cc_data = pd.read_csv('CC GENERAL.csv')

# Drop the categorical columns and ID column if present
cc_data = cc_data.drop(['CUST_ID', 'TENURE'], axis=1)

# Fill any missing values with mean of respective column
cc_data = cc_data.fillna(cc_data.mean())

# Scale the data using StandardScaler
scaler = StandardScaler()
cc_scaled = scaler.fit_transform(cc_data)

# Initialize PCA model with 2 components
pca = PCA(n_components=2)

# Fit and transform the data using PCA
cc_pca = pca.fit_transform(cc_scaled)

# Print the explained variance ratio
print('Explained variance ratio:', pca.explained_variance_ratio_)

# Create a new dataframe with the transformed data
cc_pca_df = pd.DataFrame(data=cc_pca, columns=['PC1', 'PC2'])

# Print the transformed data
print('Transformed data:', cc_pca_df.head())

Explained variance ratio: [0.28845814 0.21570572]
Transformed data:         PC1       PC2
0 -1.718894 -1.072938
1 -1.169305  2.509318
2  0.938413 -0.382586
3 -0.907501  0.045869
4 -1.637832 -0.684977


In [3]:
#Calculate silhouette score without applying pca
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
cc_data = pd.read_csv('CC GENERAL.csv')

# Drop the categorical columns and ID column if present
cc_data = cc_data.drop(['CUST_ID', 'TENURE'], axis=1)

# Fill any missing values with mean of respective column
cc_data = cc_data.fillna(cc_data.mean())

# Scale the data using StandardScaler
scaler = StandardScaler()
cc_scaled = scaler.fit_transform(cc_data)

# Initialize k-means model with 2 clusters
kmeans = KMeans(n_clusters=2)

# Fit the k-means model on the scaled data
kmeans.fit(cc_scaled)

# Calculate the silhouette score of the clustered data
silhouette_avg = silhouette_score(cc_scaled, kmeans.labels_)

# Print the silhouette score
print('Silhouette score:', silhouette_avg)



Silhouette score: 0.22588997653013274


In [4]:
#Calculate silhouette score applying pca
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore")

# Load the dataset
cc_data = pd.read_csv('CC GENERAL.csv')

# Drop the categorical columns and ID column if present
cc_data = cc_data.drop(['CUST_ID', 'TENURE'], axis=1)

# Fill any missing values with mean of respective column
cc_data = cc_data.fillna(cc_data.mean())

# Scale the data using StandardScaler
scaler = StandardScaler()
cc_scaled = scaler.fit_transform(cc_data)

# Initialize PCA model with 2 components
pca = PCA(n_components=2)

# Fit and transform the data using PCA
cc_pca = pca.fit_transform(cc_scaled)

# Initialize k-means model with 2 clusters
kmeans = KMeans(n_clusters=2)

# Fit the k-means model on the PCA transformed data
kmeans.fit(cc_pca)

# Calculate the silhouette score of the clustered data
silhouette_avg = silhouette_score(cc_pca, kmeans.labels_)

# Print the silhouette score
print('Silhouette score:', silhouette_avg)

Silhouette score: 0.46720661083892595


In [5]:
#Perform Scaling+PCA+K-Means and report performance with 2 clusters in kmeans
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
cc_data = pd.read_csv('CC GENERAL.csv')

# Drop the categorical columns and ID column if present
cc_data = cc_data.drop(['CUST_ID', 'TENURE'], axis=1)

# Fill any missing values with mean of respective column
cc_data = cc_data.fillna(cc_data.mean())

# Scale the data using StandardScaler
scaler = StandardScaler()
cc_scaled = scaler.fit_transform(cc_data)

# Apply PCA
pca = PCA(n_components=2)
cc_pca = pca.fit_transform(cc_scaled)

# Initialize k-means model with 2 clusters
kmeans = KMeans(n_clusters=2)

# Fit the k-means model on the PCA data
kmeans.fit(cc_pca)

# Calculate the silhouette score of the clustered data
silhouette_avg = silhouette_score(cc_pca, kmeans.labels_)

# Print the silhouette score
print('Silhouette score:', silhouette_avg)

Silhouette score: 0.46357928828763445


In [6]:
#Perform Scaling+PCA+K-Means and report performance with 3 clusters in kmeans
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Load the dataset
cc_data = pd.read_csv('CC GENERAL.csv')

# Drop the categorical columns and ID column if present
cc_data = cc_data.drop(['CUST_ID', 'TENURE'], axis=1)

# Fill any missing values with mean of respective column
cc_data = cc_data.fillna(cc_data.mean())

# Scale the data using StandardScaler
scaler = StandardScaler()
cc_scaled = scaler.fit_transform(cc_data)

# Apply PCA
pca = PCA(n_components=2)
cc_pca = pca.fit_transform(cc_scaled)

# Initialize k-means model with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42)

# Fit the k-means model on the PCA data
kmeans.fit(cc_pca)

# Calculate the silhouette score of the clustered data
silhouette_avg = silhouette_score(cc_pca, kmeans.labels_)

# Print the silhouette score
print('Silhouette score:', silhouette_avg)

Silhouette score: 0.4533204013901623
