In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics import calinski_harabasz_score
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Read the complaint data; later cells expect a
# 'consumer_complaint_narrative' text column in this frame.
data_path = '../data/clean_data.csv'
df = pd.read_csv(data_path)

In [None]:
# Replace null narratives with empty strings before vectorizing.
text_col = 'consumer_complaint_narrative'
df[text_col] = df[text_col].fillna('')

# sklearn's built-in English stop words plus terms added by hand for this corpus.
domain_stop_words = [
    *ENGLISH_STOP_WORDS,
    'credit', 'report', 'account', 'information',
    'consumer', 'reporting', 'told', 'called',
    'did', 'received', 'company',
]

# token_pattern: a token starts with a letter, has at least two characters,
# and is skipped when it is nothing but a run of two or more x's (or X's)
# -- presumably redaction masks in the narratives; confirm against the data.
tfidf_options = dict(
    stop_words=domain_stop_words,
    token_pattern=r'(?u)\b(?!x{2,}\b)(?!X{2,}\b)[A-Za-z]\w+',
    min_df=5,              # ignore terms found in fewer than 5 documents
    max_df=1.0,            # no upper document-frequency cut-off
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
    max_features=1000,     # cap the vocabulary at 1000 terms
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Sparse TF-IDF matrix with one row per complaint narrative.
x = vectorizer.fit_transform(df[text_col])

In [None]:
# Number of clusters -- try different values here and re-run the cells below.
k = 5

# Fixed random_state so repeated runs start from the same seed.
kmeans = KMeans(n_clusters=k, random_state=25)
labels = kmeans.fit(x).labels_

In [None]:
# Report clustering quality and the highest-weighted terms of each centroid.
# The cluster count is read from the fitted model rather than the global `k`,
# so this cell stays consistent with `labels` even if `k` was edited without
# refitting.
n_clusters = kmeans.n_clusters

# Column indices of each centroid, ordered from largest to smallest weight.
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

# Count members once; minlength guarantees one entry per cluster id even if
# the highest-numbered cluster has no members.
cluster_sizes = np.bincount(labels, minlength=n_clusters)

# Cluster evaluation metrics
sil_score = silhouette_score(x, labels)
print(f"Silhouette Score: {sil_score:.3f}")
print(f"Cluster sizes: {cluster_sizes}")

for i in range(n_clusters):
    print(f"\nCluster {i} top terms:")
    # Top 10 terms on one line, each followed by ", ".
    print(", ".join(terms[order_centroids[i, :10]]), end=", ")
    print(f"\nSize: {cluster_sizes[i]} complaints")



Cluster 0 top terms:
credit, report, information, xxxx, reporting, identity, accounts, theft, account, inquiries, 
Cluster 1 top terms:
consumer, xxxx, information, 15, reporting, report, consent, code, agency, reports, 
Cluster 2 top terms:
section, usc, 15, 1681, consumer, states, xxxx, reporting, agency, furnish, 
Cluster 3 top terms:
xxxx, xxxxxxxx, account, credit, report, information, reporting, accounts, balance, date, 
Cluster 4 top terms:
debt, xxxx, collection, company, credit, validation, account, proof, alleged, report, 
Cluster 5 top terms:
accounts, credit, report, inaccurate, duty, litigation, fraudulent, accordingly, deny, inaccuracies, 
Cluster 6 top terms:
xxxx, xxxxxxxx, credit, report, items, information, consumer, balance, account, reporting, 
Cluster 7 top terms:
xxxx, account, bank, card, payment, loan, xxxxxxxx, told, credit, called, 
Cluster 8 top terms:
letters, xxxx, complaint, filing, party, involved, im, uploaded, falsely, misleading, 
Cluster 9 top terms:

In [None]:
# Elbow method: fit K-Means for each candidate k and record its inertia.
k_range = range(2, 8)
inertias = [
    KMeans(n_clusters=k_test, random_state=25).fit(x).inertia_
    for k_test in k_range
]

# Inertia versus k; a bend in the curve suggests a reasonable cluster count.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_range, inertias, 'bo-')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Method for Optimal k')
ax.grid(True)
plt.show()

In [None]:
# Cluster interpretation: attach the labels to the frame, then show each
# cluster's share of the data together with up to two example narratives.
df['cluster'] = labels

print("=== CLUSTER INTERPRETATION ===")
for i in range(k):
    cluster_data = df[df['cluster'] == i]
    print(f"\nCluster {i} - Size: {len(cluster_data)} ({len(cluster_data)/len(df)*100:.1f}%)")

    # Drop null and whitespace-only narratives *before* taking the first two,
    # so blank rows cannot use up the sample slots.
    narratives = cluster_data['consumer_complaint_narrative'].dropna()
    samples = narratives[narratives.str.strip() != ''].head(2)
    for idx, complaint in enumerate(samples, 1):
        # Show only the first 100 characters of each narrative.
        print(f"  Sample {idx}: {complaint[:100]}...")

In [None]:
# Two-panel overview: cluster sizes (left) and a 2-D PCA projection (right).
fig, (ax_sizes, ax_pca) = plt.subplots(1, 2, figsize=(10, 4))

# Left panel: number of complaints per cluster.
cluster_sizes = np.bincount(labels)
cluster_ids = range(k)
ax_sizes.bar(cluster_ids, cluster_sizes)
ax_sizes.set_xlabel('Cluster')
ax_sizes.set_ylabel('Number of Complaints')
ax_sizes.set_title('Cluster Size Distribution')
ax_sizes.set_xticks(cluster_ids)

# Right panel: project the TF-IDF matrix onto two principal components,
# colouring each point by its cluster label.
# NOTE(review): x.toarray() builds a dense copy of x; memory use grows with
# rows x features.
pca = PCA(n_components=2, random_state=25)
x_pca = pca.fit_transform(x.toarray())
scatter = ax_pca.scatter(x_pca[:, 0], x_pca[:, 1], c=labels, cmap='viridis', alpha=0.6)
ax_pca.set_xlabel('PC1')
ax_pca.set_ylabel('PC2')
ax_pca.set_title('Clusters in PCA Space')
fig.colorbar(scatter, ax=ax_pca)

fig.tight_layout()
plt.show()

In [None]:
# Export cluster results: one row per complaint with its narrative and
# cluster label. The path is defined once so the file written and the
# message printed cannot drift apart.
output_path = '../data/cluster_results.csv'
df_clusters = df[['consumer_complaint_narrative', 'cluster']].copy()
df_clusters.to_csv(output_path, index=False)

# Short run summary.
print(f"Cluster results exported to {output_path}")
print(f"Total records: {len(df):,}")
print(f"Silhouette score: {sil_score:.3f}")
print(f"Clusters: {k}")