In [None]:
# Week 8: Unsupervised Learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib's bundled seaborn-v0_8 style sheet plus seaborn's "husl" palette
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("‚úÖ Week 8 Unsupervised Learning Environment Ready!")

# Read the cleaned dataset; the path is relative to the notebook's working directory
df = pd.read_csv('titanic_cleaned.csv')

# Report the size of the frame and the names of its columns
print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nColumns available:")
print(list(df.columns))

In [None]:
print("=== DATASET OVERVIEW FOR UNSUPERVISED LEARNING ===")

# Sample rows rendered with the notebook's rich display
print("First 5 rows:")
display(df.head())

# Per-column dtype and null-count summaries, printed under their headings
for heading, summary in (("\nData types:", df.dtypes),
                         ("\nMissing values:", df.isnull().sum())):
    print(heading)
    print(summary)

# Collect the names of the numeric columns (candidates for clustering features)
numerical_features = list(df.select_dtypes(include=[np.number]).columns)
print(f"\nNumerical features available: {numerical_features}")

In [None]:
print("=== PREPARING DATA FOR UNSUPERVISED LEARNING ===")

# Columns describing passenger characteristics that feed the clustering
features_for_clustering = ['Age', 'Fare', 'SibSp', 'Parch', 'Pclass']
print(f"Selected features for clustering: {features_for_clustering}")

# Independent copy of the selected columns, with gaps filled by each column's median
X = df.loc[:, features_for_clustering].copy()
column_medians = X.median()
X = X.fillna(column_medians)

print(f"Feature matrix shape: {X.shape}")
print(f"Missing values after cleaning: {X.isnull().sum().sum()}")

# Put every feature on a comparable scale before distance-based clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("‚úÖ Features standardized (mean=0, std=1)")

# Wrap the scaled array in a labelled frame so the summary shows column names
X_scaled_df = pd.DataFrame(X_scaled, columns=features_for_clustering)
scaled_summary = X_scaled_df.describe().round(2)
print("\nScaled features statistics:")
print(scaled_summary)

In [None]:
print("=== FEATURE RELATIONSHIPS ===")

# Pairwise relationships between the (unscaled) clustering features.
# sns.pairplot creates its own figure, so no plt.figure() call is made first:
# a preceding plt.figure() would only leave an empty extra figure in the output.
sns.pairplot(X, diag_kind='hist', corner=True)
# The pairplot's figure is the current figure here, so the title attaches to it
plt.suptitle('Feature Relationships for Clustering', y=1.02, fontsize=16, fontweight='bold')
plt.show()

# Correlation heatmap of the same features
plt.figure(figsize=(8, 6))
correlation_matrix = X.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, fmt='.2f')
plt.title('Feature Correlation Matrix', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.show()

print("üîç Initial Observations:")
print("‚Ä¢ Some features show clear relationships (e.g., Pclass vs Fare)")
print("‚Ä¢ Potential clusters may exist based on passenger profiles")
print("‚Ä¢ Standardization will help with clustering algorithm performance")

In [None]:
print("=== FINDING OPTIMAL NUMBER OF CLUSTERS ===")

# Candidate cluster counts and the two diagnostics collected for each of them
range_k = range(2, 10)
inertia, silhouette_scores = [], []

# Fit one K-Means model per candidate k on the scaled features
for k in range_k:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = kmeans.fit_predict(X_scaled)

    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, cluster_labels))

# Side-by-side diagnostics: elbow curve (left) and silhouette scores (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
diagnostic_panels = (
    (ax1, inertia, 'bo-', 'Inertia (Within-cluster sum of squares)', 'Elbow Method for Optimal k'),
    (ax2, silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Analysis for Optimal k'),
)
for panel, values, line_format, y_label, panel_title in diagnostic_panels:
    panel.plot(range_k, values, line_format, linewidth=2, markersize=8)
    panel.set_xlabel('Number of Clusters (k)')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, fontweight='bold')
    panel.grid(alpha=0.3)

plt.tight_layout()
plt.show()

# Pick the k with the highest silhouette score (can be overridden after reading the plots)
best_position = np.argmax(silhouette_scores)
optimal_k = range_k[best_position]
print(f"üéØ Recommended number of clusters: {optimal_k}")
print(f"   Based on highest silhouette score: {max(silhouette_scores):.3f}")

print("\nüîç INTERPRETATION GUIDE:")
print("‚Ä¢ Elbow Method: Look for 'elbow' where inertia stops decreasing rapidly")
print("‚Ä¢ Silhouette Score: Higher values indicate better-defined clusters")
print("‚Ä¢ Choose k that balances cluster quality and interpretability")

In [None]:
print("=== K-MEANS CLUSTERING IMPLEMENTATION ===")

# Final model: refit K-Means on the scaled features using the chosen k
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_scaled)

# Attach the labels to copies of the full dataset and of the feature matrix
df_clustered = df.copy()
X_clustered = X.copy()
for labelled_frame in (df_clustered, X_clustered):
    labelled_frame['Cluster'] = cluster_labels

print(f"‚úÖ K-Means clustering completed with {optimal_k} clusters")
print(f"Cluster distribution:")

# Passenger count and percentage share per cluster, ordered by cluster id
cluster_counts = df_clustered['Cluster'].value_counts().sort_index()
total_passengers = len(df_clustered)
for cluster, count in cluster_counts.items():
    share = count / total_passengers * 100
    print(f"  Cluster {cluster}: {count} passengers ({share:.1f}%)")

# Overall silhouette score of the final clustering
sil_score = silhouette_score(X_scaled, cluster_labels)
print(f"Silhouette Score: {sil_score:.3f}")

# Translate the score into a coarse verbal rating using the 0.5 / 0.25 thresholds
if sil_score > 0.5:
    separation_rating = 'Good separation'
elif sil_score > 0.25:
    separation_rating = 'Moderate separation'
else:
    separation_rating = 'Poor separation'

print("\nüîç SILHOUETTE SCORE INTERPRETATION:")
print("‚Ä¢ +1: Perfectly separated clusters")
print("‚Ä¢  0: Overlapping clusters")
print("‚Ä¢ -1: Completely wrong assignments")
print(f"‚Ä¢ Our score ({sil_score:.3f}): {separation_rating}")

In [None]:
print("=== CLUSTER CHARACTERISTICS ANALYSIS ===")

# Per-cluster mean and standard deviation of every clustering feature
grouped_by_cluster = X_clustered.groupby('Cluster')
cluster_means = grouped_by_cluster.mean()
cluster_std = grouped_by_cluster.std()

print("üìä CLUSTER PROFILES (Mean Values):")
display(cluster_means.round(2))

print("\nüìä CLUSTER VARIABILITY (Standard Deviation):")
display(cluster_std.round(2))

# One boxplot per feature on a 2x3 grid, filled in row-major order
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
features_to_plot = features_for_clustering
panels = axes.flatten()

for i, feature in enumerate(features_to_plot):
    panel = panels[i]
    sns.boxplot(data=X_clustered, x='Cluster', y=feature, ax=panel, palette='Set2')
    panel.set_title(f'{feature} Distribution by Cluster', fontweight='bold')
    panel.set_xlabel('Cluster')
    panel.set_ylabel(feature)
    panel.grid(alpha=0.3)

# Drop any grid cells that did not receive a feature
for i in range(len(features_to_plot), 6):
    fig.delaxes(panels[i])

plt.tight_layout()
plt.show()

print("üîç CLUSTER INTERPRETATION:")
print("Look for patterns in the cluster profiles:")
print("‚Ä¢ Which features distinguish the clusters?")
print("‚Ä¢ Do clusters represent meaningful passenger segments?")
print("‚Ä¢ Are there clear patterns in age, fare, family size, or class?")

In [None]:
print("=== PRINCIPAL COMPONENT ANALYSIS (PCA) ===")

# Fit PCA with all components on the scaled features and project the data
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Loadings table: one row per original feature, one column per principal component
pca_components = pd.DataFrame(
    pca.components_.T,
    columns=[f'PC{i+1}' for i in range(pca.n_components_)],
    index=features_for_clustering
)

print("üìä PCA COMPONENTS (Feature Contributions):")
display(pca_components.round(3))

# Per-component and running-total share of variance explained
explained_variance = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(explained_variance)

print(f"\nüìà EXPLAINED VARIANCE:")
for i, (var, cum_var) in enumerate(zip(explained_variance, cumulative_variance)):
    print(f"PC{i+1}: {var:.3f} ({var*100:.1f}%) - Cumulative: {cum_var:.3f} ({cum_var*100:.1f}%)")

# Plot explained variance
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
component_numbers = range(1, len(explained_variance) + 1)

# Scree plot. Each artist carries its own label so the legend does not depend
# on the order in which the bars and the line were drawn.
ax1.bar(component_numbers, explained_variance, alpha=0.7, color='skyblue',
        label='Individual Variance')
ax1.plot(component_numbers, cumulative_variance, 'ro-', linewidth=2,
         label='Cumulative Variance')
ax1.set_xlabel('Principal Component')
ax1.set_ylabel('Explained Variance Ratio')
ax1.set_title('PCA Scree Plot', fontweight='bold')
ax1.grid(alpha=0.3)
ax1.legend()

# Cumulative variance with 95% / 90% / 80% reference lines
ax2.plot(component_numbers, cumulative_variance, 'bo-', linewidth=2, markersize=8)
ax2.axhline(y=0.95, color='red', linestyle='--', label='95% Variance')
ax2.axhline(y=0.90, color='orange', linestyle='--', label='90% Variance')
ax2.axhline(y=0.80, color='green', linestyle='--', label='80% Variance')
ax2.set_xlabel('Number of Principal Components')
ax2.set_ylabel('Cumulative Explained Variance')
ax2.set_title('Cumulative Explained Variance', fontweight='bold')
ax2.grid(alpha=0.3)
ax2.legend()

plt.tight_layout()
plt.show()

print("üîç PCA INTERPRETATION:")
print("‚Ä¢ PC1, PC2 capture the most important patterns in the data")
print("‚Ä¢ Components show which features contribute most to each direction")
print("‚Ä¢ We can reduce dimensions while preserving most information")

In [None]:
print("=== PCA BIPLOT VISUALIZATION ===")

def pca_biplot(components, features, scale=1.5, scores=None, labels=None, variance_ratio=None):
    """Draw a PCA biplot: observations on PC1/PC2 with one loading arrow per feature.

    Parameters
    ----------
    components : 2D array, indexed as components[feature_index, component_index]
        Loadings; columns 0 and 1 give each arrow's x and y direction.
    features : sequence of str
        Feature names, in the same order as the rows of `components`.
    scale : float, default 1.5
        Multiplier applied to the arrow lengths (labels sit 15% beyond the tips).
    scores : 2D array, optional
        Projected observations; columns 0 and 1 are plotted.
        Defaults to the notebook-level `X_pca`.
    labels : array-like, optional
        Per-observation values used to colour the points.
        Defaults to the notebook-level `cluster_labels`.
    variance_ratio : sequence of float, optional
        Explained-variance ratios; entries 0 and 1 are shown in the axis labels.
        Defaults to the notebook-level `explained_variance`.

    Returns
    -------
    None. The figure is shown with plt.show().
    """
    # Fall back to the notebook globals so existing two-argument calls keep working
    if scores is None:
        scores = X_pca
    if labels is None:
        labels = cluster_labels
    if variance_ratio is None:
        variance_ratio = explained_variance

    fig, ax = plt.subplots(figsize=(10, 8))

    # Observations in the plane of the first two components, coloured by label
    scatter = ax.scatter(scores[:, 0], scores[:, 1], c=labels,
                         cmap='viridis', alpha=0.6, s=50)

    # One arrow from the origin per feature, with its name just past the tip
    for i, feature in enumerate(features):
        tip_x = components[i, 0] * scale
        tip_y = components[i, 1] * scale
        ax.arrow(0, 0, tip_x, tip_y, color='red', alpha=0.7, head_width=0.05)
        ax.text(tip_x * 1.15, tip_y * 1.15, feature,
                color='red', ha='center', va='center', fontweight='bold')

    ax.set_xlabel(f'PC1 ({variance_ratio[0]*100:.1f}% Variance)')
    ax.set_ylabel(f'PC2 ({variance_ratio[1]*100:.1f}% Variance)')
    ax.set_title('PCA Biplot - Clusters with Feature Directions', fontweight='bold', fontsize=14)
    fig.colorbar(scatter, ax=ax, label='Cluster')
    ax.grid(alpha=0.3)
    # Faint axes through the origin, where all arrows start
    ax.axhline(y=0, color='k', linestyle='-', alpha=0.3)
    ax.axvline(x=0, color='k', linestyle='-', alpha=0.3)
    plt.show()

# Create biplot from the full PCA fit (loadings transposed to feature-by-component)
pca_biplot(pca.components_.T, features_for_clustering)

print("üîç BIPLOT INTERPRETATION:")
print("‚Ä¢ Points: Individual passengers projected onto PC1 and PC2")
print("‚Ä¢ Arrows: Direction and importance of original features")
print("‚Ä¢ Colors: Cluster assignments from K-Means")
print("‚Ä¢ Interpretation: Features pointing in similar directions are correlated")

In [None]:
print("=== ASSIGNMENT 8: 2D CLUSTER VISUALIZATION WITH PCA ===")

# Use first two principal components for 2D visualization
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# Create DataFrame for plotting
pca_df = pd.DataFrame({
    'PC1': X_pca_2d[:, 0],
    'PC2': X_pca_2d[:, 1],
    'Cluster': cluster_labels
})

# Add original (unscaled) features for interpretation
for feature in features_for_clustering:
    pca_df[feature] = X[feature].values

print(f"‚úÖ 2D PCA transformation completed")
print(f"Explained variance by PC1 and PC2: {pca_2d.explained_variance_ratio_.sum()*100:.1f}%")

# Create comprehensive visualization
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Plot 1: Cluster visualization in PCA space
scatter = axes[0, 0].scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster'],
                            cmap='viridis', alpha=0.7, s=50)
axes[0, 0].set_xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]*100:.1f}% Variance)')
axes[0, 0].set_ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]*100:.1f}% Variance)')
axes[0, 0].set_title('Passenger Clusters in 2D PCA Space', fontweight='bold')
plt.colorbar(scatter, ax=axes[0, 0], label='Cluster')

# Plot 2: Cluster centers in PCA space.
# The K-Means centers live in the scaled feature space, so they are projected
# with the same PCA that was fitted on X_scaled.
cluster_centers_pca = pca_2d.transform(kmeans.cluster_centers_)

# Passengers go down first as a faint backdrop ...
axes[0, 1].scatter(pca_df['PC1'], pca_df['PC2'], c=pca_df['Cluster'],
                  cmap='viridis', alpha=0.3, s=30)
# ... then the centers on top (explicit zorder) so the points cannot hide them.
# A single scatter call gives one legend entry that stands for every center.
axes[0, 1].scatter(cluster_centers_pca[:, 0], cluster_centers_pca[:, 1],
                  marker='X', s=200, color='red', zorder=3, label='Cluster centers')
for i, center in enumerate(cluster_centers_pca):
    axes[0, 1].text(center[0] + 0.1, center[1] + 0.1, f'Center {i}',
                   fontweight='bold', fontsize=10)

axes[0, 1].set_xlabel('PC1')
axes[0, 1].set_ylabel('PC2')
axes[0, 1].set_title('Cluster Centers in PCA Space', fontweight='bold')
axes[0, 1].legend()

# Plot 3: Feature contribution to PCs
pc_loadings = pd.DataFrame(
    pca_2d.components_.T,
    columns=['PC1', 'PC2'],
    index=features_for_clustering
)

pc_loadings.plot(kind='bar', ax=axes[1, 0], color=['skyblue', 'lightcoral'])
axes[1, 0].set_title('Feature Contributions to Principal Components', fontweight='bold')
axes[1, 0].set_ylabel('Loading')
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 0].grid(alpha=0.3)
axes[1, 0].legend()

# Plot 4: Cluster sizes
cluster_sizes = pca_df['Cluster'].value_counts().sort_index()
axes[1, 1].bar(cluster_sizes.index, cluster_sizes.values, color='lightgreen', alpha=0.7)
axes[1, 1].set_xticks(cluster_sizes.index)  # one tick per cluster id, no fractional ticks
axes[1, 1].set_xlabel('Cluster')
axes[1, 1].set_ylabel('Number of Passengers')
axes[1, 1].set_title('Cluster Sizes', fontweight='bold')
axes[1, 1].grid(alpha=0.3)

# Add count labels on bars, positioned by cluster id (the bars' x coordinate)
# rather than by enumeration order
for cluster_id, count in cluster_sizes.items():
    axes[1, 1].text(cluster_id, count + 5, str(count), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
print("=== CLUSTER INTERPRETATION AND BUSINESS INSIGHTS ===")

# Per-cluster mean/std table for every feature (not printed by this cell)
cluster_profiles = X_clustered.groupby('Cluster').agg(['mean', 'std']).round(2)

print("üìä DETAILED CLUSTER PROFILES:")
for cluster in range(optimal_k):
    cluster_data = X_clustered[X_clustered['Cluster'] == cluster]

    print(f"\nüéØ CLUSTER {cluster} (n={len(cluster_data)}):")

    # Summary statistics driving the interpretation below
    age_mean = cluster_data['Age'].mean()
    fare_mean = cluster_data['Fare'].mean()
    pclass_mean = cluster_data['Pclass'].mean()
    # Family size here is siblings/spouses plus parents/children aboard
    family_size_mean = (cluster_data['SibSp'] + cluster_data['Parch']).mean()

    print(f"   Average Age: {age_mean:.1f} years")
    print(f"   Average Fare: ${fare_mean:.1f}")
    print(f"   Average Class: {pclass_mean:.1f} (1st=1, 2nd=2, 3rd=3)")
    print(f"   Average Family Size: {family_size_mean:.1f}")

    # Rule-based label; rules are checked in order and the first match wins
    if pclass_mean < 2 and fare_mean > 50:
        print("   üí° Interpretation: Affluent passengers")
    elif pclass_mean > 2 and fare_mean < 20:
        print("   üí° Interpretation: Economy class passengers")
    elif age_mean < 25:
        print("   üí° Interpretation: Younger passengers")
    elif family_size_mean > 2:
        print("   üí° Interpretation: Family travelers")
    else:
        print("   üí° Interpretation: Mixed characteristics")

# Compare with survival rates
print(f"\nüìà CLUSTER SURVIVAL ANALYSIS:")
cluster_survival = df_clustered.groupby('Cluster')['Survived'].agg(['mean', 'count'])
cluster_survival['survival_rate'] = (cluster_survival['mean'] * 100).round(1)
cluster_survival = cluster_survival.sort_values('survival_rate', ascending=False)

for cluster, row in cluster_survival.iterrows():
    # iterrows() returns each row as a single Series, which upcasts the integer
    # count to float alongside the float columns; cast back so it prints as
    # "216" rather than "216.0".
    passenger_count = int(row['count'])
    print(f"   Cluster {cluster}: {row['survival_rate']}% survival ({passenger_count} passengers)")

print(f"\nüîç KEY INSIGHTS:")
print("‚Ä¢ Clusters reveal natural passenger segments")
print("‚Ä¢ Survival rates vary significantly between clusters")
print("‚Ä¢ PCA visualization shows clear separation of some clusters")
print("‚Ä¢ Feature patterns align with known Titanic survival factors")

In [None]:
print("=== ADVANCED CLUSTERING ANALYSIS ===")

# Compare different numbers of clusters
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
k_values = [2, 3, 4, 5]

# The 2D projection depends only on X_scaled, not on k, so PCA is fitted once
# here instead of being refitted identically for every panel.
pca_temp = PCA(n_components=2)
X_pca_temp = pca_temp.fit_transform(X_scaled)

# axes.flat walks the grid row by row, giving one panel per k value
for ax, k in zip(axes.flat, k_values):
    # Apply K-Means for this k
    kmeans_temp = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_temp = kmeans_temp.fit_predict(X_scaled)
    # Silhouette is computed on the scaled features, not on the 2D projection
    silhouette_temp = silhouette_score(X_scaled, labels_temp)

    # Plot the shared projection coloured by this k's labels
    ax.scatter(X_pca_temp[:, 0], X_pca_temp[:, 1],
               c=labels_temp, cmap='viridis', alpha=0.7, s=30)
    ax.set_title(f'K={k} Clusters\n(Silhouette: {silhouette_temp:.3f})',
                 fontweight='bold')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

print("üîç CLUSTER COUNT COMPARISON:")
print("Different numbers of clusters reveal different patterns:")
print("‚Ä¢ K=2: Broad segmentation (e.g., wealthy vs economy)")
print("‚Ä¢ K=3: More nuanced passenger types")
print("‚Ä¢ K=4-5: Fine-grained segments, may capture special cases")
print("‚Ä¢ Choose based on business needs and cluster interpretability")

In [None]:
# Text report summarising the notebook's results. It reads values computed in
# earlier cells: features_for_clustering, optimal_k, sil_score, pca_2d, cluster_counts.
print("=" * 70)
print("üìä WEEK 8 ASSIGNMENT REPORT: UNSUPERVISED LEARNING")
print("=" * 70)

# Convert the value_counts Series with .to_dict() so the sizes are built-in ints.
# dict(Series) keeps NumPy scalars, which print as "np.int64(...)" on NumPy 2.
cluster_size_summary = cluster_counts.to_dict()

# Each entry is (section heading, lines printed beneath it), in display order
report_sections = [
    (f"\nüéØ PROJECT GOALS:", [
        "‚Ä¢ Discover natural passenger segments using clustering",
        "‚Ä¢ Visualize high-dimensional data in 2D using PCA",
        "‚Ä¢ Interpret clusters in business context",
    ]),
    (f"\nüìä METHODOLOGY:", [
        f"‚Ä¢ Features used: {', '.join(features_for_clustering)}",
        f"‚Ä¢ Clustering algorithm: K-Means with k={optimal_k}",
        f"‚Ä¢ Dimensionality reduction: PCA (2 components)",
        f"‚Ä¢ Data preprocessing: Standardization (mean=0, std=1)",
    ]),
    (f"\nüìà RESULTS:", [
        f"‚Ä¢ Optimal clusters: {optimal_k} (based on silhouette analysis)",
        f"‚Ä¢ Cluster quality: Silhouette score = {sil_score:.3f}",
        f"‚Ä¢ PCA effectiveness: {pca_2d.explained_variance_ratio_.sum()*100:.1f}% variance captured in 2D",
        f"‚Ä¢ Cluster sizes: {cluster_size_summary}",
    ]),
    (f"\nüîç KEY FINDINGS:", [
        "1. Clear passenger segments emerged from the data",
        "2. PCA successfully visualized clusters in 2D space",
        "3. Features like Fare and Pclass strongly influence clustering",
        "4. Survival rates vary significantly between clusters",
    ]),
    (f"\nüí° BUSINESS INSIGHTS:", [
        "‚Ä¢ Passenger base can be segmented into meaningful groups",
        "‚Ä¢ Different segments had different survival probabilities",
        "‚Ä¢ Clustering reveals patterns not obvious in supervised analysis",
        "‚Ä¢ Useful for understanding passenger demographics and behavior",
    ]),
    (f"\nüöÄ RECOMMENDATIONS:", [
        "1. Use clusters for targeted historical analysis",
        "2. Consider cluster membership as features in supervised models",
        "3. Explore other clustering algorithms (DBSCAN, Hierarchical)",
        "4. Apply similar analysis to other historical datasets",
    ]),
]

for section_heading, section_lines in report_sections:
    print(section_heading)
    for section_line in section_lines:
        print(section_line)

In [None]:
import json
import joblib   # used below to persist the fitted models

# Top-level summary of the clustering run; per-cluster details are filled in next
clustering_results = {
    'optimal_k': optimal_k,
    'silhouette_score': sil_score,
    'cluster_distribution': cluster_counts.to_dict(),
    'features_used': features_for_clustering,
    'pca_variance_explained': pca_2d.explained_variance_ratio_.sum(),
    'cluster_interpretation': {}
}

# One summary entry per cluster id, keyed as 'cluster_<id>'
cluster_summaries = clustering_results['cluster_interpretation']
for cluster in range(optimal_k):
    cluster_data = X_clustered[X_clustered['Cluster'] == cluster]
    # Survival comes from the full dataset, since the feature matrix has no 'Survived' column
    cluster_passengers = df_clustered[df_clustered['Cluster'] == cluster]
    family_size = cluster_data['SibSp'] + cluster_data['Parch']

    cluster_summaries[f'cluster_{cluster}'] = {
        'size': len(cluster_data),
        'age_mean': cluster_data['Age'].mean(),
        'fare_mean': cluster_data['Fare'].mean(),
        'pclass_mean': cluster_data['Pclass'].mean(),
        'family_size_mean': family_size.mean(),
        'survival_rate': cluster_passengers['Survived'].mean()
    }

# Write the summary as JSON and the labelled dataset as CSV
with open('clustering_results.json', 'w') as f:
    json.dump(clustering_results, f, indent=2)

df_clustered.to_csv('titanic_clustered.csv', index=False)

# Persist the fitted estimators so they can be reloaded later
for fitted_object, output_name in ((kmeans, 'kmeans_cluster_model.pkl'),
                                   (pca_2d, 'pca_model.pkl'),
                                   (scaler, 'feature_scaler.pkl')):
    joblib.dump(fitted_object, output_name)

print("üíæ RESULTS AND MODELS SAVED:")
for saved_item in (" - 'clustering_results.json' (comprehensive results)",
                   " - 'titanic_clustered.csv' (dataset with cluster labels)",
                   " - 'kmeans_cluster_model.pkl' (trained clustering model)",
                   " - 'pca_model.pkl' (trained PCA model)",
                   " - 'feature_scaler.pkl' (feature scaler)"):
    print(saved_item)
print(f"\nüìÅ Save this notebook as 'week8_unsupervised_learning.ipynb'")
print("üöÄ Upload to GitHub to complete Week 8!")
