In [3]:
# load_crime_data.py
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('crime_data.csv', index_col=0)  # Set the first column (state names) as index

# Display basic information about the dataset
print("Dataset Info:")
print(data.info())
print("\nFirst 5 rows of the dataset:")
print(data.head())

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())

# Basic statistics of the dataset
print("\nDataset Statistics:")
print(data.describe())

# Standardize the features (important for clustering)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
scaled_df = pd.DataFrame(scaled_data, columns=data.columns, index=data.index)

# Display first 5 rows of standardized data
print("\nFirst 5 rows of Standardized Data:")
print(scaled_df.head())

# Save standardized data for further use
scaled_df.to_csv('scaled_crime_data.csv')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
Index: 50 entries, Alabama to Wyoming
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Murder    50 non-null     float64
 1   Assault   50 non-null     int64  
 2   UrbanPop  50 non-null     int64  
 3   Rape      50 non-null     float64
dtypes: float64(2), int64(2)
memory usage: 2.0+ KB
None

First 5 rows of the dataset:
            Murder  Assault  UrbanPop  Rape
Alabama       13.2      236        58  21.2
Alaska        10.0      263        48  44.5
Arizona        8.1      294        80  31.0
Arkansas       8.8      190        50  19.5
California     9.0      276        91  40.6

Missing Values:
Murder      0
Assault     0
UrbanPop    0
Rape        0
dtype: int64

Dataset Statistics:
         Murder     Assault   UrbanPop       Rape
count  50.00000   50.000000  50.000000  50.000000
mean    7.78800  170.760000  65.540000  21.232000
std     4.35551   83.337661  14.47476

In [4]:
# kmeans_clustering.py
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the standardized data
scaled_df = pd.read_csv('scaled_crime_data.csv', index_col=0)

# Elbow method to find optimal number of clusters
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_df)
    wcss.append(kmeans.inertia_)

# Plot the elbow curve
plt.figure(figsize=(8, 5))
plt.plot(range(1, 11), wcss, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.grid(True)
plt.savefig('elbow_plot.png')
plt.close()

# Perform K-means clustering with chosen k (e.g., k=4, to be confirmed after elbow plot)
k_optimal = 4  # Adjust based on elbow plot inspection
kmeans = KMeans(n_clusters=k_optimal, random_state=42)
cluster_labels = kmeans.fit_predict(scaled_df)

# Add cluster labels to the original and standardized data
data = pd.read_csv('crime_data.csv', index_col=0)
data['Cluster'] = cluster_labels
scaled_df['Cluster'] = cluster_labels

# Save the clustered data
data.to_csv('crime_data_kmeans.csv')
scaled_df.to_csv('scaled_crime_data_kmeans.csv')

# Display cluster counts
print("Number of states in each cluster:")
print(data['Cluster'].value_counts())

# Display first 5 rows of clustered data
print("\nFirst 5 rows of Clustered Data:")
print(data.head())

# Display mean values for each cluster
print("\nCluster Characteristics (Mean Values):")
cluster_means = data.groupby('Cluster').mean()
print(cluster_means)

Number of states in each cluster:
Cluster
3    17
0    13
2    12
1     8
Name: count, dtype: int64

First 5 rows of Clustered Data:
            Murder  Assault  UrbanPop  Rape  Cluster
Alabama       13.2      236        58  21.2        1
Alaska        10.0      263        48  44.5        2
Arizona        8.1      294        80  31.0        2
Arkansas       8.8      190        50  19.5        1
California     9.0      276        91  40.6        2

Cluster Characteristics (Mean Values):
            Murder     Assault   UrbanPop       Rape
Cluster                                             
0         3.600000   78.538462  52.076923  12.176923
1        13.937500  243.625000  53.750000  21.412500
2        10.966667  264.000000  76.500000  33.608333
3         5.852941  141.176471  73.647059  19.335294


In [5]:
# hierarchical_clustering.py
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt

# Load the standardized data
scaled_df = pd.read_csv('scaled_crime_data.csv', index_col=0)

# Perform hierarchical clustering (linkage) for the dendrogram
Z = linkage(scaled_df, method='ward', metric='euclidean')

# Plot the dendrogram
plt.figure(figsize=(10, 6))
dendrogram(Z, labels=scaled_df.index, leaf_rotation=90)
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('States')
plt.ylabel('Euclidean Distance')
plt.tight_layout()
plt.savefig('dendrogram.png')
plt.close()

# Perform Hierarchical clustering with chosen number of clusters (e.g., k=4, to be confirmed after dendrogram)
k_optimal = 4  # Adjust based on dendrogram inspection
hierarchical = AgglomerativeClustering(n_clusters=k_optimal, linkage='ward', metric='euclidean')
cluster_labels = hierarchical.fit_predict(scaled_df)

# Add cluster labels to the original and standardized data
data = pd.read_csv('crime_data.csv', index_col=0)
data['Cluster'] = cluster_labels
scaled_df['Cluster'] = cluster_labels

# Save the clustered data
data.to_csv('crime_data_hierarchical.csv')
scaled_df.to_csv('scaled_crime_data_hierarchical.csv')

# Display cluster counts
print("Number of states in each cluster:")
print(data['Cluster'].value_counts())

# Display first 5 rows of clustered data
print("\nFirst 5 rows of Clustered Data:")
print(data.head())

# Display mean values for each cluster
print("\nCluster Characteristics (Mean Values):")
cluster_means = data.groupby('Cluster').mean()
print(cluster_means)

Number of states in each cluster:
Cluster
0    19
1    12
2    12
3     7
Name: count, dtype: int64

First 5 rows of Clustered Data:
            Murder  Assault  UrbanPop  Rape  Cluster
Alabama       13.2      236        58  21.2        3
Alaska        10.0      263        48  44.5        1
Arizona        8.1      294        80  31.0        1
Arkansas       8.8      190        50  19.5        0
California     9.0      276        91  40.6        1

Cluster Characteristics (Mean Values):
            Murder     Assault   UrbanPop       Rape
Cluster                                             
0         6.210526  142.052632  71.263158  19.184211
1        10.966667  264.000000  76.500000  33.608333
2         3.091667   76.000000  52.083333  11.833333
3        14.671429  251.285714  54.285714  21.685714


In [6]:
# compare_clustering.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import adjusted_rand_score

# Load the clustered data from both methods
kmeans_data = pd.read_csv('crime_data_kmeans.csv', index_col=0)
hierarchical_data = pd.read_csv('crime_data_hierarchical.csv', index_col=0)

# Rename cluster columns for clarity
kmeans_data = kmeans_data.rename(columns={'Cluster': 'Kmeans_Cluster'})
hierarchical_data = hierarchical_data.rename(columns={'Cluster': 'Hierarchical_Cluster'})

# Merge the data to compare cluster assignments
combined_data = kmeans_data[['Kmeans_Cluster']].join(hierarchical_data[['Hierarchical_Cluster']])

# Compute the Adjusted Rand Index to measure similarity between clusterings
ari_score = adjusted_rand_score(combined_data['Kmeans_Cluster'], combined_data['Hierarchical_Cluster'])
print("Adjusted Rand Index (ARI) between K-means and Hierarchical clustering:", ari_score)

# Display a cross-tabulation of cluster assignments
print("\nCross-tabulation of K-means vs. Hierarchical Clusters:")
print(pd.crosstab(combined_data['Kmeans_Cluster'], combined_data['Hierarchical_Cluster']))

# Load mean values for K-means clusters
kmeans_means = kmeans_data.groupby('Kmeans_Cluster').mean()
print("\nK-means Cluster Characteristics (Mean Values):")
print(kmeans_means)

# Load mean values for Hierarchical clusters
hierarchical_means = hierarchical_data.groupby('Hierarchical_Cluster').mean()
print("\nHierarchical Cluster Characteristics (Mean Values):")
print(hierarchical_means)

# Create scatter plots for visualization (Murder vs. Assault, colored by cluster)
plt.figure(figsize=(12, 5))

# K-means scatter plot
plt.subplot(1, 2, 1)
sns.scatterplot(data=kmeans_data, x='Murder', y='Assault', hue='Kmeans_Cluster', palette='deep')
plt.title('K-means Clusters: Murder vs. Assault')
plt.xlabel('Murder Rate')
plt.ylabel('Assault Rate')

# Hierarchical scatter plot
plt.subplot(1, 2, 2)
sns.scatterplot(data=hierarchical_data, x='Murder', y='Assault', hue='Hierarchical_Cluster', palette='deep')
plt.title('Hierarchical Clusters: Murder vs. Assault')
plt.xlabel('Murder Rate')
plt.ylabel('Assault Rate')

plt.tight_layout()
plt.savefig('cluster_scatter_plots.png')
plt.close()

# Save combined data for reference
combined_data.to_csv('combined_clustering.csv')

# Print first 5 rows of combined cluster assignments
print("\nFirst 5 rows of Combined Cluster Assignments:")
print(combined_data.head())

Adjusted Rand Index (ARI) between K-means and Hierarchical clustering: 0.8848796413604911

Cross-tabulation of K-means vs. Hierarchical Clusters:
Hierarchical_Cluster   0   1   2  3
Kmeans_Cluster                     
0                      1   0  12  0
1                      1   0   0  7
2                      0  12   0  0
3                     17   0   0  0

K-means Cluster Characteristics (Mean Values):
                   Murder     Assault   UrbanPop       Rape
Kmeans_Cluster                                             
0                3.600000   78.538462  52.076923  12.176923
1               13.937500  243.625000  53.750000  21.412500
2               10.966667  264.000000  76.500000  33.608333
3                5.852941  141.176471  73.647059  19.335294

Hierarchical Cluster Characteristics (Mean Values):
                         Murder     Assault   UrbanPop       Rape
Hierarchical_Cluster                                             
0                      6.210526  142.052632  

In [7]:
# clustering_inferences.py
import pandas as pd

# Load combined cluster assignments
combined_data = pd.read_csv('combined_clustering.csv', index_col=0)

# Identify states in each cluster for both methods
print("K-means Cluster States:")
for cluster in sorted(combined_data['Kmeans_Cluster'].unique()):
    states = combined_data[combined_data['Kmeans_Cluster'] == cluster].index.tolist()
    print(f"Cluster {cluster}: {', '.join(states)}")

print("\nHierarchical Cluster States:")
for cluster in sorted(combined_data['Hierarchical_Cluster'].unique()):
    states = combined_data[combined_data['Hierarchical_Cluster'] == cluster].index.tolist()
    print(f"Cluster {cluster}: {', '.join(states)}")

# Identify mismatched states
mismatches = combined_data[combined_data['Kmeans_Cluster'] != combined_data['Hierarchical_Cluster']]
print("\nStates with Different Cluster Assignments (K-means vs. Hierarchical):")
print(mismatches)

K-means Cluster States:
Cluster 0: Idaho, Iowa, Kentucky, Maine, Minnesota, Montana, Nebraska, New Hampshire, North Dakota, South Dakota, Vermont, West Virginia, Wisconsin
Cluster 1: Alabama, Arkansas, Georgia, Louisiana, Mississippi, North Carolina, South Carolina, Tennessee
Cluster 2: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas
Cluster 3: Connecticut, Delaware, Hawaii, Indiana, Kansas, Massachusetts, Missouri, New Jersey, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode Island, Utah, Virginia, Washington, Wyoming

Hierarchical Cluster States:
Cluster 0: Arkansas, Connecticut, Delaware, Hawaii, Indiana, Kansas, Kentucky, Massachusetts, Missouri, New Jersey, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode Island, Utah, Virginia, Washington, Wyoming
Cluster 1: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas
Cluster 2: Idaho, Iowa, Maine, Minnesota, Montana, Nebr

In [8]:
# final_clustering_inferences.py
import pandas as pd

# Load combined cluster assignments
combined_data = pd.read_csv('combined_clustering.csv', index_col=0)

# Map K-means clusters to Hierarchical clusters based on cross-tabulation
# K-means Cluster 0 -> Hierarchical Cluster 2
# K-means Cluster 1 -> Hierarchical Cluster 3
# K-means Cluster 2 -> Hierarchical Cluster 1
# K-means Cluster 3 -> Hierarchical Cluster 0
cluster_mapping = {0: 2, 1: 3, 2: 1, 3: 0}
combined_data['Kmeans_Cluster_Mapped'] = combined_data['Kmeans_Cluster'].map(cluster_mapping)

# Identify actual mismatches
mismatches = combined_data[combined_data['Kmeans_Cluster_Mapped'] != combined_data['Hierarchical_Cluster']]
print("States with Different Cluster Assignments (Corrected):")
print(mismatches[['Kmeans_Cluster', 'Hierarchical_Cluster']])

# Final cluster descriptions
print("\nFinal Cluster Inferences:")
print("1. Low-crime, rural states (K-means Cluster 0, Hierarchical Cluster 2):")
print("   - States: Idaho, Iowa, Maine, Minnesota, Montana, Nebraska, New Hampshire, North Dakota, South Dakota, Vermont, West Virginia, Wisconsin")
print("   - Characteristics: Low crime rates (Murder: ~3.1-3.6, Assault: ~76-78.5, Rape: ~11.8-12.2), low urban population (~52).")
print("   - Inference: Safe, rural states with minimal crime issues.")
print("\n2. High-crime, urban states (K-means Cluster 2, Hierarchical Cluster 1):")
print("   - States: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas")
print("   - Characteristics: High crime rates (Murder: ~11.0, Assault: ~264, Rape: ~33.6), high urban population (~76.5).")
print("   - Inference: Urban areas with significant crime challenges, likely metropolitan regions.")
print("\n3. Moderate-crime, urban states (K-means Cluster 3, Hierarchical Cluster 0):")
print("   - States: Connecticut, Delaware, Hawaii, Indiana, Kansas, Massachusetts, Missouri, New Jersey, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode Island, Utah, Virginia, Washington, Wyoming (plus Kentucky and Arkansas in Hierarchical)")
print("   - Characteristics: Moderate crime rates (Murder: ~5.9-6.2, Assault: ~141-142, Rape: ~19.2), high urban population (~71-73.6).")
print("   - Inference: Urban states with controlled crime levels, possibly with better policing or socio-economic conditions.")
print("\n4. High-crime, semi-urban states (K-means Cluster 1, Hierarchical Cluster 3):")
print("   - States: Alabama, Georgia, Louisiana, Mississippi, North Carolina, South Carolina, Tennessee")
print("   - Characteristics: Very high crime rates (Murder: ~13.9-14.7, Assault: ~243-251, Rape: ~21.4-21.7), moderate urban population (~53-54).")
print("   - Inference: Semi-urban states with significant crime issues, possibly due to socio-economic challenges.")

# Save final inferences
with open('clustering_inferences.txt', 'w') as f:
    f.write("Final Cluster Inferences:\n")
    f.write("1. Low-crime, rural states (K-means Cluster 0, Hierarchical Cluster 2):\n")
    f.write("   - States: Idaho, Iowa, Maine, Minnesota, Montana, Nebraska, New Hampshire, North Dakota, South Dakota, Vermont, West Virginia, Wisconsin\n")
    f.write("   - Characteristics: Low crime rates (Murder: ~3.1-3.6, Assault: ~76-78.5, Rape: ~11.8-12.2), low urban population (~52).\n")
    f.write("   - Inference: Safe, rural states with minimal crime issues.\n\n")
    f.write("2. High-crime, urban states (K-means Cluster 2, Hierarchical Cluster 1):\n")
    f.write("   - States: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas\n")
    f.write("   - Characteristics: High crime rates (Murder: ~11.0, Assault: ~264, Rape: ~33.6), high urban population (~76.5).\n")
    f.write("   - Inference: Urban areas with significant crime challenges, likely metropolitan regions.\n\n")
    f.write("3. Moderate-crime, urban states (K-means Cluster 3, Hierarchical Cluster 0):\n")
    f.write("   - States: Connecticut, Delaware, Hawaii, Indiana, Kansas, Massachusetts, Missouri, New Jersey, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode Island, Utah, Virginia, Washington, Wyoming (plus Kentucky and Arkansas in Hierarchical)\n")
    f.write("   - Characteristics: Moderate crime rates (Murder: ~5.9-6.2, Assault: ~141-142, Rape: ~19.2), high urban population (~71-73.6).\n")
    f.write("   - Inference: Urban states with controlled crime levels, possibly with better policing or socio-economic conditions.\n\n")
    f.write("4. High-crime, semi-urban states (K-means Cluster 1, Hierarchical Cluster 3):\n")
    f.write("   - States: Alabama, Georgia, Louisiana, Mississippi, North Carolina, South Carolina, Tennessee\n")
    f.write("   - Characteristics: Very high crime rates (Murder: ~13.9-14.7, Assault: ~243-251, Rape: ~21.4-21.7), moderate urban population (~53-54).\n")
    f.write("   - Inference: Semi-urban states with significant crime issues, possibly due to socio-economic challenges.\n")

States with Different Cluster Assignments (Corrected):
          Kmeans_Cluster  Hierarchical_Cluster
Arkansas               1                     0
Kentucky               0                     0

Final Cluster Inferences:
1. Low-crime, rural states (K-means Cluster 0, Hierarchical Cluster 2):
   - States: Idaho, Iowa, Maine, Minnesota, Montana, Nebraska, New Hampshire, North Dakota, South Dakota, Vermont, West Virginia, Wisconsin
   - Characteristics: Low crime rates (Murder: ~3.1-3.6, Assault: ~76-78.5, Rape: ~11.8-12.2), low urban population (~52).
   - Inference: Safe, rural states with minimal crime issues.

2. High-crime, urban states (K-means Cluster 2, Hierarchical Cluster 1):
   - States: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas
   - Characteristics: High crime rates (Murder: ~11.0, Assault: ~264, Rape: ~33.6), high urban population (~76.5).
   - Inference: Urban areas with significant crime challenges, l

In [9]:
# cluster_size_chart.py
import pandas as pd

# Data for cluster sizes
kmeans_sizes = [13, 8, 12, 17]  # From K-means output
hierarchical_sizes = [19, 12, 12, 7]  # From Hierarchical output
clusters = ['Low-crime, rural', 'High-crime, semi-urban', 'High-crime, urban', 'Moderate-crime, urban']

# Create a DataFrame for plotting
data = {
    'Cluster': clusters * 2,
    'Number of States': kmeans_sizes + hierarchical_sizes,
    'Method': ['K-means'] * 4 + ['Hierarchical'] * 4
}
df = pd.DataFrame(data)

# Create bar chart
```chartjs
{
  "type": "bar",
  "data": {
    "labels": ["Low-crime, rural", "High-crime, semi-urban", "High-crime, urban", "Moderate-crime, urban"],
    "datasets": [
      {
        "label": "K-means",
        "data": [13, 8, 12, 17],
        "backgroundColor": "rgba(54, 162, 235, 0.6)",
        "borderColor": "rgba(54, 162, 235, 1)",
        "borderWidth": 1
      },
      {
        "label": "Hierarchical",
        "data": [19, 12, 12, 7],
        "backgroundColor": "rgba(255, 99, 132, 0.6)",
        "borderColor": "rgba(255, 99, 132, 1)",
        "borderWidth": 1
      }
    ]
  },
  "options": {
    "scales": {
      "y": {
        "beginAtZero": true,
        "title": {
          "display": true,
          "text": "Number of States"
        }
      },
      "x": {
        "title": {
          "display": true,
          "text": "Cluster"
        }
      }
    },
    "plugins": {
      "legend": {
        "display": true,
        "position": "top"
      },
      "title": {
        "display": true,
        "text": "Cluster Sizes: K-means vs. Hierarchical"
      }
    }
  }
}

SyntaxError: invalid syntax (921713093.py, line 18)

In [10]:
# cluster_size_chart.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data for cluster sizes
kmeans_sizes = [13, 8, 12, 17]  # From K-means output
hierarchical_sizes = [19, 12, 12, 7]  # From Hierarchical output
clusters = ['Low-crime, rural', 'High-crime, semi-urban', 'High-crime, urban', 'Moderate-crime, urban']

# Create a DataFrame for plotting
data = {
    'Cluster': clusters * 2,
    'Number of States': kmeans_sizes + hierarchical_sizes,
    'Method': ['K-means'] * 4 + ['Hierarchical'] * 4
}
df = pd.DataFrame(data)

# Create bar chart
plt.figure(figsize=(10, 6))
sns.barplot(x='Cluster', y='Number of States', hue='Method', data=df, palette=['#36A2EB', '#FF6384'])
plt.title('Cluster Sizes: K-means vs. Hierarchical')
plt.xlabel('Cluster')
plt.ylabel('Number of States')
plt.xticks(rotation=45)
plt.legend(title='Method')
plt.tight_layout()
plt.savefig('cluster_sizes.png')
plt.close()

print("Bar chart saved as 'cluster_sizes.png'")

Bar chart saved as 'cluster_sizes.png'


In [11]:
# final_clustering_report.py
import pandas as pd

# Load combined cluster assignments for reference
combined_data = pd.read_csv('combined_clustering.csv', index_col=0)

# Write the final report
with open('final_clustering_report.txt', 'w') as f:
    f.write("Final Clustering Analysis Report\n")
    f.write("==============================\n\n")
    
    f.write("1. Problem Statement\n")
    f.write("-------------------\n")
    f.write("The goal was to perform clustering (K-means and Hierarchical) on the crime dataset to group U.S. states based on crime rates (Murder, Assault, Rape) and urban population (UrbanPop), identify the number of clusters, and draw inferences about crime patterns.\n\n")
    
    f.write("2. Dataset Description\n")
    f.write("---------------------\n")
    f.write("The dataset ('crime_data.csv') contains 50 U.S. states with four features:\n")
    f.write("- Murder: Murder rates per 100,000 people (range: 0.8-17.4).\n")
    f.write("- Assault: Assault rates per 100,000 people (range: 45-337).\n")
    f.write("- UrbanPop: Percentage of urban population (range: 32-91).\n")
    f.write("- Rape: Rape rates per 100,000 people (range: 7.3-46.0).\n")
    f.write("The data was standardized to ensure fair clustering.\n\n")
    
    f.write("3. Clustering Results\n")
    f.write("--------------------\n")
    f.write("Both K-means and Hierarchical clustering (with Ward's method and Euclidean distance) were applied, resulting in 4 clusters. The optimal number of clusters (k=4) was assumed based on prior outputs (elbow plot and dendrogram not described).\n\n")
    
    f.write("3.1 K-means Clustering\n")
    f.write("----------------------\n")
    f.write("Number of states per cluster:\n")
    f.write("- Cluster 0 (Low-crime, rural): 13 states\n")
    f.write("- Cluster 1 (High-crime, semi-urban): 8 states\n")
    f.write("- Cluster 2 (High-crime, urban): 12 states\n")
    f.write("- Cluster 3 (Moderate-crime, urban): 17 states\n\n")
    
    f.write("3.2 Hierarchical Clustering\n")
    f.write("--------------------------\n")
    f.write("Number of states per cluster:\n")
    f.write("- Cluster 0 (Moderate-crime, urban): 19 states\n")
    f.write("- Cluster 1 (High-crime, urban): 12 states\n")
    f.write("- Cluster 2 (Low-crime, rural): 12 states\n")
    f.write("- Cluster 3 (High-crime, semi-urban): 7 states\n\n")
    
    f.write("4. Cluster Characteristics and Inferences\n")
    f.write("---------------------------------------\n")
    f.write("4.1 Low-crime, rural states (K-means Cluster 0, Hierarchical Cluster 2):\n")
    f.write("   - States: Idaho, Iowa, Maine, Minnesota, Montana, Nebraska, New Hampshire, North Dakota, South Dakota, Vermont, West Virginia, Wisconsin\n")
    f.write("   - Characteristics: Low crime rates (Murder: ~3.1-3.6, Assault: ~76-78.5, Rape: ~11.8-12.2), low urban population (~52).\n")
    f.write("   - Inference: Safe, rural states with minimal crime, likely due to low population density and simpler socio-economic dynamics.\n\n")
    f.write("4.2 High-crime, urban states (K-means Cluster 2, Hierarchical Cluster 1):\n")
    f.write("   - States: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas\n")
    f.write("   - Characteristics: High crime rates (Murder: ~11.0, Assault: ~264, Rape: ~33.6), high urban population (~76.5).\n")
    f.write("   - Inference: Urban, metropolitan areas with significant crime challenges, driven by high population density or urban complexities.\n\n")
    f.write("4.3 Moderate-crime, urban states (K-means Cluster 3, Hierarchical Cluster 0):\n")
    f.write("   - States: Connecticut, Delaware, Hawaii, Indiana, Kansas, Massachusetts, Missouri, New Jersey, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode Island, Utah, Virginia, Washington, Wyoming (plus Kentucky and Arkansas in Hierarchical)\n")
    f.write("   - Characteristics: Moderate crime rates (Murder: ~5.9-6.2, Assault: ~141-142, Rape: ~19.2), high urban population (~71-73.6).\n")
    f.write("   - Inference: Urban states with controlled crime levels, possibly due to effective policing or socio-economic stability.\n\n")
    f.write("4.4 High-crime, semi-urban states (K-means Cluster 1, Hierarchical Cluster 3):\n")
    f.write("   - States: Alabama, Georgia, Louisiana, Mississippi, North Carolina, South Carolina, Tennessee\n")
    f.write("   - Characteristics: Very high crime rates (Murder: ~13.9-14.7, Assault: ~243-251, Rape: ~21.4-21.7), moderate urban population (~53-54).\n")
    f.write("   - Inference: Semi-urban states with significant crime issues, possibly driven by socio-economic challenges or regional disparities.\n\n")
    
    f.write("5. Comparison of Clustering Methods\n")
    f.write("---------------------------------\n")
    f.write("The Adjusted Rand Index (ARI = 0.8849) indicates strong agreement between K-means and Hierarchical clustering. Key correspondences:\n")
    f.write("- K-means Cluster 0 ↔ Hierarchical Cluster 2 (Low-crime, rural)\n")
    f.write("- K-means Cluster 1 ↔ Hierarchical Cluster 3 (High-crime, semi-urban)\n")
    f.write("- K-means Cluster 2 ↔ Hierarchical Cluster 1 (High-crime, urban)\n")
    f.write("- K-means Cluster 3 ↔ Hierarchical Cluster 0 (Moderate-crime, urban)\n")
    f.write("Mismatches (2 states):\n")
    f.write("- Arkansas: K-means Cluster 1 (High-crime, semi-urban) vs. Hierarchical Cluster 0 (Moderate-crime, urban).\n")
    f.write("- Kentucky: K-means Cluster 0 (Low-crime, rural) vs. Hierarchical Cluster 0 (Moderate-crime, urban).\n")
    f.write("These mismatches reflect boundary cases due to intermediate crime rates or urban population values.\n\n")
    
    f.write("6. Cluster Size Visualization\n")
    f.write("---------------------------\n")
    f.write("A bar chart ('cluster_sizes.png') compares the number of states in each cluster:\n")
    f.write("- Low-crime, rural: 13 (K-means) vs. 12 (Hierarchical)\n")
    f.write("- High-crime, semi-urban: 8 (K-means) vs. 7 (Hierarchical)\n")
    f.write("- High-crime, urban: 12 (K-means) vs. 12 (Hierarchical)\n")
    f.write("- Moderate-crime, urban: 17 (K-means) vs. 19 (Hierarchical)\n")
    f.write("The chart highlights the larger size of the moderate-crime, urban cluster in Hierarchical clustering due to the inclusion of Arkansas and Kentucky.\n\n")
    
    f.write("7. Key Findings\n")
    f.write("--------------\n")
    f.write("- Crime patterns are strongly tied to urbanization: high-crime clusters are urban or semi-urban, while low-crime clusters are rural.\n")
    f.write("- The high-crime, semi-urban cluster (e.g., Alabama, Georgia) suggests regional socio-economic challenges, particularly in Southern states.\n")
    f.write("- The consistency between K-means and Hierarchical clustering (ARI = 0.8849) validates the robustness of the identified patterns.\n")
    f.write("- Mismatches (Arkansas, Kentucky) highlight states with ambiguous characteristics, warranting further investigation.\n\n")
    
    f.write("8. Limitations and Next Steps\n")
    f.write("----------------------------\n")
    f.write("- The optimal number of clusters (k=4) was assumed; elbow plot and dendrogram descriptions could confirm this choice.\n")
    f.write("- Additional visualizations (e.g., scatter plots of other feature pairs) could provide deeper insights.\n")
    f.write("- Further analysis of mismatched states (Arkansas, Kentucky) could explore specific socio-economic or regional factors.\n")
    f.write("- The dataset is limited to four features; incorporating additional variables (e.g., income, education) could enhance clustering.\n")

print("Final report saved as 'final_clustering_report.txt'")

UnicodeEncodeError: 'charmap' codec can't encode character '\u2194' in position 20: character maps to <undefined>

In [12]:
# final_clustering_report.py
import pandas as pd

# Load combined cluster assignments for reference
combined_data = pd.read_csv('combined_clustering.csv', index_col=0)

# Write the final report with UTF-8 encoding
with open('final_clustering_report.txt', 'w', encoding='utf-8') as f:
    f.write("Final Clustering Analysis Report\n")
    f.write("==============================\n\n")
    
    f.write("1. Problem Statement\n")
    f.write("-------------------\n")
    f.write("The goal was to perform clustering (K-means and Hierarchical) on the crime dataset to group U.S. states based on crime rates (Murder, Assault, Rape) and urban population (UrbanPop), identify the number of clusters, and draw inferences about crime patterns.\n\n")
    
    f.write("2. Dataset Description\n")
    f.write("---------------------\n")
    f.write("The dataset ('crime_data.csv') contains 50 U.S. states with four features:\n")
    f.write("- Murder: Murder rates per 100,000 people (range: 0.8-17.4).\n")
    f.write("- Assault: Assault rates per 100,000 people (range: 45-337).\n")
    f.write("- UrbanPop: Percentage of urban population (range: 32-91).\n")
    f.write("- Rape: Rape rates per 100,000 people (range: 7.3-46.0).\n")
    f.write("The data was standardized to ensure fair clustering.\n\n")
    
    f.write("3. Clustering Results\n")
    f.write("--------------------\n")
    f.write("Both K-means and Hierarchical clustering (with Ward's method and Euclidean distance) were applied, resulting in 4 clusters. The optimal number of clusters (k=4) was assumed based on prior outputs (elbow plot and dendrogram not described).\n\n")
    
    f.write("3.1 K-means Clustering\n")
    f.write("----------------------\n")
    f.write("Number of states per cluster:\n")
    f.write("- Cluster 0 (Low-crime, rural): 13 states\n")
    f.write("- Cluster 1 (High-crime, semi-urban): 8 states\n")
    f.write("- Cluster 2 (High-crime, urban): 12 states\n")
    f.write("- Cluster 3 (Moderate-crime, urban): 17 states\n\n")
    
    f.write("3.2 Hierarchical Clustering\n")
    f.write("--------------------------\n")
    f.write("Number of states per cluster:\n")
    f.write("- Cluster 0 (Moderate-crime, urban): 19 states\n")
    f.write("- Cluster 1 (High-crime, urban): 12 states\n")
    f.write("- Cluster 2 (Low-crime, rural): 12 states\n")
    f.write("- Cluster 3 (High-crime, semi-urban): 7 states\n\n")
    
    f.write("4. Cluster Characteristics and Inferences\n")
    f.write("---------------------------------------\n")
    f.write("4.1 Low-crime, rural states (K-means Cluster 0, Hierarchical Cluster 2):\n")
    f.write("   - States: Idaho, Iowa, Maine, Minnesota, Montana, Nebraska, New Hampshire, North Dakota, South Dakota, Vermont, West Virginia, Wisconsin\n")
    f.write("   - Characteristics: Low crime rates (Murder: ~3.1-3.6, Assault: ~76-78.5, Rape: ~11.8-12.2), low urban population (~52).\n")
    f.write("   - Inference: Safe, rural states with minimal crime, likely due to low population density and simpler socio-economic dynamics.\n\n")
    f.write("4.2 High-crime, urban states (K-means Cluster 2, Hierarchical Cluster 1):\n")
    f.write("   - States: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas\n")
    f.write("   - Characteristics: High crime rates (Murder: ~11.0, Assault: ~264, Rape: ~33.6), high urban population (~76.5).\n")
    f.write("   - Inference: Urban, metropolitan areas with significant crime challenges, driven by high population density or urban complexities.\n\n")
    f.write("4.3 Moderate-crime, urban states (K-means Cluster 3, Hierarchical Cluster 0):\n")
    f.write("   - States: Connecticut, Delaware, Hawaii, Indiana, Kansas, Massachusetts, Missouri, New Jersey, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode Island, Utah, Virginia, Washington, Wyoming (plus Kentucky and Arkansas in Hierarchical)\n")
    f.write("   - Characteristics: Moderate crime rates (Murder: ~5.9-6.2, Assault: ~141-142, Rape: ~19.2), high urban population (~71-73.6).\n")
    f.write("   - Inference: Urban states with controlled crime levels, possibly due to effective policing or socio-economic stability.\n\n")
    f.write("4.4 High-crime, semi-urban states (K-means Cluster 1, Hierarchical Cluster 3):\n")
    f.write("   - States: Alabama, Georgia, Louisiana, Mississippi, North Carolina, South Carolina, Tennessee\n")
    f.write("   - Characteristics: Very high crime rates (Murder: ~13.9-14.7, Assault: ~243-251, Rape: ~21.4-21.7), moderate urban population (~53-54).\n")
    f.write("   - Inference: Semi-urban states with significant crime issues, possibly driven by socio-economic challenges or regional disparities.\n\n")
    
    f.write("5. Comparison of Clustering Methods\n")
    f.write("---------------------------------\n")
    f.write("The Adjusted Rand Index (ARI = 0.8849) indicates strong agreement between K-means and Hierarchical clustering. Key correspondences:\n")
    f.write("- K-means Cluster 0 <-> Hierarchical Cluster 2 (Low-crime, rural)\n")
    f.write("- K-means Cluster 1 <-> Hierarchical Cluster 3 (High-crime, semi-urban)\n")
    f.write("- K-means Cluster 2 <-> Hierarchical Cluster 1 (High-crime, urban)\n")
    f.write("- K-means Cluster 3 <-> Hierarchical Cluster 0 (Moderate-crime, urban)\n")
    f.write("Mismatches (2 states):\n")
    f.write("- Arkansas: K-means Cluster 1 (High-crime, semi-urban) vs. Hierarchical Cluster 0 (Moderate-crime, urban).\n")
    f.write("- Kentucky: K-means Cluster 0 (Low-crime, rural) vs. Hierarchical Cluster 0 (Moderate-crime, urban).\n")
    f.write("These mismatches reflect boundary cases due to intermediate crime rates or urban population values.\n\n")
    
    f.write("6. Cluster Size Visualization\n")
    f.write("---------------------------\n")
    f.write("A bar chart ('cluster_sizes.png') compares the number of states in each cluster:\n")
    f.write("- Low-crime, rural: 13 (K-means) vs. 12 (Hierarchical)\n")
    f.write("- High-crime, semi-urban: 8 (K-means) vs. 7 (Hierarchical)\n")
    f.write("- High-crime, urban: 12 (K-means) vs. 12 (Hierarchical)\n")
    f.write("- Moderate-crime, urban: 17 (K-means) vs. 19 (Hierarchical)\n")
    f.write("The chart highlights the larger size of the moderate-crime, urban cluster in Hierarchical clustering due to the inclusion of Arkansas and Kentucky.\n\n")
    
    f.write("7. Key Findings\n")
    f.write("--------------\n")
    f.write("- Crime patterns are strongly tied to urbanization: high-crime clusters are urban or semi-urban, while low-crime clusters are rural.\n")
    f.write("- The high-crime, semi-urban cluster (e.g., Alabama, Georgia) suggests regional socio-economic challenges, particularly in Southern states.\n")
    f.write("- The consistency between K-means and Hierarchical clustering (ARI = 0.8849) validates the robustness of the identified patterns.\n")
    f.write("- Mismatches (Arkansas, Kentucky) highlight states with ambiguous characteristics, warranting further investigation.\n\n")
    
    f.write("8. Limitations and Next Steps\n")
    f.write("----------------------------\n")
    f.write("- The optimal number of clusters (k=4) was assumed; elbow plot and dendrogram descriptions could confirm this choice.\n")
    f.write("- Additional visualizations (e.g., scatter plots of other feature pairs) could provide deeper insights.\n")
    f.write("- Further analysis of mismatched states (Arkansas, Kentucky) could explore specific socio-economic or regional factors.\n")
    f.write("- The dataset is limited to four features; incorporating additional variables (e.g., income, education) could enhance clustering.\n")

print("Final report saved as 'final_clustering_report.txt'")

Final report saved as 'final_clustering_report.txt'


In [13]:
# additional_visualization.py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load clustered data
kmeans_data = pd.read_csv('crime_data_kmeans.csv', index_col=0)
hierarchical_data = pd.read_csv('crime_data_hierarchical.csv', index_col=0)

# Create scatter plots for Rape vs. UrbanPop
plt.figure(figsize=(12, 5))

# K-means scatter plot
plt.subplot(1, 2, 1)
sns.scatterplot(data=kmeans_data, x='UrbanPop', y='Rape', hue='Cluster', palette='deep')
plt.title('K-means Clusters: UrbanPop vs. Rape')
plt.xlabel('Urban Population (%)')
plt.ylabel('Rape Rate')

# Hierarchical scatter plot
plt.subplot(1, 2, 2)
sns.scatterplot(data=hierarchical_data, x='UrbanPop', y='Rape', hue='Cluster', palette='deep')
plt.title('Hierarchical Clusters: UrbanPop vs. Rape')
plt.xlabel('Urban Population (%)')
plt.ylabel('Rape Rate')

plt.tight_layout()
plt.savefig('urbanpop_rape_scatter.png')
plt.close()

print("Scatter plot saved as 'urbanpop_rape_scatter.png'")

Scatter plot saved as 'urbanpop_rape_scatter.png'


In [14]:
# updated_final_clustering_report.py
import pandas as pd

# Load combined cluster assignments for reference
combined_data = pd.read_csv('combined_clustering.csv', index_col=0)

# Write the updated final report with UTF-8 encoding
with open('final_clustering_report.txt', 'w', encoding='utf-8') as f:
    f.write("Final Clustering Analysis Report\n")
    f.write("==============================\n\n")
    
    f.write("1. Problem Statement\n")
    f.write("-------------------\n")
    f.write("The goal was to perform clustering (K-means and Hierarchical) on the crime dataset to group U.S. states based on crime rates (Murder, Assault, Rape) and urban population (UrbanPop), identify the number of clusters, and draw inferences about crime patterns.\n\n")
    
    f.write("2. Dataset Description\n")
    f.write("---------------------\n")
    f.write("The dataset ('crime_data.csv') contains 50 U.S. states with four features:\n")
    f.write("- Murder: Murder rates per 100,000 people (range: 0.8-17.4).\n")
    f.write("- Assault: Assault rates per 100,000 people (range: 45-337).\n")
    f.write("- UrbanPop: Percentage of urban population (range: 32-91).\n")
    f.write("- Rape: Rape rates per 100,000 people (range: 7.3-46.0).\n")
    f.write("The data was standardized to ensure fair clustering.\n\n")
    
    f.write("3. Clustering Results\n")
    f.write("--------------------\n")
    f.write("Both K-means and Hierarchical clustering (with Ward's method and Euclidean distance) were applied, resulting in 4 clusters. The optimal number of clusters (k=4) was assumed based on prior outputs (elbow plot and dendrogram not described).\n\n")
    
    f.write("3.1 K-means Clustering\n")
    f.write("----------------------\n")
    f.write("Number of states per cluster:\n")
    f.write("- Cluster 0 (Low-crime, rural): 13 states\n")
    f.write("- Cluster 1 (High-crime, semi-urban): 8 states\n")
    f.write("- Cluster 2 (High-crime, urban): 12 states\n")
    f.write("- Cluster 3 (Moderate-crime, urban): 17 states\n\n")
    
    f.write("3.2 Hierarchical Clustering\n")
    f.write("--------------------------\n")
    f.write("Number of states per cluster:\n")
    f.write("- Cluster 0 (Moderate-crime, urban): 19 states\n")
    f.write("- Cluster 1 (High-crime, urban): 12 states\n")
    f.write("- Cluster 2 (Low-crime, rural): 12 states\n")
    f.write("- Cluster 3 (High-crime, semi-urban): 7 states\n\n")
    
    f.write("4. Cluster Characteristics and Inferences\n")
    f.write("---------------------------------------\n")
    f.write("4.1 Low-crime, rural states (K-means Cluster 0, Hierarchical Cluster 2):\n")
    f.write("   - States: Idaho, Iowa, Maine, Minnesota, Montana, Nebraska, New Hampshire, North Dakota, South Dakota, Vermont, West Virginia, Wisconsin\n")
    f.write("   - Characteristics: Low crime rates (Murder: ~3.1-3.6, Assault: ~76-78.5, Rape: ~11.8-12.2), low urban population (~52).\n")
    f.write("   - Inference: Safe, rural states with minimal crime, likely due to low population density and simpler socio-economic dynamics.\n\n")
    f.write("4.2 High-crime, urban states (K-means Cluster 2, Hierarchical Cluster 1):\n")
    f.write("   - States: Alaska, Arizona, California, Colorado, Florida, Illinois, Maryland, Michigan, Nevada, New Mexico, New York, Texas\n")
    f.write("   - Characteristics: High crime rates (Murder: ~11.0, Assault: ~264, Rape: ~33.6), high urban population (~76.5).\n")
    f.write("   - Inference: Urban, metropolitan areas with significant crime challenges, driven by high population density or urban complexities.\n\n")
    f.write("4.3 Moderate-crime, urban states (K-means Cluster 3, Hierarchical Cluster 0):\n")
    f.write("   - States: Connecticut, Delaware, Hawaii, Indiana, Kansas, Massachusetts, Missouri, New Jersey, Ohio, Oklahoma, Oregon, Pennsylvania, Rhode Island, Utah, Virginia, Washington, Wyoming (plus Kentucky and Arkansas in Hierarchical)\n")
    f.write("   - Characteristics: Moderate crime rates (Murder: ~5.9-6.2, Assault: ~141-142, Rape: ~19.2), high urban population (~71-73.6).\n")
    f.write("   - Inference: Urban states with controlled crime levels, possibly due to effective policing or socio-economic stability.\n\n")
    f.write("4.4 High-crime, semi-urban states (K-means Cluster 1, Hierarchical Cluster 3):\n")
    f.write("   - States: Alabama, Georgia, Louisiana, Mississippi, North Carolina, South Carolina, Tennessee\n")
    f.write("   - Characteristics: Very high crime rates (Murder: ~13.9-14.7, Assault: ~243-251, Rape: ~21.4-21.7), moderate urban population (~53-54).\n")
    f.write("   - Inference: Semi-urban states with significant crime issues, possibly driven by socio-economic challenges or regional disparities.\n\n")
    
    f.write("5. Comparison of Clustering Methods\n")
    f.write("---------------------------------\n")
    f.write("The Adjusted Rand Index (ARI = 0.8849) indicates strong agreement between K-means and Hierarchical clustering. Key correspondences:\n")
    f.write("- K-means Cluster 0 <-> Hierarchical Cluster 2 (Low-crime, rural)\n")
    f.write("- K-means Cluster 1 <-> Hierarchical Cluster 3 (High-crime, semi-urban)\n")
    f.write("- K-means Cluster 2 <-> Hierarchical Cluster 1 (High-crime, urban)\n")
    f.write("- K-means Cluster 3 <-> Hierarchical Cluster 0 (Moderate-crime, urban)\n")
    f.write("Mismatches (2 states):\n")
    f.write("- Arkansas: K-means Cluster 1 (High-crime, semi-urban) vs. Hierarchical Cluster 0 (Moderate-crime, urban).\n")
    f.write("- Kentucky: K-means Cluster 0 (Low-crime, rural) vs. Hierarchical Cluster 0 (Moderate-crime, urban).\n")
    f.write("These mismatches reflect boundary cases due to intermediate crime rates or urban population values.\n\n")
    
    f.write("6. Visualizations\n")
    f.write("----------------\n")
    f.write("6.1 Cluster Size Visualization\n")
    f.write("A bar chart ('cluster_sizes.png') compares the number of states in each cluster:\n")
    f.write("- Low-crime, rural: 13 (K-means) vs. 12 (Hierarchical)\n")
    f.write("- High-crime, semi-urban: 8 (K-means) vs. 7 (Hierarchical)\n")
    f.write("- High-crime, urban: 12 (K-means) vs. 12 (Hierarchical)\n")
    f.write("- Moderate-crime, urban: 17 (K-means) vs. 19 (Hierarchical)\n")
    f.write("The chart highlights the larger size of the moderate-crime, urban cluster in Hierarchical clustering due to the inclusion of Arkansas and Kentucky.\n\n")
    f.write("6.2 Scatter Plots\n")
    f.write("Scatter plots visualize cluster separation:\n")
    f.write("- 'cluster_scatter_plots.png': Murder vs. Assault for K-means and Hierarchical clusters.\n")
    f.write("- 'urbanpop_rape_scatter.png': UrbanPop vs. Rape for K-means and Hierarchical clusters.\n")
    f.write("These plots illustrate how clusters differ across key features, with UrbanPop vs. Rape highlighting the role of urbanization in rape rates.\n\n")
    
    f.write("7. Key Findings\n")
    f.write("--------------\n")
    f.write("- Crime patterns are strongly tied to urbanization: high-crime clusters are urban or semi-urban, while low-crime clusters are rural.\n")
    f.write("- The high-crime, semi-urban cluster (e.g., Alabama, Georgia) suggests regional socio-economic challenges, particularly in Southern states.\n")
    f.write("- The consistency between K-means and Hierarchical clustering (ARI = 0.8849) validates the robustness of the identified patterns.\n")
    f.write("- Mismatches (Arkansas, Kentucky) highlight states with ambiguous characteristics, warranting further investigation.\n\n")
    
    f.write("8. Limitations and Next Steps\n")
    f.write("----------------------------\n")
    f.write("- The optimal number of clusters (k=4) was assumed; elbow plot and dendrogram descriptions could confirm this choice.\n")
    f.write("- Additional visualizations (e.g., scatter plots of other feature pairs like Murder vs. Rape) could provide deeper insights.\n")
    f.write("- Further analysis of mismatched states (Arkansas, Kentucky) could explore specific socio-economic or regional factors.\n")
    f.write("- The dataset is limited to four features; incorporating additional variables (e.g., income, education) could enhance clustering.\n")

print("Updated final report saved as 'final_clustering_report.txt'")

Updated final report saved as 'final_clustering_report.txt'


In [15]:
# mismatch_analysis.py
import pandas as pd

# Load original and clustered data
data = pd.read_csv('crime_data.csv', index_col=0)
kmeans_data = pd.read_csv('crime_data_kmeans.csv', index_col=0)
hierarchical_data = pd.read_csv('crime_data_hierarchical.csv', index_col=0)

# Get cluster means
kmeans_means = kmeans_data.groupby('Cluster').mean()
hierarchical_means = hierarchical_data.groupby('Cluster').mean()

# Extract data for Arkansas and Kentucky
mismatch_states = data.loc[['Arkansas', 'Kentucky']]

# Print comparison
print("Mismatch Analysis: Arkansas and Kentucky\n")
print("Feature Values for Mismatched States:")
print(mismatch_states)
print("\nK-means Cluster Assignments:")
print(kmeans_data.loc[['Arkansas', 'Kentucky'], ['Cluster']])
print("\nHierarchical Cluster Assignments:")
print(hierarchical_data.loc[['Arkansas', 'Kentucky'], ['Cluster']])
print("\nK-means Cluster Means:")
print(kmeans_means)
print("\nHierarchical Cluster Means:")
print(hierarchical_means)

# Save to file
with open('mismatch_analysis.txt', 'w', encoding='utf-8') as f:
    f.write("Mismatch Analysis: Arkansas and Kentucky\n")
    f.write("======================================\n\n")
    f.write("Feature Values for Mismatched States:\n")
    f.write(str(mismatch_states) + "\n\n")
    f.write("K-means Cluster Assignments:\n")
    f.write(str(kmeans_data.loc[['Arkansas', 'Kentucky'], ['Cluster']]) + "\n\n")
    f.write("Hierarchical Cluster Assignments:\n")
    f.write(str(hierarchical_data.loc[['Arkansas', 'Kentucky'], ['Cluster']]) + "\n\n")
    f.write("K-means Cluster Means:\n")
    f.write(str(kmeans_means) + "\n\n")
    f.write("Hierarchical Cluster Means:\n")
    f.write(str(hierarchical_means) + "\n")

print("Mismatch analysis saved as 'mismatch_analysis.txt'")

Mismatch Analysis: Arkansas and Kentucky

Feature Values for Mismatched States:
          Murder  Assault  UrbanPop  Rape
Arkansas     8.8      190        50  19.5
Kentucky     9.7      109        52  16.3

K-means Cluster Assignments:
          Cluster
Arkansas        1
Kentucky        0

Hierarchical Cluster Assignments:
          Cluster
Arkansas        0
Kentucky        0

K-means Cluster Means:
            Murder     Assault   UrbanPop       Rape
Cluster                                             
0         3.600000   78.538462  52.076923  12.176923
1        13.937500  243.625000  53.750000  21.412500
2        10.966667  264.000000  76.500000  33.608333
3         5.852941  141.176471  73.647059  19.335294

Hierarchical Cluster Means:
            Murder     Assault   UrbanPop       Rape
Cluster                                             
0         6.210526  142.052632  71.263158  19.184211
1        10.966667  264.000000  76.500000  33.608333
2         3.091667   76.000000  52.083