In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# 加载数据
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
col_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']
df = pd.read_csv(url, sep='\s+', names=col_names)

# 处理非数值数据
for col in ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 选择连续字段
continuous_cols = ['mpg', 'displacement', 'horsepower', 'weight', 'acceleration']
X = df[continuous_cols]

# 标准化并填补缺失值
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=continuous_cols).fillna(X_scaled.mean())

# 层次聚类
agg_cluster = AgglomerativeClustering(n_clusters=3, linkage='average')
X_scaled['cluster'] = agg_cluster.fit_predict(X_scaled)

# 散点图
plt.figure(figsize=(8, 6))
sns.scatterplot(data=X_scaled, x='mpg', y='weight', hue='cluster', palette='deep', s=100)
plt.title('Hierarchical Clustering Results (Auto-MPG Dataset)')
plt.xlabel('MPG (Standardized)')
plt.ylabel('Weight (Standardized)')
plt.legend(title='Cluster')
plt.show()

  df = pd.read_csv(url, sep='\s+', names=col_names)
  df = pd.read_csv(url, sep='\s+', names=col_names)


ValueError: Input X contains NaN.
AgglomerativeClustering does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [4]:
# 计算交叉表
cluster_vs_origin_crosstab = pd.crosstab(X_scaled['cluster'], df['origin'])

# 热力图
plt.figure(figsize=(8, 6))
sns.heatmap(cluster_vs_origin_crosstab, annot=True, cmap='Blues', fmt='d')
plt.title('Cluster vs Origin Crosstab')
plt.xlabel('Origin')
plt.ylabel('Cluster')
plt.show()

KeyError: 'cluster'