## 数据加载与分析

In [None]:
import pandas as pd

# Read the Titanic passenger table from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Titanic.csv')
# Print the column summary (dtypes, non-null counts) so missing values
# can be spotted before preprocessing.
df.info()

## 数据预处理

In [None]:
# Remove the columns that are not used as model features; the frame is
# modified in place.
df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], inplace=True)

# Fill missing ages with the mean age of the column.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)

# Fill missing embarkation values with the most frequent one
# (mode() may return several values; the first is taken).
most_common_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(most_common_embarked)

## 划分数据集

In [None]:
from sklearn.model_selection import train_test_split
import gower

# Separate the target column from the feature columns.
y = df['Survived']
X = df.drop(columns='Survived')

# Hold out 20% of the rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Precompute Gower distances so the classifier can be used with
# metric='precomputed':
#   - train vs. train, used for fitting and cross-validation
#   - validation vs. train, used for scoring on the held-out rows
X_train_dis = gower.gower_matrix(X_train)
X_val_dis = gower.gower_matrix(X_val, X_train)

## 选择 K 值并训练模型

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt

# Candidate neighbour counts: 1 through 20.
k_values = range(1, 21)

# Mean 5-fold cross-validation score for every candidate K, evaluated on
# the precomputed training distance matrix.
scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k, metric='precomputed'),
        X_train_dis,
        y_train,
        cv=5,
    ).mean()
    for k in k_values
]

# Plot the mean score against K.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(k_values, scores, marker='o', linestyle='-', color='b', label='Cross-Validation Score')
ax.set_xlabel('K Values')
ax.set_ylabel('Mean Cross-Validation Score')
ax.set_title('K-Nearest Neighbors: K Value vs Cross-Validation Score')
ax.set_xticks(k_values)
ax.grid(True)
ax.legend()
plt.show()

# Pick the K with the highest mean score; on ties the smallest K wins
# because list.index returns the first match.
best_score = max(scores)
optimal_k = k_values[scores.index(best_score)]
print("Optimal K:", optimal_k)


## 模型训练与评估

In [None]:
# Fit the final classifier on the training distance matrix using the
# selected K (fit() returns the estimator itself).
model = KNeighborsClassifier(
    n_neighbors=optimal_k,
    metric='precomputed',
).fit(X_train_dis, y_train)

# Score on the held-out rows, passing the validation-to-train distances.
accuracy = model.score(X_val_dis, y_val)
print("Validation Accuracy:", accuracy)