# In [140]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
from sklearn.metrics import  make_scorer,accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
# 添加 GridSearchCV 导入语句
from sklearn.model_selection import GridSearchCV,cross_val_score
import warnings #用于忽略警告信息
warnings.filterwarnings("ignore") # 忽略所有警告信息
# Data loading: read the training and test sets from CSV files in the
# current working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 数据预处理
def preprocess(df):
    """Clean a raw passenger frame and add the engineered model features.

    The frame is modified in place and also returned, so callers may use
    either ``df = preprocess(df)`` or a bare ``preprocess(df)``.

    Processing steps:
      * ``Title``: honorific extracted from ``Name``; rare titles are
        grouped as ``Rare``, ``Mlle``/``Ms`` become ``Miss`` and ``Mme``
        becomes ``Mrs``; finally encoded as 1-5, with 0 for unknown titles.
      * ``Age``: missing values are filled with the mean age of passengers
        sharing the same title, then with the overall median age for rows
        whose title is unknown or whose title group has no observed age.
      * ``Fare``: missing values are filled with the median fare.
      * ``Embarked``: missing values are filled with ``'S'``.
      * ``Sex`` and ``Embarked`` are mapped to integer codes.
      * ``FamilySize`` = ``SibSp`` + ``Parch``; ``IsAlone`` is 1 when
        ``FamilySize`` is 0.
      * ``FareBin``: quartile index (0-3) of ``Fare`` within this frame.

    Args:
        df: DataFrame containing at least the columns ``Name``, ``Age``,
            ``Fare``, ``Embarked``, ``Sex``, ``SibSp`` and ``Parch``.

    Returns:
        The same DataFrame object, with the columns above added or replaced.

    Raises:
        ValueError: propagated from ``pd.qcut`` when the fare quartile
            edges of this frame are not unique.
    """
    rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                   'Rev', 'Sir', 'Jonkheer', 'Dona']
    title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

    # Extract the honorific from the name. The pattern is a raw string so
    # that ``\.`` is a regex escape and not an invalid string escape.
    titles = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['Title'] = titles.replace(rare_titles, 'Rare').replace(title_aliases)

    # Fill missing ages with the mean age of the passenger's title group.
    # The overall median (taken from the observed ages only) covers rows
    # that have no title or whose title group has no known age, so no NaN
    # is left behind when at least one age is known.
    overall_age_median = df['Age'].median()
    title_age_mean = df.groupby('Title')['Age'].mean()
    df['Age'] = df['Age'].fillna(df['Title'].map(title_age_mean))
    df['Age'] = df['Age'].fillna(overall_age_median)

    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    df['Embarked'] = df['Embarked'].fillna('S')

    # Feature engineering
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    df['FamilySize'] = df['SibSp'] + df['Parch']
    df['IsAlone'] = 0
    df.loc[df['FamilySize'] == 0, 'IsAlone'] = 1

    # Encode the title; anything outside the mapping becomes 0.
    df['Title'] = df['Title'].map(title_mapping).fillna(0)

    # Bin Fare into quartiles and keep only the integer bin code.
    # NOTE(review): the bin edges are computed from the frame passed in, so
    # two frames preprocessed separately get different edges.
    df['FareBin'] = pd.qcut(df['Fare'], 4)
    df['FareBin'] = df['FareBin'].astype('category').cat.codes

    return df

# Run both splits through the same preprocessing function (train first).
train, test = preprocess(train), preprocess(test)

# Feature selection: the columns fed to the models, plus the training target.
features = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
    'FareBin', 'Embarked', 'FamilySize', 'IsAlone', 'Title',
]
X_train, y_train = train[features], train['Survived']
X_test = test[features]

# 引入多个模型
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Define the candidate models. Every estimator that accepts a seed uses
# random_state=42.
# NOTE(review): only rf, ada and gb are referenced in the code visible below;
# dt, lr, svc and knn are created but not used here — confirm whether later
# cells rely on them.
rf = RandomForestClassifier(random_state=42)
ada = AdaBoostClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
dt = DecisionTreeClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
svc = SVC(random_state=42, probability=True)
knn = KNeighborsClassifier()

# Model tuning: hyper-parameter grids explored by the grid searches.

# Random forest grid.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# AdaBoost grid.
param_grid_ada = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1],
)

# Gradient boosting grid.
param_grid_gb = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 1],
    max_depth=[3, 4, 5],
)

# Tune each ensemble member with a 5-fold, accuracy-scored grid search that
# uses every available core, and keep the best estimator found by each search.
# The searches are constructed first and fitted in the order rf, ada, gb.
grid_search_rf = GridSearchCV(rf, param_grid_rf, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_ada = GridSearchCV(ada, param_grid_ada, scoring='accuracy', cv=5, n_jobs=-1)
grid_search_gb = GridSearchCV(gb, param_grid_gb, scoring='accuracy', cv=5, n_jobs=-1)

# Random forest
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_

# AdaBoost
grid_search_ada.fit(X_train, y_train)
best_ada = grid_search_ada.best_estimator_

# Gradient boosting
grid_search_gb.fit(X_train, y_train)
best_gb = grid_search_gb.best_estimator_

# Voting classifier: soft-voting ensemble of the three tuned models.
voting_clf = VotingClassifier(
    estimators=[('rf', best_rf), ('ada', best_ada), ('gb', best_gb)],
    voting='soft'
)

# Fit on the full training set; this fitted ensemble produces the final
# test-set predictions.
voting_clf.fit(X_train, y_train)

# Model evaluation: 5-fold cross-validation of the ensemble on the training data.
# NOTE(review): the member hyper-parameters were tuned on this same data, so
# the reported score is presumably optimistic — confirm with a held-out split.
# NOTE(review): no `scoring` is passed; the printed label ("mean
# cross-validation accuracy") assumes the default metric is accuracy — confirm.
scores = cross_val_score(voting_clf, X_train, y_train, cv=5)
print(f"交叉验证平均准确率: {scores.mean():.4f}")

# Build the submission file: one row per test passenger, holding the
# passenger id and the ensemble's predicted Survived label.
predictions = voting_clf.predict(X_test)
output = test[['PassengerId']].assign(Survived=predictions)
output.to_csv('submission_optimized_ensemble.csv', index=False)

# Cell output: 交叉验证平均准确率: 0.8384  (mean cross-validation accuracy)
