## Titanic - Machine Learning from Disaster

## Step1 数据预处理

In [98]:
import pandas as pd

# 读取训练集和测试集
titanic_train = pd.read_csv('train.csv')
titanic_test = pd.read_csv('test.csv')

In [99]:
titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [100]:
# 训练集中 Age 与 Embarked 无效项的填充
titanic_train["Age"] = titanic_train["Age"].fillna(titanic_train["Age"].median())
titanic_train["Fare"] = titanic_train["Fare"].fillna(titanic_train["Fare"].median())
titanic_train["Embarked"] = titanic_train["Embarked"].fillna(titanic_train["Embarked"].mode()[0])

# 测试集中 Age 与 Embarked 无效项的填充
titanic_test["Age"] = titanic_test["Age"].fillna(titanic_test["Age"].median())
titanic_test["Fare"] = titanic_test["Fare"].fillna(titanic_test["Fare"].median())
titanic_test["Embarked"] = titanic_test["Embarked"].fillna(titanic_test["Embarked"].mode()[0])

titanic_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [101]:
titanic_train

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [102]:
# 替换性别为编码：male=0,female=1
titanic_train["Sex"] = titanic_train["Sex"].map({"male": 0, "female": 1})
titanic_test["Sex"] = titanic_test["Sex"].map({"male": 0, "female": 1})

# 替换上船港口为编码
embarked_map = {"S": 0, "C": 1, "Q": 2}
titanic_train["Embarked"] = titanic_train["Embarked"].map(embarked_map)
titanic_test["Embarked"] = titanic_test["Embarked"].map(embarked_map)

titanic_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,2
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0000,,0
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,2
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,0
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",0,27.0,0,0,A.5. 3236,8.0500,,0
414,1306,1,"Oliva y Ocana, Dona. Fermina",1,39.0,0,0,PC 17758,108.9000,C105,1
415,1307,3,"Saether, Mr. Simon Sivertsen",0,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,0
416,1308,3,"Ware, Mr. Frederick",0,27.0,0,0,359309,8.0500,,0


In [103]:
selected_features = ["Pclass", "Sex", "Age", "Fare", "Embarked", "FamilySize"]

titanic_train["FamilySize"] = titanic_train["SibSp"] + titanic_train["Parch"] + 1
titanic_test["FamilySize"] = titanic_test["SibSp"] + titanic_test["Parch"] + 1

x_train = titanic_train[selected_features]
y_train = titanic_train["Survived"].values

x_test = titanic_test[selected_features]

x_train

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,3,0,22.0,7.2500,0,2
1,1,1,38.0,71.2833,1,2
2,3,1,26.0,7.9250,0,1
3,1,1,35.0,53.1000,0,2
4,3,0,35.0,8.0500,0,1
...,...,...,...,...,...,...
886,2,0,27.0,13.0000,0,1
887,1,1,19.0,30.0000,0,1
888,3,1,28.0,23.4500,0,4
889,1,0,26.0,30.0000,1,1


## Step2 基础准备

In [104]:
import numpy as np


# 加一个 bias 列（全1列）到特征中
def add_bias(X):
    return np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)


# 在 add_bias 前添加标准化（修改Step2）
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# 然后添加偏置项
X_train_b = add_bias(x_train_scaled)
X_test_b = add_bias(x_test_scaled)

## Step3 实现 sigmoid 和 loss 函数

In [105]:
def sigmoid(z):
    z = np.clip(z, -500, 500)  # 限制输入范围，避免溢出
    return 1 / (1 + np.exp(-z))


def compute_loss(y_true, y_pred):
    eps = 1e-8
    y_pred = np.clip(y_pred, eps, 1 - eps)  # 避免log(0)
    return -np.mean(y_true * np.log(y_pred + eps) + (1 - y_true) * np.log(1 - y_pred + eps))

## Step4 训练函数

In [106]:
def train_logistic_regression(x, y, lr=0.001, epochs=100):
    m, n = x.shape
    weights = np.zeros(n)

    for epoch in range(epochs):
        # 前向传播
        z = np.dot(x, weights)
        y_pred = sigmoid(z)

        # 损失
        loss = compute_loss(y, y_pred)

        # 梯度
        gradient = np.dot(x.T, (y_pred - y)) / m

        # 更新参数
        weights -= lr * gradient

        # 打印部分训练过程
        if epoch % 1000 == 0 or epoch == epochs - 1:
            print(f"Epoch {epoch}, Loss: {loss:.4f}")

    return weights

## Step5 训练模型

In [107]:
weights = train_logistic_regression(X_train_b, y_train, lr=0.01, epochs=10000)

Epoch 0, Loss: 0.6931
Epoch 1000, Loss: 0.4560
Epoch 2000, Loss: 0.4452
Epoch 3000, Loss: 0.4431
Epoch 4000, Loss: 0.4425
Epoch 5000, Loss: 0.4424
Epoch 6000, Loss: 0.4423
Epoch 7000, Loss: 0.4423
Epoch 8000, Loss: 0.4423
Epoch 9000, Loss: 0.4423
Epoch 9999, Loss: 0.4423


## Step6 模型推理

In [108]:
def predict(X, weights):
    probs = sigmoid(np.dot(X, weights))
    return (probs >= 0.5).astype(int)


y_train_pred = predict(X_train_b, weights)
accuracy = np.mean(y_train_pred == y_train)
print(f"Train Accuracy: {accuracy:.4f}")

Train Accuracy: 0.8036


In [109]:
test_preds = predict(X_test_b, weights)

# test_df 中的 PassengerId 是测试集乘客的 ID
submission = pd.DataFrame({
    "PassengerId": titanic_test["PassengerId"],
    "Survived": test_preds
})

submission.to_csv("submission.csv", index=False)
print("保存完成：submission.csv")

保存完成：submission.csv
