In [None]:
!pip install scikit-learn
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler



In [None]:
# Load the Titanic passenger dataset (CSV hosted on the Stanford CS109 site)
# into a DataFrame.
url = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv'
df = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [None]:
# Report the number of missing values per column BEFORE imputation.
# print() is required here: a notebook only auto-displays the last expression of
# a cell, so a bare `df.isnull().sum()` in the middle of the cell shows nothing.
print(df.isnull().sum())

# Fill missing values in the numeric (int64/float64) columns with the column median.
# Note: the imputer returns a float array, so integer columns come back as floats.
num_features = df.select_dtypes(include=['int64', 'float64']).columns
imputer = SimpleImputer(strategy='median')
df[num_features] = imputer.fit_transform(df[num_features])

# Fill missing values in the categorical (object-dtype) columns with the most
# frequent value of each column.
cat_features = df.select_dtypes(include=['object']).columns
imputer = SimpleImputer(strategy='most_frequent')
df[cat_features] = imputer.fit_transform(df[cat_features])

# Missing-value counts AFTER imputation (displayed as the cell output).
df.isnull().sum()

Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

In [None]:
# One-hot encode the categorical (object-dtype) features.
#
# Only low-cardinality columns are encoded. A free-text column such as 'Name' has
# (almost) one distinct value per row, so one-hot encoding it would add hundreds
# of near-empty indicator columns that carry no generalisable signal.
MAX_ONEHOT_CARDINALITY = 10
onehot_features = [
    col for col in cat_features
    if df[col].nunique() <= MAX_ONEHOT_CARDINALITY
]

# drop='first' removes one level per feature to avoid the dummy-variable trap.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4.
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = pd.DataFrame(
    encoder.fit_transform(df[onehot_features]),
    columns=encoder.get_feature_names_out(onehot_features),
    index=df.index,  # keep row alignment explicit for the concat below
)

# Drop all original categorical columns (including the high-cardinality ones that
# were not encoded) and append the encoded indicator columns.
df = df.drop(columns=cat_features)
df = pd.concat([df, encoded_features], axis=1)

# Preview the transformed dataset.
df.head()



Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Name_Col. John Weir,Name_Col. Oberst Alfons Simonius-Blumer,Name_Don. Manuel E Uruchurtu,Name_Dr. Alfred Pain,...,Name_Ms. Encarnacion Reynaldo,Name_Rev. Charles Leonard Kirkland,Name_Rev. Ernest Courtenay Carter,Name_Rev. John Harper,Name_Rev. Juozas Montvila,Name_Rev. Robert James Bateman,Name_Rev. Thomas Roussel Davids Byles,Name_Sir. Cosmo Edmund Duff Gordon,Name_the Countess. of (Lucy Noel Martha Dyer-Edwards) Rothes,Sex_male
0,0.0,3.0,22.0,1.0,0.0,7.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,38.0,1.0,0.0,71.2833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,3.0,26.0,0.0,0.0,7.925,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,35.0,1.0,0.0,53.1,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,3.0,35.0,0.0,0.0,8.05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Standardize the numeric feature columns to zero mean and unit variance.
#
# 'Survived' is a 0/1 outcome column (presumably the prediction target) and must
# stay as 0/1: standardizing it would turn the labels into arbitrary floats.
TARGET_COLUMN = 'Survived'
scale_features = [col for col in num_features if col != TARGET_COLUMN]

scaler = StandardScaler()
df[scale_features] = scaler.fit_transform(df[scale_features])

# Preview the transformed dataset.
df.head()

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Name_Col. John Weir,Name_Col. Oberst Alfons Simonius-Blumer,Name_Don. Manuel E Uruchurtu,Name_Dr. Alfred Pain,...,Name_Ms. Encarnacion Reynaldo,Name_Rev. Charles Leonard Kirkland,Name_Rev. Ernest Courtenay Carter,Name_Rev. John Harper,Name_Rev. Juozas Montvila,Name_Rev. Robert James Bateman,Name_Rev. Thomas Roussel Davids Byles,Name_Sir. Cosmo Edmund Duff Gordon,Name_the Countess. of (Lucy Noel Martha Dyer-Edwards) Rothes,Sex_male
0,-0.792163,0.830524,-0.529366,0.429904,-0.474981,-0.503586,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.262366,-1.561277,0.604265,0.429904,-0.474981,0.783412,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.262366,0.830524,-0.245958,-0.475856,-0.474981,-0.49002,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.262366,-1.561277,0.391709,0.429904,-0.474981,0.417948,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.792163,0.830524,0.391709,-0.475856,-0.474981,-0.487507,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
