In [2]:
import pandas as pd 
import numpy as np 
from pandas import Series,DataFrame
import warnings
import torch
import d2l.torch as d2l
from torch import nn
from torch.utils import data
from IPython import display

In [15]:
# 去除警告框
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [4]:
# Load the raw Titanic train / test splits from disk.
data_train, data_test = map(
    pd.read_csv,
    ("../titanic/train.csv", "../titanic/test.csv"),
)

In [7]:
# Print column dtypes and non-null counts to see which columns have missing values.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [8]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
data_train.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891,891.0,204,889
unique,,,,891,2,,,,681,,147,3
top,,,,"Parkes, Mr. Francis ""Frank""",male,,,,CA. 2343,,C23 C25 C27,S
freq,,,,1,577,,,,7,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [28]:
# Feature processing for the training set:
#   1. Drop PassengerId, Name and Ticket from the features.
#   2. Standardize numeric features to zero mean / unit variance, because the
#      raw scales differ widely and that slows convergence.
#   3. Fill missing numeric values (mainly Age) with the feature mean, which is
#      0 once the feature has been standardized.
#   4. Turn Cabin into a presence flag: 1 if a cabin is recorded, else 0.
#   5. One-hot encode the string columns Sex and Embarked.
# The test data gets the same processing, but it has no Survived column.

# Split the training data into features and label. Columns are selected by name
# (not position) so the cell keeps working if the column order changes.
train_features = data_train.drop(
    columns=['PassengerId', 'Survived', 'Name', 'Ticket'])
train_label = data_train['Survived']

# Standardize numeric features. Cabin is still a string column at this point,
# so it is deliberately left out of the standardization.
numeric_features = train_features.select_dtypes(include='number').columns
numeric_block = train_features[numeric_features]
train_features[numeric_features] = \
        (numeric_block - numeric_block.mean()) / numeric_block.std()

# After standardization each feature has mean 0, so filling with 0 is
# equivalent to imputing the feature mean.
train_features[numeric_features] = train_features[numeric_features].fillna(0)

# Cabin -> 1 if present, 0 if missing. A single vectorized assignment replaces
# the former chained indexing (`df['Cabin'][mask] = ...`), which is a silent
# no-op under pandas Copy-on-Write and triggers SettingWithCopyWarning otherwise.
train_features['Cabin'] = train_features['Cabin'].notna().astype('int64')

# One-hot encode Sex and Embarked. An explicit integer dtype keeps the output
# 0/1 regardless of the pandas version (the default became bool in pandas 2.0).
train_features = pd.get_dummies(train_features, dtype='int64')

# Optional conversion of features and label to tensors:
# train_features = torch.tensor(train_features.values).float()
# train_label = torch.tensor(train_label.values).reshape(-1)
train_features.to_csv('../data/train_features.csv', index=False)
train_label.to_csv('../data/train_label.csv', index=False)

In [27]:
# Process the test data the same way as the training data.
# Drop PassengerId, Name and Ticket; columns are selected by name rather than
# by position so the cell does not depend on the column order.
test_features = data_test.drop(columns=['PassengerId', 'Name', 'Ticket'])

# Standardize numeric features with the TRAINING set's mean and std.
# Using the test set's own statistics would put train and test features on
# different scales and make the model's inputs inconsistent.
numeric_features = test_features.select_dtypes(include='number').columns
train_mean = data_train[numeric_features].mean()
train_std = data_train[numeric_features].std()
test_features[numeric_features] = \
        (test_features[numeric_features] - train_mean) / train_std

# 0 corresponds to the training mean after the standardization above, so this
# imputes missing numeric values with the training-set mean.
test_features[numeric_features] = test_features[numeric_features].fillna(0)

# Cabin -> 1 if present, 0 if missing. A single vectorized assignment replaces
# the former chained indexing (`df['Cabin'][mask] = ...`), which is a silent
# no-op under pandas Copy-on-Write and triggers SettingWithCopyWarning otherwise.
test_features['Cabin'] = test_features['Cabin'].notna().astype('int64')

# One-hot encode Sex and Embarked. An explicit integer dtype keeps the output
# 0/1 regardless of the pandas version (the default became bool in pandas 2.0).
test_features = pd.get_dummies(test_features, dtype='int64')

# Optional conversion of the features to a tensor:
# test_features = torch.tensor(test_features.values).float()
test_features.to_csv('../data/test_features.csv',index=False)