In [2]:
import urllib.request
import os

In [3]:
# note: need to create the directory '/data/' by yourself
url = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
file_path = "data/titanic3.xls"
if not os.path.isfile(file_path):
    result = urllib.request.urlretrieve(url, file_path)
    print('downloaded:', result)

In [4]:
import numpy
import pandas as pd
# will use pandas 'dataframe' to do preprocess

In [5]:
all_df = pd.read_excel(file_path)

In [6]:
all_df[:2]

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"


In [7]:
cols = ['survived', 'name', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
all_df = all_df[cols]

In [8]:
all_df[:2]

Unnamed: 0,survived,name,pclass,sex,age,sibsp,parch,fare,embarked
0,1,"Allen, Miss. Elisabeth Walton",1,female,29.0,0,0,211.3375,S
1,1,"Allison, Master. Hudson Trevor",1,male,0.9167,1,2,151.55,S


In [9]:
df = all_df.drop(['name'], axis=1)

In [10]:
all_df.isnull().sum()

survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64

In [11]:
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)

In [12]:
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)

In [13]:
df['sex'] = df['sex'].map( {'female':0, 'male':1} ).astype(int)

In [14]:
x_one_hot_df = pd.get_dummies( data=df, columns=["embarked"] )

In [15]:
x_one_hot_df [:2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0,0,0,211.3375,0,0,1
1,1,1,1,0.9167,1,2,151.55,0,0,1


In [16]:
# from dataframe to array
data_array = x_one_hot_df.values

In [17]:
data_array.shape

(1309, 10)

In [18]:
data_array[:2]

array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
        211.3375,   0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   1.    ,   0.9167,   1.    ,   2.    ,
        151.55  ,   0.    ,   0.    ,   1.    ]])

In [19]:
# Label
label = data_array[: , 0]
label[:2]

array([1., 1.])

In [20]:
# Feature
feature = data_array[: ,1:]
feature[:2]

array([[  1.    ,   0.    ,  29.    ,   0.    ,   0.    , 211.3375,
          0.    ,   0.    ,   1.    ],
       [  1.    ,   1.    ,   0.9167,   1.    ,   2.    , 151.55  ,
          0.    ,   0.    ,   1.    ]])

In [22]:
# use sklearn for standarization 
from sklearn import preprocessing
minmax_scale = preprocessing.MinMaxScaler( feature_range=(0,1) )
scaled_features = minmax_scale.fit_transform( feature )
scaled_features[:2]

array([[0.        , 0.        , 0.36116884, 0.        , 0.        ,
        0.41250333, 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.00939458, 0.125     , 0.22222222,
        0.2958059 , 0.        , 0.        , 1.        ]])

In [25]:
# devide into train and test dataframes
msk = numpy.random.rand( len(all_df) ) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]

print('total:', len(all_df))
print('train:', len(train_df))
print('test:', len(test_df))

total: 1309
train: 1034
test: 275


In [27]:
# define the preprocess function  
def preprocess_data(raw_df):
    df = raw_df.drop( ['name'], axis=1 )
    age_mean = df['age'].mean()
    df['age'] = df['age'].fillna(age_mean)
    fare_mean = df['fare'].mean()
    df['fare'] = df['fare'].fillna(fare_mean)
    df['sex'] = df['sex'].map( {'female':0, 'male':1} ).astype(int)
    x_one_hot_df = pd.get_dummies( data=df, columns=["embarked" ])
    
    nd_array = x_one_hot_df.values
    feature = nd_array[:, 1:]
    label = nd_array[:, 0]
    
    minmax_scale = preprocessing.MinMaxScaler( feature_range=(0,1) )
    scaled_features = minmax_scale.fit_transform( feature )
    
    return scaled_features,label

In [28]:
train_features, train_label = preprocess_data( train_df )
test_features, test_label = preprocess_data( test_df )

In [34]:
train_features[:5]

array([[0.        , 1.        , 0.00939458, 0.125     , 0.33333333,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.0229641 , 0.125     , 0.33333333,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.37369494, 0.125     , 0.33333333,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 0.        , 0.31106443, 0.125     , 0.33333333,
        0.2958059 , 0.        , 0.        , 1.        ],
       [0.        , 1.        , 0.59916476, 0.        , 0.        ,
        0.05182215, 0.        , 0.        , 1.        ]])

In [35]:
train_label[:5]

array([1., 0., 0., 0., 1.])