In [1]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
%matplotlib notebook

from sklearn.base            import BaseEstimator, TransformerMixin
from sklearn.pipeline        import *
from sklearn.preprocessing   import *
from sklearn.metrics         import *
from sklearn_pandas          import CategoricalImputer

In [2]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts the configured columns of a DataFrame.

    The selection is returned through ``.values`` so the following pipeline
    steps receive an array instead of a DataFrame.
    """

    def __init__(self, attribute_names):
        # Column label(s) used to index the incoming frame in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Index ``X`` with ``attribute_names`` and return the ``.values``."""
        chosen = X[self.attribute_names]
        return chosen.values
    
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode every column of a 2-D array of categorical values.

    The distinct values of each column are learned in ``fit`` (sorted with
    ``np.unique``) and reused by ``transform``.  Data transformed after
    fitting (e.g. a test set) therefore always gets the same number of output
    columns, in the same order, as the data the encoder was fitted on.
    Previously the encoding was re-learned inside every ``transform`` call, so
    a test set with a missing or extra category produced misaligned columns.

    Attributes
    ----------
    categories_ : list of ndarray
        Set by ``fit``; ``categories_[i]`` holds the sorted distinct values
        of column ``i``.
    """

    def __init__(self):
        pass

    @staticmethod
    def _column_categories(X):
        # Sorted distinct values of each column, one array per column.
        return [np.unique(X[:, col]) for col in range(X.shape[1])]

    def fit(self, X, y = None):
        """Learn the categories of each column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
        y : ignored

        Returns
        -------
        self
        """
        self.categories_ = self._column_categories(np.asarray(X))
        return self

    def transform(self, X):
        """Return the horizontally stacked one-hot encoding of ``X``.

        Each input column becomes ``len(categories_[i])`` float columns.  A
        value that was not seen during ``fit`` is encoded as an all-zero row
        for that column.

        Raises
        ------
        ValueError
            If ``X`` does not have the number of columns seen in ``fit``.
        """
        X = np.asarray(X)
        categories = getattr(self, "categories_", None)
        if categories is None:
            # Backward compatibility: the encoder used to be stateless, so a
            # direct transform() without fit() still derives the categories
            # from X itself.
            categories = self._column_categories(X)
        if X.shape[1] != len(categories):
            raise ValueError(
                "X has %d columns, but the encoder was fitted on %d"
                % (X.shape[1], len(categories))
            )
        encoded = []
        for col, cats in enumerate(categories):
            # Broadcast (n_samples, 1) against (1, n_categories): 1.0 where
            # the sample equals the category, 0.0 elsewhere.
            matches = X[:, col].reshape(-1, 1) == cats.reshape(1, -1)
            encoded.append(np.asarray(matches).astype(np.float64))
        return np.concatenate(encoded, 1)

# Reading

In [3]:
# Locations of the raw Titanic CSV files, relative to this notebook.
train_path = "../../../../Documents/datasets/titanic/train.csv"
test_path  = "../../../../Documents/datasets/titanic/test.csv"

# Load each split into its own DataFrame.
train_data = pd.read_csv(train_path)
test_data  = pd.read_csv(test_path)

# Visualization

In [None]:
# Print the column dtypes and non-null counts of the training frame.
train_data.info()

In [None]:
# Display summary statistics of the training frame's columns.
train_data.describe()

In [None]:
# One histogram per column (50 bins each) on a single large figure.
train_data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Pairwise correlation of the numeric columns.  The numeric columns are
# selected explicitly because pandas >= 2.0 no longer drops string columns
# (e.g. Sex, Embarked) silently and raises instead; on older versions the
# result is the same as a bare .corr().
train_data.select_dtypes(include=[np.number]).corr()

In [None]:
# Scatter plot of Fare against Age.
train_data.plot.scatter(x="Age", y="Fare")

# Preprocessing

In [4]:
# Numerical features
num_attribs = ["Fare", "SibSp", "Parch"]
# Categorical features
cat_attribs = ["Sex", "Embarked", "Pclass"]
# Number of one-hot columns produced per categorical feature, in the same
# order as cat_attribs: Sex -> 2, Embarked -> 3, Pclass -> 3.
# The previous [2, 2, 4] had the right total (8) but wrong per-feature counts,
# so the third Embarked column was labelled "Pclass0" and the Pclass labels
# were shifted by one.
# NOTE(review): counts assume the standard Titanic train.csv; confirm with
# train_data[cat_attribs].nunique().
num_cat_attribs = [2, 3, 3]
# Target
targets = ["Survived"]

In [5]:
# Numerical branch: select columns -> fill NaN with the median -> standardise.
num_pipe = Pipeline([
    ("selector",   DataFrameSelector(num_attribs)),
    ("imputer",    Imputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

# Categorical branch: select columns -> fill NaN -> one-hot encode -> standardise.
cat_pipe = Pipeline([
    ("selector",      DataFrameSelector(cat_attribs)),
    ("imputer",       CategoricalImputer()),
    ("label_encoder", CategoricalEncoder()),
    ("std_scaler",    StandardScaler()),
])

# Run both branches and place their outputs side by side.
full_pipe = FeatureUnion(transformer_list=[
    ("num_pipe", num_pipe),
    ("cat_pipe", cat_pipe),
])

# Fit on the training split only, then apply the fitted pipeline to the test split.
train_X = full_pipe.fit_transform(train_data)
test_X  = full_pipe.transform(test_data)

# Prepend the target column so the training matrix is [Survived | features].
train_X = np.concatenate(
    [DataFrameSelector(targets).fit_transform(train_data), train_X],
    axis=1,
)

In [6]:
# Column labels for the one-hot block: each categorical attribute name is
# suffixed with 0 .. count-1, where count is its entry in num_cat_attribs.
oh_cat_attribs = [
    attrib + str(level)
    for pos, attrib in enumerate(cat_attribs)
    for level in range(num_cat_attribs[pos])
]

# Writing

In [7]:
# Wrap the preprocessed arrays in labelled DataFrames; the training frame
# carries the target column(s) first.
feature_columns = num_attribs + oh_cat_attribs
prep_train_data = pd.DataFrame(train_X, columns=targets + feature_columns)
prep_test_data  = pd.DataFrame(test_X,  columns=feature_columns)

# Output locations, next to the raw CSV files.
train_path = "../../../../Documents/datasets/titanic/prep_train.csv"
test_path  = "../../../../Documents/datasets/titanic/prep_test.csv"

# Write both frames to disk as CSV.
prep_train_data.to_csv(train_path)
prep_test_data.to_csv(test_path)