In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw train and test splits from the local data/ directory.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [3]:
# Columns deleted by remove() (its default) and by preprocessing().
REMOVE_FIELDS = [
    'PassengerId',
    'Cabin',
    'Name',
    'Ticket',
]

# Default value of preprocessing()'s `normalized` argument.
NORMALIZE_FIELDS = [
    'Pclass',
    'Age_bin',
    'Fare_bin',
    'Relations',
]

In [4]:
def remove(dataset, remove_fields=REMOVE_FIELDS):
    """Delete the listed columns from ``dataset`` in place and return it.

    Parameters
    ----------
    dataset
        Frame to strip. It is modified directly; no copy is made.
    remove_fields
        Iterable of column labels to delete. Defaults to ``REMOVE_FIELDS``.

    Returns
    -------
    The same ``dataset`` object, without the listed columns.

    Raises
    ------
    KeyError
        If a label is not present in ``dataset``.
    """
    for column in remove_fields:
        # pop() removes the column from the frame; the returned data is not needed.
        dataset.pop(column)
    return dataset

In [5]:
def missing_data_filling(dataset, field):
    """Replace missing values in one column with that column's mean.

    Parameters
    ----------
    dataset
        Frame holding the column. The column is overwritten on this object.
    field
        Label of the numeric column to fill.

    Returns
    -------
    The same ``dataset`` object, with ``dataset[field]`` free of missing values
    (provided the column has at least one non-missing value to average).
    """
    mean_val = dataset[field].mean()
    # Assign the filled Series back to the frame. Calling
    # ``dataset[field].fillna(..., inplace=True)`` is chained in-place
    # assignment: it acts on an intermediate object, triggers a FutureWarning on
    # recent pandas, and leaves the frame untouched under Copy-on-Write.
    dataset[field] = dataset[field].fillna(mean_val)
    return dataset

In [6]:
def bins(dataset, field, bins):
    """Replace a numeric column with an integer-labelled binned column.

    The new column is named ``field + '_bin'`` and the original ``field``
    column is deleted. ``dataset`` is modified in place.

    Parameters
    ----------
    dataset
        Frame holding the column to discretize.
    field
        Label of the column to bin.
    bins
        Either an integer number of bins, or a sequence of bin edges. Both are
        passed straight to ``pandas.cut``. Passing explicit edges lets several
        frames share identical bin boundaries.

    Returns
    -------
    The same ``dataset`` object, with ``field`` replaced by ``field + '_bin'``.
    Intervals are closed on the right and labelled 0, 1, 2, ...
    """
    label = field + '_bin'
    # A scalar is a bin count; a sequence of k edges delimits k - 1 intervals.
    # The labels list must match the number of intervals in both cases.
    n_intervals = bins if np.ndim(bins) == 0 else len(bins) - 1
    dataset[label] = pd.cut(
        x=dataset[field],
        bins=bins,
        labels=list(range(n_intervals)),
        right=True,
    )
    del dataset[field]
    return dataset

In [7]:
def feature_engineering(dataset):
    """Add a ``Relations`` column encoding the SibSp/Parch combination.

    Codes written to ``Relations``:
        0 -- Parch == 0 and SibSp == 0 (also used when no case matches)
        1 -- Parch > 0  and SibSp == 0
        3 -- Parch == 0 and SibSp > 0
        4 -- Parch > 0  and SibSp > 0

    NOTE(review): the codes skip 2 -- confirm this is intentional.

    ``dataset`` is modified in place and returned. The ``SibSp`` and ``Parch``
    columns are kept.
    """
    parch = dataset['Parch']
    sibsp = dataset['SibSp']

    no_parch, has_parch = parch == 0, parch > 0
    no_sibsp, has_sibsp = sibsp == 0, sibsp > 0

    # (mask, code) pairs, checked in order by np.select.
    encoding = (
        (no_parch & no_sibsp, 0),
        (has_parch & no_sibsp, 1),
        (no_parch & has_sibsp, 3),
        (has_parch & has_sibsp, 4),
    )
    masks = [mask for mask, _ in encoding]
    codes = [code for _, code in encoding]

    dataset['Relations'] = np.select(masks, codes, default=0)
    return dataset

In [8]:
def normalization(dataset, field):
    """Min-max scale one column to the range [0, 1].

    The column is first cast to ``int`` (so fractional parts are truncated),
    then rescaled as ``(value - min) / (max - min)``. ``dataset`` is modified
    in place.

    Parameters
    ----------
    dataset
        Frame holding the column to scale.
    field
        Label of the column to scale. It must be castable to ``int``.

    Returns
    -------
    The same ``dataset`` object, with ``dataset[field]`` rescaled.
    """
    values = dataset[field].astype(int)
    minimum = values.min()
    maximum = values.max()
    span = maximum - minimum

    if span == 0:
        # A constant column has no spread; map it to 0.0 rather than dividing
        # by zero and filling the column with inf/NaN.
        dataset[field] = 0.0
    else:
        # Subtract the minimum so the result starts at 0; dividing by the span
        # alone would leave the values offset by min / span.
        dataset[field] = (values - minimum) / span
    return dataset
    

In [9]:
def preprocessing(dataset, normalized=NORMALIZE_FIELDS):
    """Run the full cleaning / encoding pipeline and return the processed frame.

    Steps, in order:
      1. drop the columns listed in ``REMOVE_FIELDS``;
      2. fill missing ``Age`` and ``Fare`` values with the column mean;
      3. bin ``Age`` into 32 bins and ``Fare`` into 120 bins, replacing the
         originals with ``Age_bin`` / ``Fare_bin``;
      4. add the ``Relations`` feature;
      5. one-hot encode ``Sex`` and ``Embarked``, dropping the first level of each.

    Parameters
    ----------
    dataset
        Raw frame. It is not modified; processing is done on a copy.
    normalized
        Column labels intended for normalization. Currently unused: the
        normalization step is disabled, and the argument is kept so existing
        calls keep working.

    Returns
    -------
    A new processed frame.
    """
    # Work on a copy. The helper steps edit the frame they receive in place,
    # while get_dummies returns a new frame, so without a copy the caller's
    # frame would be left half-processed.
    dataset = dataset.copy()

    dataset = remove(dataset, REMOVE_FIELDS)
    for column in ('Age', 'Fare'):
        dataset = missing_data_filling(dataset, column)
    dataset = bins(dataset, 'Age', 32)
    dataset = bins(dataset, 'Fare', 120)
    dataset = feature_engineering(dataset)
    # Encode one column per call so the dummy columns are appended in this order.
    for column in ('Sex', 'Embarked'):
        dataset = pd.get_dummies(dataset, columns=[column], drop_first=True)

    # Normalization of the `normalized` columns is intentionally not applied here.
    return dataset

In [10]:
# Run the preprocessing pipeline on each split and rebind both names to the
# processed frames. Re-running this cell without re-running the load cell
# passes already-processed frames back in.
# NOTE(review): Age/Fare are binned by bin count separately for each frame, so
# the bin edges presumably come from that frame's own value range -- confirm
# that the resulting bin codes are comparable between train and test.
train = preprocessing(train)
test = preprocessing(test)

In [11]:
# Show the column labels of the processed test frame.
test.columns

Index(['Pclass', 'SibSp', 'Parch', 'Age_bin', 'Fare_bin', 'Relations',
       'Sex_male', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [12]:
# Write both processed frames to CSV; index=False keeps the row index out of the files.
train.to_csv('data/train_processed.csv', index = False)
test.to_csv('data/test_processed.csv', index = False)