In [12]:
import numpy as np
import pandas as pd
import math
from sklearn import preprocessing

In [13]:
# Extra tokens that read_csv should parse as missing values, shared by both files.
na_tokens = ['', ' ', 'na', 'nan']
df_train_raw = pd.read_csv('train.csv', na_values=na_tokens)
df_test_raw = pd.read_csv('test.csv', na_values=na_tokens)

In [14]:
# Column dtypes of the raw training data.
print(df_train_raw.dtypes)

##### Check missing-value counts ######
# A notebook cell only renders its LAST bare expression, so the training
# counts are printed explicitly; otherwise they are computed and discarded.
print(df_train_raw.isna().sum())
# train: 891 rows
# missing -> Age: 177, Cabin: 687, Embarked: 2
df_test_raw.isna().sum()
# test: 418 rows
# missing -> Age: 86, Fare: 1, Cabin: 327
###########################

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [15]:
def _dense_one_hot_encoder():
    """Build a OneHotEncoder that returns a dense array on old and new scikit-learn."""
    try:
        return preprocessing.OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        return preprocessing.OneHotEncoder(sparse=False)


def one_hot_encoding(df, feature_list, method='pandas', nan_drop_boolean=True, original_feature_drop_boolean=True):
    """One-hot encode the given columns and return a new DataFrame.

    The input frame is copied and never modified. Every generated column is
    named ``<feature>_<category>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``feature_list``.
    feature_list : list of str
        Names of the columns to encode.
    method : str
        ``'sklearn'`` encodes with ``preprocessing.OneHotEncoder``; any other
        value (default ``'pandas'``) encodes with ``pd.get_dummies``.
    nan_drop_boolean : bool
        If truthy, no indicator column for missing values is kept. If falsy,
        a missing-value indicator column is part of the result.
    original_feature_drop_boolean : bool
        If truthy, each encoded source column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the indicator columns appended (float values).
    """
    p_df = df.copy()

    if method == 'sklearn':  # Using scikit-learn
        for feature in feature_list:
            # Casting to str is what turns missing values into the literal
            # category 'nan' that is optionally dropped below.
            p_df[feature] = p_df[feature].astype(str)
            exist_nan = bool((p_df[feature] == 'nan').any())

            one_hot_encoder = _dense_one_hot_encoder()
            one_hot_encoded = one_hot_encoder.fit_transform(p_df[feature].to_numpy().reshape(-1, 1))

            # Name the columns from the fitted categories and reuse p_df's
            # index so the concat aligns row-by-row even when the index is
            # not a plain 0..n-1 range.
            column_names = [feature + '_' + str(category) for category in one_hot_encoder.categories_[0]]
            df_one_hot_encoded = pd.DataFrame(one_hot_encoded, index=p_df.index, columns=column_names)
            p_df = pd.concat([p_df, df_one_hot_encoded], axis=1)

            if nan_drop_boolean and exist_nan:
                p_df = p_df.drop(columns=[feature + '_nan'])
            if original_feature_drop_boolean:
                p_df = p_df.drop(columns=[feature])

    else:  # Default - pandas get_dummies
        for feature in feature_list:
            df_one_hot_encoded = pd.get_dummies(p_df[feature], dummy_na=not nan_drop_boolean).astype(float)

            # Assign the labels positionally instead of renaming through a
            # dict: a NaN category label is not a reliable dict key.
            df_one_hot_encoded.columns = [feature + '_' + str(category) for category in df_one_hot_encoded.columns]

            p_df = pd.concat([p_df, df_one_hot_encoded], axis=1)

            if original_feature_drop_boolean:
                p_df = p_df.drop(columns=[feature])

    return p_df


def cabin_only_letter(df, feature_return, original_feature_drop_boolean=True):
    """Reduce each ``Cabin`` value to its first character.

    String cabin values are replaced by their first character; non-string
    values (e.g. NaN) and empty strings are passed through unchanged. The
    input frame is copied and never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Cabin'`` column.
    feature_return : str
        Name of the column that receives the result.
    original_feature_drop_boolean : bool
        If truthy, the ``'Cabin'`` column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``feature_return`` added.
    """
    p_df = df.copy()
    # Write to `feature_return` itself; a hard-coded column name here would
    # break for every other requested output name.
    p_df[feature_return] = p_df['Cabin'].map(
        lambda cabin: cabin[0] if isinstance(cabin, str) and cabin else cabin
    )
    if original_feature_drop_boolean:
        p_df = p_df.drop(columns=['Cabin'])
    return p_df


def name_title(df, feature_return, original_feature_drop_boolean=True):
    """Extract the honorific from ``Name`` into a new column.

    The title is the last space-separated token before the first ``'.'`` in
    the name. Only 'Mr', 'Miss', 'Mrs' and 'Master' are kept; any other
    title, and any missing or non-string name, becomes NaN. The input frame
    is copied and never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``'Name'`` column.
    feature_return : str
        Name of the column that receives the title.
    original_feature_drop_boolean : bool
        If truthy, the ``'Name'`` column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``feature_return`` added.
    """
    kept_titles = ('Mr', 'Miss', 'Mrs', 'Master')

    def _title_of(name):
        # Missing names cannot be split; treat them like an unknown title.
        if not isinstance(name, str):
            return np.nan
        title = name.split('.')[0].split(' ')[-1]
        return title if title in kept_titles else np.nan

    p_df = df.copy()
    p_df[feature_return] = [_title_of(name) for name in p_df['Name']]
    if original_feature_drop_boolean:
        p_df = p_df.drop(columns=['Name'])
    return p_df

def age_to_int(df, feature_return, na_treat='mean', na_check_feature=True, original_feature_drop_boolean=True):
    """Keep whole-number ages and replace every other age with a fill value.

    An age is kept only when it is present and has no fractional part.
    Missing ages and fractional ages are replaced according to ``na_treat``.
    The input frame is copied and never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``'Age'`` column.
    feature_return : str
        Name of the column that receives the cleaned ages.
    na_treat : str
        ``'na'`` fills with NaN, ``'max'`` / ``'min'`` with the maximum /
        minimum of ``'Age'``; any other value (default ``'mean'``) fills with
        the mean of ``'Age'``. The statistic is taken over the whole column.
    na_check_feature : bool
        Accepted for backward compatibility and currently without effect.
        NOTE(review): the previous implementation wrote the validity flag to
        ``feature_return`` and immediately overwrote it with the cleaned
        ages, so the flag never reached the output. That observable result is
        preserved; exposing the flag needs an agreed column name.
    original_feature_drop_boolean : bool
        If truthy, the ``'Age'`` column is removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``feature_return`` added.
    """
    p_df = df.copy()
    age = p_df['Age']

    # Proper boolean mask: True only for present ages with no fractional part.
    # (NaN never equals its truncation, notna() just makes that explicit.)
    is_whole = age.notna() & age.eq(np.trunc(age))

    if na_treat == 'na':
        fill_value = np.nan
    elif na_treat == 'max':
        fill_value = age.max()
    elif na_treat == 'min':
        fill_value = age.min()
    else:
        fill_value = age.mean()

    p_df[feature_return] = age.where(is_whole, fill_value)

    if original_feature_drop_boolean:
        p_df = p_df.drop(columns=['Age'])
    return p_df


def scaling(df, feature_list, scaler_option='minmax', original_feature_drop_boolean=True):
    """Scale the given columns and return a new DataFrame.

    Each column in ``feature_list`` is fitted and transformed on its own and
    stored as ``<feature>_scaled``. The input frame is copied and never
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``feature_list``.
    feature_list : list of str
        Names of the columns to scale.
    scaler_option : str
        ``'std'`` uses ``preprocessing.StandardScaler``; any other value
        (default ``'minmax'``) uses ``preprocessing.MinMaxScaler``.
    original_feature_drop_boolean : bool
        If truthy, the unscaled source columns are removed from the result.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the scaled columns appended.
    """
    if scaler_option == 'std':
        scaler_class = preprocessing.StandardScaler
    else:
        scaler_class = preprocessing.MinMaxScaler

    p_df = df.copy()
    for feature in feature_list:
        column_values = np.array(df[feature]).reshape(-1, 1)
        # fit_transform returns an (n, 1) array; flatten it so a plain 1-D
        # column is assigned.
        p_df[feature + '_scaled'] = scaler_class().fit_transform(column_values).ravel()

    if original_feature_drop_boolean:
        # list(...) so a tuple of names is not mistaken for a single label.
        p_df = p_df.drop(columns=list(feature_list))
    return p_df


def pre_train_test_merge(train, test, target, split_key):
    """Stack train and test rows into one frame, tagging the test rows.

    The test rows get ``split_key`` as their value in the ``target`` column so
    they can be separated again later. The result keeps exactly the columns
    of ``train``, in ``train``'s order, with a fresh 0..n-1 index. Neither
    input frame is modified.
    """
    tagged_test = test.assign(**{target: split_key})
    stacked = pd.concat([train, tagged_test], axis=0, sort=True, ignore_index=True)
    # concat(sort=True) orders columns alphabetically; restore train's layout.
    return stacked[train.columns]


def pre_train_test_split(merged_data, target, split_key, target_type):
    """Split a merged frame back into its train and test parts.

    Rows whose ``target`` value equals ``split_key`` form the test part (with
    the ``target`` column removed); all other rows form the train part, whose
    ``target`` column is cast to ``target_type``. Both parts get a fresh
    0..n-1 index and ``merged_data`` itself is left untouched.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)``.
    """
    train_rows = merged_data[target] != split_key
    test_rows = merged_data[target] == split_key

    train_part = (
        merged_data[train_rows]
        .reset_index(drop=True)
        .astype({target: target_type})
    )
    test_part = (
        merged_data[test_rows]
        .reset_index(drop=True)
        .drop(columns=[target])
    )
    return train_part, test_part

# df_merged = pre_train_test_merge(df_train_raw, df_test_raw, "Survived", "test_data")
# df_train, df_test = pre_train_test_split(df_merged, "Survived", "test_data", int)

In [18]:
# Merge train and test into one frame (test rows carry "test_data" in the
# Survived column), run every preprocessing step on it, then drop the
# identifier columns.
df_merged = (
    pre_train_test_merge(df_train_raw, df_test_raw, "Survived", "test_data")
    .pipe(cabin_only_letter, 'Cabin_str')
    .pipe(name_title, 'Name_title')
    .pipe(age_to_int, 'Age_int', na_treat='mean')
    .pipe(scaling, ['Age_int', 'Fare'])
    .pipe(
        one_hot_encoding,
        ['Embarked', 'Cabin_str', 'Name_title', 'Sex'],
        method='pandas',
        nan_drop_boolean=True,
    )
    .drop(['PassengerId', 'Ticket'], axis=1)
)

# Separate the rows again using the same marker value.
df_train, df_test = pre_train_test_split(df_merged, "Survived", "test_data", int)

# Write both processed frames without the index column.
for frame, out_path in ((df_train, 'train_hongkyu.csv'), (df_test, 'test_hongkyu.csv')):
    frame.to_csv(out_path, index=False)