# Feature Imputation & Encoding

In this notebook, missing feature values are imputed using the findings of the Exploratory Data Analysis, so that as many imputations as possible are informed and logical. The data are then encoded and prepared for analysis.

In [1]:
# First start by importing the relevant packages:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test sets from CSV files addressed by relative path
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Once again define a function to subdivide the PassengerId and Cabin columns:
def column_split(data):
    """Return a copy of ``data`` with PassengerId and Cabin broken into parts.

    Added columns:
      * ``FamilyId``     - the leading run of digits of ``PassengerId``.
      * ``Cabin_deck``, ``Cabin_number``, ``Cabin_side`` - the three pieces of
        ``Cabin`` obtained by splitting on '/'.

    The original ``Cabin`` column is removed from the result. The frame passed
    in is left unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain string columns ``PassengerId`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns and without ``Cabin``.
    """
    # Work on a copy: assigning new columns directly would silently modify the
    # caller's DataFrame as a side effect.
    data = data.copy()
    # expand=False returns a Series for the single capture group
    data['FamilyId'] = data['PassengerId'].str.extract(r'(^[0-9]+)', expand=False)
    data[['Cabin_deck', 'Cabin_number', 'Cabin_side']] = data['Cabin'].str.split('/', expand=True)
    return data.drop(columns='Cabin')

In [4]:
# Apply column_split to the training and the test set
train, test = column_split(train), column_split(test)

In [5]:
# Now define the function to impute the missing data
def imputation(data_set):
    """Impute missing values using the relationships found during the EDA.

    The frame passed in is not modified; all work happens on a copy.

    Steps, in order (later steps rely on the results of earlier ones):

    1. Drop ``Name``, ``VIP``, ``Cabin_number`` and ``Cabin_side``.
    2. Fill a missing ``Cabin_deck`` with the most common known deck among the
       passengers sharing the same ``FamilyId``.
    3. Fill a still-missing ``Cabin_deck`` with 'F' when ``ShoppingMall`` > 0.
    4. Fill a missing ``HomePlanet`` with 'Europa' for decks A/B/C/T and with
       'Earth' for deck G. Observed ``HomePlanet`` values are never changed.
    5. Fill what is left of ``HomePlanet``, ``CryoSleep`` and ``Destination``
       with the hard-coded modes 'Earth', False and 'TRAPPIST-1e'.
    6. Fill a missing ``Age`` with the median age of the passenger's deck, then
       with the overall median when the deck is unknown.
    7. Drop ``FamilyId``, fill the remaining ``Cabin_deck`` gaps with 'F' and
       the missing expenditures with 0.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame as returned by ``column_split``. Must contain ``Name``, ``VIP``,
        ``Cabin_number``, ``Cabin_side``, ``Cabin_deck``, ``FamilyId``,
        ``HomePlanet``, ``CryoSleep``, ``Destination``, ``Age`` and the five
        expenditure columns.

    Returns
    -------
    pandas.DataFrame
        Imputed copy without the dropped columns and without ``FamilyId``.

    Raises
    ------
    KeyError
        If one of the columns to drop is absent.
    """
    europa_only_decks = ['A', 'B', 'C', 'T']
    expenditures = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    data = data_set.copy()
    # drop the columns that did not appear relevant based on the EDA
    data = data.drop(columns=['Name', 'VIP', 'Cabin_number', 'Cabin_side'])

    # --- Cabin_deck from family members ------------------------------------
    # Most (but technically not all) families are located near each other, so
    # a missing deck is taken from relatives whose deck is known. The lookup is
    # built once per family and applied to EVERY passenger with a missing deck,
    # so several members of one family lacking a deck are all filled.
    deck_missing = data['Cabin_deck'].isna()
    family_deck = (
        data.loc[~deck_missing]
        .groupby('FamilyId')['Cabin_deck']
        .agg(lambda decks: decks.value_counts().index[0])  # most frequent known deck
    )
    # .to_numpy() assigns by position, so no index alignment is involved
    data.loc[deck_missing, 'Cabin_deck'] = (
        data.loc[deck_missing, 'FamilyId'].map(family_deck).to_numpy()
    )

    # --- Cabin_deck from ShoppingMall spending -----------------------------
    # Passengers with ShoppingMall > 0 were likely to be on F deck. A missing
    # ShoppingMall value compares as False here and triggers nothing.
    shopper_without_deck = data['Cabin_deck'].isna() & (data['ShoppingMall'] > 0)
    data.loc[shopper_without_deck, 'Cabin_deck'] = 'F'

    # --- HomePlanet from Cabin_deck ----------------------------------------
    # Decks A, B, C & T only had passengers from Europa; deck G only had
    # passengers from Earth. Only missing planets are inferred: an observed
    # value is more trustworthy than a rule applied to a possibly guessed deck.
    planet_missing = data['HomePlanet'].isna()
    data.loc[planet_missing & data['Cabin_deck'].isin(europa_only_decks), 'HomePlanet'] = 'Europa'
    data.loc[planet_missing & (data['Cabin_deck'] == 'G'), 'HomePlanet'] = 'Earth'

    # --- Remaining categorical gaps: fall back to the mode -----------------
    data['HomePlanet'] = data['HomePlanet'].fillna('Earth')
    data['Destination'] = data['Destination'].fillna('TRAPPIST-1e')
    # Go through the nullable boolean dtype so the fill does not depend on
    # object-dtype downcasting rules; the result is a plain bool column.
    # Assumes CryoSleep holds only boolean-like values or missing entries.
    data['CryoSleep'] = data['CryoSleep'].astype('boolean').fillna(False).astype(bool)

    # --- Age ----------------------------------------------------------------
    # Rather than the full age distribution, use the median of the passenger's
    # deck. Passengers whose deck is still unknown map to NaN here.
    deck_median_age = data.groupby('Cabin_deck')['Age'].median()
    data['Age'] = data['Age'].fillna(data['Cabin_deck'].map(deck_median_age))
    # Only use the overall median when the deck gave no answer. It is computed
    # after the deck-level fill, so it includes the values imputed just above.
    data['Age'] = data['Age'].fillna(data['Age'].median())

    # FamilyId has served its purpose
    data = data.drop(columns='FamilyId')
    # F is the most common deck, so it is the fallback for unknown locations
    data['Cabin_deck'] = data['Cabin_deck'].fillna('F')
    # The mode for expenditures is 0 (and for the most part, so is the median)
    data[expenditures] = data[expenditures].fillna(0)
    return data

In [6]:
# Run the imputation on the training set and on the test set
train_impute, test_impute = imputation(train), imputation(test)

In [7]:
# Now encode the remaining categorical variables using a function
def encoding(data_set):
    """One-hot encode the HomePlanet, Destination and Cabin_deck columns.

    The first level of each encoded column is dropped (``drop_first=True``),
    so a column with k levels yields k - 1 indicator columns. All other columns
    are passed through unchanged and the input frame is not modified.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Frame containing the three categorical columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the categorical columns replaced by indicator columns.
    """
    categorical_columns = ['HomePlanet', 'Destination', 'Cabin_deck']
    return pd.get_dummies(data_set, columns=categorical_columns, drop_first=True)

In [8]:
# Encode both imputed sets. PassengerId is removed from the training set since
# it is not needed for modeling; the test set keeps the column.
train_encoded = encoding(train_impute).drop(columns = 'PassengerId')
test_encoded = encoding(test_impute)

In [9]:
# Save the imputed and encoded features to .csv files for import during modeling (uncomment the two lines below to write the files)
# train_encoded.to_csv(path_or_buf = 'train_logic_impute.csv', index = False)
# test_encoded.to_csv(path_or_buf = 'test_logic_impute.csv', index = False)