### Dataset Description

In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.

### File and Data Field Descriptions:

- train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.
    - PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
    - HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
    - CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
    - Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
    - Destination - The planet the passenger will be debarking to.
    - Age - The age of the passenger.
    - VIP - Whether the passenger has paid for special VIP service during the voyage.
    - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
    - Name - The first and last names of the passenger.
    - Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.
    
test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. Your task is to predict the value of Transported for the passengers in this set.

sample_submission.csv - A submission file in the correct format.
PassengerId - Id for each passenger in the test set.
Transported - The target. For each passenger, predict either True or False.

In [1]:
# Import Libraries

# Data wrangling
import pandas as pd
import numpy as np
from collections import Counter


# Data visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# File
import os


# Remove warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the competition train and test sets from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

FileNotFoundError: ignored

In [None]:
# Sample Data
train.sample(3)

In [None]:
# Sample Data
test.sample(3)

In [None]:
# Inspect Data
train.info()

In [None]:
# Inspect Data
test.info()

In [None]:
# Report the (rows, columns) shape of each dataset
for label, frame in (("train", train), ("test", test)):
    print(f'The shape of {label} data is: \n', frame.shape)

In [None]:
# Collapse a "deck/num/side" cabin string to its deck and side letters
def cabin_split(x):
    """Return the deck and side of a cabin string joined together.

    ``x`` is split on "/" and the first and third pieces are concatenated,
    so "B/12/P" becomes "BP".  A value that has no ``split`` method (for
    example the float NaN that marks a missing cabin) is returned unchanged.
    """
    try:
        pieces = x.split('/')
    except AttributeError:
        # Not a string, so there is nothing to split; pass it through
        return x
    return str(pieces[0] + pieces[2])

# Derive the deck+side group of every cabin in both datasets
for frame in (test, train):
    frame["Cabin_grp"] = frame["Cabin"].apply(cabin_split)


In [None]:
# Forward-fill missing Cabin_grp values from the nearest earlier row
for data in [train, test]:
    # Assign the result back instead of calling fillna(method="pad",
    # inplace=True) on the attribute-accessed column: fillna(method=...) is
    # deprecated in pandas 2.x, and an in-place fill on `data.Cabin_grp` is a
    # chained operation that does not reliably write back to the frame under
    # Copy-on-Write.  ffill() is the direct equivalent of method="pad".
    # A missing value in the very first row has no predecessor and stays NaN.
    data["Cabin_grp"] = data["Cabin_grp"].ffill()

# Function to build a Cabin value from a Cabin_grp value
def fill_cabin(data):
    """Build a "deck/num/side" cabin string from a two-letter cabin group.

    The first character of ``data`` is used as the deck and the second as the
    side; the cabin number is drawn at random from 0..1499 because the real
    number is unknown.  For example "BP" may become "B/731/P".

    The previous implementation called ``data.str.split('')`` on a plain
    string, which always raised ``AttributeError`` and therefore returned the
    group unchanged instead of a cabin string.

    Parameters
    ----------
    data : str or any
        A cabin group such as "BP".  Anything that is not a string of at
        least two characters (e.g. NaN for a still-missing group) is
        returned unchanged.

    Returns
    -------
    str or the original value
    """
    if not isinstance(data, str) or len(data) < 2:
        return data
    deck, side = data[0], data[1]
    cabin_number = np.random.choice(a=1500, size=1)[0]
    return f"{deck}/{cabin_number}/{side}"

# Fill each missing Cabin from that row's Cabin_grp
for data in [test, train]:
    missing_cabin = data["Cabin"].isna()
    if missing_cabin.any():
        # One masked .loc assignment replaces the per-row chained
        # `data["Cabin"].iloc[index] = ...` writes (which may hit a temporary
        # copy) and avoids re-applying fill_cabin to the whole column for
        # every missing row.
        data.loc[missing_cabin, "Cabin"] = (
            data.loc[missing_cabin, "Cabin_grp"].apply(fill_cabin)
        )


In [None]:
# Check for NaN values
train.isnull().sum().sort_values(ascending=False)

In [None]:
# Check for NaN values
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Sample Data
train.sample(2)

In [None]:
# Sample Data
test.sample(2)

### EXPLORATORY DATA ANALYSIS


#### Categorical Variables: HomePlanet, CryoSleep, Destination, VIP


##### 1. HomePlanet

In [None]:
train.HomePlanet.value_counts(dropna=False)

In [None]:
#Mean "Transported" by HomePlanet

train[['HomePlanet', 'Transported']].groupby('HomePlanet', as_index=False).mean().sort_values(by= 'Transported', ascending=False)

# Europa, Mars, Earth

In [None]:
# Bar chart of the mean Transported value for each home planet
ax = sns.barplot(data=train, x='HomePlanet', y='Transported')
ax.set_ylabel('Transported Probability')
ax.set_title('Transported Probability by Home Planet')


##### 2. Destination

In [None]:
train['Destination'].value_counts(dropna=False)

In [None]:
# Mean "Transported" by Destination
train[["Destination", "Transported"]].groupby('Destination', as_index=False).mean().sort_values(by="Transported", ascending=False)

# 55 Cancri e, PSO J318.5-22, TRAPPIST-1e

In [None]:
# Bar chart of the mean Transported value for each destination
ax = sns.barplot(data=train, x="Destination", y="Transported")
ax.set_ylabel('Transported Probability')
ax.set_title('Transported Probability by Destination')

##### 3. CryoSleep

In [None]:
train.CryoSleep.value_counts(dropna=False)

In [None]:
# Mean "Transported" by CryoSleep

train[['CryoSleep', 'Transported']].groupby("CryoSleep", as_index=False).mean().sort_values(by="Transported", ascending=False)

# True, False

In [None]:
# Bar chart of the mean Transported value for each CryoSleep state.
# The y axis carries the probability and the description goes in the title,
# matching the HomePlanet and Destination plots (previously the probability
# label was put on the x axis and the title text on the y axis).
sns.barplot(x="CryoSleep", y="Transported", data=train)
plt.ylabel("Transported Probability")
plt.title("Transported Probability by CryoSleep")

##### 4. VIP

In [None]:
train.VIP.value_counts(dropna=False)

In [None]:
# Mean "Transported" by VIP 

train[['VIP', 'Transported']].groupby("VIP", as_index=False).mean().sort_values(by="Transported", ascending=False)

# False, True

In [None]:
# Bar chart of the mean Transported value for each VIP state.
# The y axis carries the probability and the description goes in the title,
# matching the HomePlanet and Destination plots (previously the probability
# label was put on the x axis and the title text on the y axis).
sns.barplot(x="VIP", y="Transported", data=train)
plt.ylabel("Transported Probability")
plt.title("Transported Probability by VIP")

### FILL MISSING TEST VALUES

##### Based on the EDA he performed earlier, Rasheed defined the functions below for filling missing values

In [None]:
# Helper functions for filling missing values
def fill_missing_1(data, target_column: str, cond_column1: str, cond_column2: str, cond_value1: float, cond_value2, fill):
    """Fill missing ``target_column`` values where a threshold and a match both hold.

    A row is filled with ``fill`` when ``target_column`` is missing,
    ``cond_column1`` is greater than or equal to ``cond_value1`` and
    ``cond_column2`` equals ``cond_value2``.  All other rows keep their
    current value.

    ``cond_value1`` is a numeric threshold (it is compared with ``>=``), so it
    is annotated as a number rather than ``str`` as before.

    The frame is modified in place (``target_column`` is replaced by the
    array ``np.select`` returns); nothing is returned.
    """
    missing = data[target_column].isna()
    at_or_above = data[cond_column1] >= cond_value1
    matches = data[cond_column2] == cond_value2
    selected = at_or_above & matches & missing
    data[target_column] = np.select([selected], [fill], default=data[target_column].values)

def fill_missing_2(data, target_column: str, cond_column: str, cond_value:int, fill):
    """Fill missing ``target_column`` values for rows at or below a threshold.

    A row receives ``fill`` when ``target_column`` is missing and
    ``cond_column`` is less than or equal to ``cond_value``; every other row
    keeps its current value.  ``data`` is modified in place and nothing is
    returned.
    """
    is_missing = data[target_column].isna()
    within_limit = data[cond_column] <= cond_value
    current_values = data[target_column].values
    data[target_column] = np.select(
        [within_limit & is_missing],
        [fill],
        default=current_values,
    )

def fill_missing_3(data, target_column: str, cond_column1: str, cond_column2: str, cond_value1: str, cond_value2, fill):
    """Fill missing ``target_column`` values where two columns match exactly.

    A row receives ``fill`` when ``target_column`` is missing,
    ``cond_column1`` equals ``cond_value1`` and ``cond_column2`` equals
    ``cond_value2``; every other row keeps its current value.  ``data`` is
    modified in place and nothing is returned.
    """
    is_missing = data[target_column].isna()
    first_match = data[cond_column1] == cond_value1
    second_match = data[cond_column2] == cond_value2
    current_values = data[target_column].values
    data[target_column] = np.select(
        [first_match & second_match & is_missing],
        [fill],
        default=current_values,
    )

def fill_missing_4(data, target_column: str, cond_column1: str,  cond_value1: str, fill):
    """Fill missing ``target_column`` values where one column matches exactly.

    A row receives ``fill`` when ``target_column`` is missing and
    ``cond_column1`` equals ``cond_value1``; every other row keeps its
    current value.  ``data`` is modified in place and nothing is returned.
    """
    is_missing = data[target_column].isna()
    column_match = data[cond_column1] == cond_value1
    current_values = data[target_column].values
    data[target_column] = np.select(
        [column_match & is_missing],
        [fill],
        default=current_values,
    )

In [None]:
# Check whether the train and test data have the same number of unique Cabin_grp values

len(train.Cabin_grp.unique()) == len(test.Cabin_grp.unique())

In [None]:
# Missing HomePlanet for passengers aged 40 or older:
#   cabin groups GS and GP -> Earth
#   cabin groups AP, BP, BS, CS and CP -> Europa
for grp in ["AP","BP", "BS", "CS" , "CP", "GS", "GP"]:
    planet = 'Earth' if grp in ("GS", "GP") else 'Europa'
    for frame in (train, test):
        fill_missing_1(frame, 'HomePlanet', "Age", 'Cabin_grp', 40, grp, planet)


In [None]:
# Missing ShoppingMall becomes 0 for ages up to 12 and missing VIP becomes
# False for ages up to 20
age_rules = (('ShoppingMall', 12, 0), ('VIP', 20, False))
for frame in (train, test):
    for column, max_age, default in age_rules:
        fill_missing_2(frame, column, 'Age', max_age, default)

In [None]:
# Fill Missing Values (Contd)
# HomePlanet rules keyed on (Cabin_grp, Destination); applied first.
# NOTE(review): the original listed the ('ES', '55 Cancri e', 'Europa') call
# twice in a row.  The repeat had no effect and is dropped here - confirm it
# was not meant to target a different cabin group or destination.
homeplanet_by_group_and_destination = [
    ('ES', 'TRAPPIST-1e', 'Mars'),
    ('ES', 'PSO J318.5-22', 'Earth'),
    ('ES', '55 Cancri e', 'Europa'),
    ('DS', '55 Cancri e', 'Europa'),
    ('DP', '55 Cancri e', 'Europa'),
]

# HomePlanet rules keyed on Cabin_grp alone; applied to whatever is still
# missing after the rules above.
homeplanet_by_group = [
    ('AS', 'Europa'),
    ('AP', 'Europa'),
    ('BS', 'Europa'),
    ('BP', 'Europa'),
    ('CS', 'Europa'),
    ('CP', 'Europa'),
    ('TP', 'Europa'),
    ('FS', 'Earth'),
    ('GS', 'Earth'),
    ('GP', 'Earth'),
    ('EP', 'Earth'),
]

for data in [train, test]:
    for grp, destination, planet in homeplanet_by_group_and_destination:
        fill_missing_3(data, 'HomePlanet', 'Cabin_grp', 'Destination', grp, destination, planet)

    for grp, planet in homeplanet_by_group:
        fill_missing_4(data, 'HomePlanet', 'Cabin_grp', grp, planet)

In [None]:
# Fill Missing Values (Contd)
for frame in (train, test):
    # Any HomePlanet still missing defaults to Mars
    frame['HomePlanet'] = frame['HomePlanet'].fillna('Mars')

    # CryoSleep is set to True for cabin group BS, and for groups GP and GS
    # when the destination is 55 Cancri e (Destination is still unfilled here)
    fill_missing_4(frame, 'CryoSleep', 'Cabin_grp', 'BS', True)
    for grp in ('GP', 'GS'):
        fill_missing_3(frame, 'CryoSleep', 'Cabin_grp', 'Destination', grp, '55 Cancri e', True )

    # Whatever remains missing takes a fixed default per column
    for column, default in (
        ('CryoSleep', False),
        ('VIP', False),
        ('Destination', 'TRAPPIST-1e'),
    ):
        frame[column] = frame[column].fillna(default)



In [None]:
## Group by HomePlanet, Cabin_grp and Destination, then fill with the group median

for data in [train, test]:
    for col in ['Spa', 'VRDeck', 'ShoppingMall', 'RoomService', 'Age', 'FoodCourt']:
        # transform('median') returns one value per original row, indexed like
        # the frame, so the result can be used directly in fillna.  Assigning
        # the output of groupby(...).apply(...) instead breaks on pandas >= 2.0,
        # where the group keys are prepended to the result's index.
        # Rows whose group has no median (all values missing, or a missing
        # group key) are left as they are.
        group_median = data.groupby(['HomePlanet','Cabin_grp', 'Destination'])[col].transform('median')
        data[col] = data[col].fillna(group_median)

# Fill the remaining NaN values with each column's overall median

for data in [train, test]:
    for col in ['Spa', 'VRDeck', 'ShoppingMall', 'RoomService', 'Age', 'FoodCourt']:
        median = data[col].median()
        # Assign back instead of data[col].fillna(..., inplace=True): the
        # in-place call acts on a column selected from the frame (chained
        # assignment) and does not reliably update the frame under
        # Copy-on-Write.
        data[col] = data[col].fillna(value=median)

In [None]:
# Check for NaN values
train.isnull().sum().sort_values(ascending=False)

In [None]:
# Check for NaN values
test.isnull().sum().sort_values(ascending=False)

In [None]:
# Sample Data
train.sample(5)

In [None]:
# Sample Data
test.sample(5)

In [None]:
# Report the (rows, columns) shape of each dataset after cleaning
for label, frame in (("train", train), ("test", test)):
    print(f'The shape of {label} data is: \n', frame.shape)

#### Save to csv

In [None]:
from pandas import DataFrame

# Function to make the output directory and write a DataFrame to it as CSV
def create_csv(filename:str, data:DataFrame, directory:str = "Wrangled_Data/"):
    """Write ``data`` to ``directory/filename`` as a CSV file without the index.

    Parameters
    ----------
    filename : str
        Name of the CSV file to create inside ``directory``.
    data : DataFrame
        The frame to write.
    directory : str, optional
        Target directory, created (with parents) if it does not exist.
        Defaults to "Wrangled_Data/", the location used previously.

    An existing file at the target path is overwritten.
    """
    path = os.path.join(directory, filename)

    # exist_ok=True removes the check-then-create race of testing
    # os.path.exists first; an empty directory means the current directory.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write the DataFrame to csv; to_csv opens the path in write mode by
    # default, so no separate exists/remove step is needed.
    data.to_csv(path, index=False)

In [None]:
# Save both cleaned datasets
for output_name, frame in (("train_new.csv", train), ("test_new.csv", test)):
    create_csv(output_name, frame)