# 0. Importing Libraries & Data

In [3]:
import os
import warnings

import numpy as np
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from xgboost import XGBClassifier

# Notebook-wide settings.
# Show every column when a DataFrame is rendered (no "..." truncation).
pd.options.display.max_columns = None
# Silence all Python warnings for the rest of the session.
# NOTE(review): this is a global filter, so deprecation and convergence
# warnings from any library are hidden as well.
warnings.simplefilter("ignore")

This dataset can be found at https://www.kaggle.com/competitions/spaceship-titanic 
***
#### Data Fields:
__PassengerId__ <br>
A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.<br>
__HomePlanet__ <br>
The planet the passenger departed from, typically their planet of permanent residence.<br>
__CryoSleep__ <br>
Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.<br>
__Cabin__ <br>
The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.<br>
__Destination__ <br>
The planet the passenger will be debarking to.<br>
__Age__ <br>
The age of the passenger.<br>
__VIP__ <br>
Whether the passenger has paid for special VIP service during the voyage.<br>
__RoomService, FoodCourt, ShoppingMall, Spa, VRDeck__ <br>
Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.<br>
__Name__ <br>
The first and last names of the passenger.<br>
__Transported__ <br>
Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.<br>

In [5]:
# Location of the Kaggle "Spaceship Titanic" training CSV.
# The default is the original author's local path. Set the TITANIC_TRAIN_CSV
# environment variable to point at your own copy so the notebook runs on other
# machines without editing this cell. An unset or empty variable falls back to
# the default.
TRAIN_CSV_PATH = os.environ.get("TITANIC_TRAIN_CSV") or (
    "/home/mmmarinov/ProjectPortfolio/1. Titanic_SpaceShip - Binary Classification/Titanic_SpaceShip_Train_Data.csv"
)

# Raw training data; later cells work on copies so this frame stays untouched.
titanic_data = pd.read_csv(TRAIN_CSV_PATH)

# 1. Investigate the Data

## 1.1 High-level understanding
This section gives a general sense of what kind of information the dataset holds.

In [6]:
# Preview the first five rows to see what each column looks like.
titanic_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
# Column dtypes and non-null counts: shows which columns have missing values
# and which are stored as generic `object` rather than a numeric/bool dtype.
titanic_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [8]:
# Summary statistics for every column: count/unique/top/freq for the
# non-numeric columns, and mean/std/quantiles for the numeric ones.
titanic_data.describe(include="all")

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
count,8693,8492,8476,8494,8511,8514.0,8490,8512.0,8510.0,8485.0,8510.0,8505.0,8493,8693
unique,8693,3,2,6560,3,,2,,,,,,8473,2
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,,False,,,,,,Gollux Reedall,True
freq,1,4602,5439,8,5915,,8291,,,,,,2,4378
mean,,,,,,28.82793,,224.687617,458.077203,173.729169,311.138778,304.854791,,
std,,,,,,14.489021,,666.717663,1611.48924,604.696458,1136.705535,1145.717189,,
min,,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,
25%,,,,,,19.0,,0.0,0.0,0.0,0.0,0.0,,
50%,,,,,,27.0,,0.0,0.0,0.0,0.0,0.0,,
75%,,,,,,38.0,,47.0,76.0,27.0,59.0,46.0,,


Numerical: <br>
- PassengerId (if separated into group and passenger number)
- Age (#)
- RoomService ($)
- FoodCourt ($)
- ShoppingMall ($)
- Spa ($)
- VRDeck ($)

Categorical: <br>
- HomePlanet (Unique: 3)
- CryoSleep (T/F)
- Cabin (Deck and Side)
- Destination (Unique: 3)
- VIP (T/F)
- Name (Mostly Unique)

Target Variable: <br>
- Transported (T/F)

## 1.2 Explore the Data & Analysis

In [9]:
# Work on an independent (deep) copy during exploration so the raw frame
# loaded from disk is never modified.
titanic_data_eda = titanic_data.copy(deep=True)

In [11]:
# Convert the True/False categorical columns to numeric (1.0 / 0.0) so they can
# be analysed alongside the other numeric features. float64 is used rather than
# an integer dtype so that missing values can stay as NaN.
titanic_data_eda = titanic_data_eda.astype(
    {'CryoSleep': 'float64', 'VIP': 'float64'}
)

Unnamed: 0,CryoSleep,VIP
0,0.0,0.0
1,0.0,0.0
2,0.0,1.0
3,0.0,0.0
4,0.0,0.0
...,...,...
8688,0.0,1.0
8689,1.0,0.0
8690,0.0,0.0
8691,0.0,0.0
