In [103]:
#import libs

import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.utils import shuffle

Task of competition is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. 

Training data has been provided in the file train.csv - Personal records for about two-thirds (~8700) of the passengers, to be used as training data.

Testing data has been provided in the file test.csv - Personal records for the remaining one-third (~4300) of the passengers, to be used as test data. 

Your task is to predict the value of Transported for the passengers in this set.

In [129]:
train_data = pd.read_csv(r'C:\Users\krupa\OneDrive\Desktop\Titanic_ship_ML\data\train.csv')

train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [105]:
train_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [106]:
train_data.shape

(8693, 14)

In [107]:
train_data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [108]:
#check for missing values
print('missing values (%) per column: \n', 100*train_data.isnull().mean())

missing values (%) per column: 
 PassengerId     0.000000
HomePlanet      2.312205
CryoSleep       2.496261
Cabin           2.289198
Destination     2.093639
Age             2.059128
VIP             2.335212
RoomService     2.082135
FoodCourt       2.105142
ShoppingMall    2.392730
Spa             2.105142
VRDeck          2.162660
Name            2.300702
Transported     0.000000
dtype: float64


In [134]:
#fill the rows with missing values
test_data= test_data.dropna()
test_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
4273,9269_01,Earth,False,0,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
4274,9271_01,Mars,True,D/296/P,55 Cancri e,0.0,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
4275,9273_01,Europa,False,D/297/P,0,0.0,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


Columns with integer values include:-
-Age
-Room Service (amount billed)
-Food Court (amount billed)
-Shopping mall (amount billed)
-Spa (amount billed)
-VR Deck (amount billed)


Non-integer value columns include:-
-PassengerId 
-HomePlanet
-CryoSleep
-Cabin 
-Destination
-VIP
-Name
-Transported


Let's take a look at each non-integer column in more detail. 

In [110]:
passid=train_data['PassengerId'].nunique()
passid

8693

In [111]:
hplan=train_data['HomePlanet'].unique()
hplan

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [142]:
#as there are only three unique values for the home planet column, we can replace them with integer values for ease.

mapping_dict = {'Europa': 1, 'Earth': 2, 'Mars': 3}

train_data['HomePlanet'] = train_data['HomePlanet'].map(mapping_dict)


In [143]:
train_data['HomePlanet'].head(10)

0    1.0
1    2.0
2    1.0
3    1.0
4    2.0
5    2.0
6    2.0
7    2.0
8    2.0
9    1.0
Name: HomePlanet, dtype: float64

In [144]:
train_data['CryoSleep'].unique()


array([False, True, nan], dtype=object)

In [145]:
train_data.dtypes

PassengerId      object
HomePlanet      float64
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
dtype: object

In [137]:
train_data.dropna()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [147]:
#mapping_dict2 = {'True': 1, 'False': 2}

train_data['CryoSleep'] = train_data['CryoSleep'].fillna(0).astype(int)


In [148]:
train_data['CryoSleep']

0       0
1       0
2       0
3       0
4       0
       ..
8688    0
8689    1
8690    0
8691    0
8692    0
Name: CryoSleep, Length: 8693, dtype: int32

In [116]:
train_data['Cabin'].nunique()

6560

In [117]:
train_data['Destination'].unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [118]:
mapping_dict3 = {'TRAPPIST-1e': 1, 'PSO J318.5-22': 2, '55 Cancri e': 3}


train_data['Destination'] = train_data['Destination'].map(mapping_dict3)


In [119]:
train_data['Destination']

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
8688    3.0
8689    2.0
8690    1.0
8691    3.0
8692    1.0
Name: Destination, Length: 8693, dtype: float64

In [150]:
train_data['VIP'].unique()

array([False, True, nan], dtype=object)

In [151]:
#mapping_dict4 = {'True': 1, 'False': 2}

train_data['VIP'] = train_data['VIP'].fillna(0).astype(int)


In [152]:
train_data['VIP']

0       0
1       0
2       1
3       0
4       0
       ..
8688    1
8689    0
8690    0
8691    0
8692    0
Name: VIP, Length: 8693, dtype: int32

In [153]:
train_data.loc[train_data['VIP']==1]

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
2,0003_01,1.0,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
108,0112_01,1.0,0,B/1/S,55 Cancri e,48.0,1,0.0,2537.0,87.0,17.0,13.0,Moth Cowtale,True
120,0128_01,3.0,0,D/3/S,TRAPPIST-1e,61.0,1,2353.0,334.0,9.0,316.0,2.0,Grohs Fles,False
214,0224_01,3.0,0,F/42/S,TRAPPIST-1e,32.0,1,181.0,0.0,5.0,1634.0,0.0,Blues Queen,False
291,0321_01,,0,F/61/S,TRAPPIST-1e,59.0,1,1018.0,0.0,209.0,0.0,0.0,Quites Bache,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8579,9158_01,1.0,1,B/298/P,55 Cancri e,30.0,1,0.0,0.0,0.0,0.0,0.0,Magnon Maglible,True
8614,9194_02,1.0,0,E/603/S,TRAPPIST-1e,32.0,1,1003.0,909.0,0.0,0.0,15.0,Tachba Subwor,False
8621,9197_02,1.0,0,C/308/P,,41.0,1,0.0,7964.0,0.0,3238.0,5839.0,Aludram Platch,False
8652,9230_01,1.0,0,C/342/S,TRAPPIST-1e,36.0,1,0.0,5600.0,715.0,2868.0,971.0,,True


The next question is, which data is important and will have an influence on our final prediction (attribute- Transported). 

In [124]:
predict = 'Transported'

In [125]:
test_data = pd.read_csv(r"C:\Users\krupa\OneDrive\Desktop\Titanic_ship_ML\data\test.csv")

test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [126]:
test_data.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name'],
      dtype='object')