### Loading Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import pickle
import joblib

In [2]:
# Loading in Data in the Note book
test_data = pd.read_csv("test.csv")
train_data = pd.read_csv("train.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [3]:
print(sample_submission.head(5))

  PassengerId  Transported
0     0013_01        False
1     0018_01        False
2     0019_01        False
3     0021_01        False
4     0023_01        False


In [4]:
print(train_data.head(5))

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0001_01     Europa     False  B/0/P  TRAPPIST-1e  39.0  False   
1     0002_01      Earth     False  F/0/S  TRAPPIST-1e  24.0  False   
2     0003_01     Europa     False  A/0/S  TRAPPIST-1e  58.0   True   
3     0003_02     Europa     False  A/0/S  TRAPPIST-1e  33.0  False   
4     0004_01      Earth     False  F/1/S  TRAPPIST-1e  16.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck               Name  \
0          0.0        0.0           0.0     0.0     0.0    Maham Ofracculy   
1        109.0        9.0          25.0   549.0    44.0       Juanna Vines   
2         43.0     3576.0           0.0  6715.0    49.0      Altark Susent   
3          0.0     1283.0         371.0  3329.0   193.0       Solam Susent   
4        303.0       70.0         151.0   565.0     2.0  Willy Santantines   

   Transported  
0        False  
1         True  
2        False  
3        False  
4         True  


In [5]:
print(test_data.head(5))

  PassengerId HomePlanet CryoSleep  Cabin  Destination   Age    VIP  \
0     0013_01      Earth      True  G/3/S  TRAPPIST-1e  27.0  False   
1     0018_01      Earth     False  F/4/S  TRAPPIST-1e  19.0  False   
2     0019_01     Europa      True  C/0/S  55 Cancri e  31.0  False   
3     0021_01     Europa     False  C/1/S  TRAPPIST-1e  38.0  False   
4     0023_01      Earth     False  F/5/S  TRAPPIST-1e  20.0  False   

   RoomService  FoodCourt  ShoppingMall     Spa  VRDeck              Name  
0          0.0        0.0           0.0     0.0     0.0   Nelly Carsoning  
1          0.0        9.0           0.0  2823.0     0.0    Lerome Peckers  
2          0.0        0.0           0.0     0.0     0.0   Sabih Unhearfus  
3          0.0     6652.0           0.0   181.0   585.0  Meratz Caltilter  
4         10.0        0.0         635.0     0.0     0.0   Brence Harperez  


### Classifying Data and Cleaning Data

In [6]:
# Finding Seperate Columns
columns = ["HomePlanet","CryoSleep","Destination","VIP"]
for i in columns:
    name = i
    print("[=====================", name," Distinct ======================]")
    print(list(set(train_data[name])))

['Europa', nan, 'Earth', 'Mars']
[False, True, nan]
[nan, 'TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22']
[False, True, nan]


In [7]:
# Labelling the Distinct Values using Label Encoder for ordinal Relationship
# Label Encoder is used for encoding distinct Strings as numbers
label_encoder = LabelEncoder()
columns = ["HomePlanet","CryoSleep","Destination","VIP"]
x = train_data
for i in columns:
    name = i
    x[name] = label_encoder.fit_transform(x[name])

In [8]:
# Labelling the Distinct Values using Label Encoder for ordinal Relationship
# Label Encoder is used for encoding distinct Strings as numbers
label_encoder = LabelEncoder()
columns = ["HomePlanet","CryoSleep","Destination","VIP"]
a = test_data
for i in columns:
    name = i
    a[name] = label_encoder.fit_transform(a[name])

In [9]:
# Using One Hot Encoding for Making Distinct Data into binaries
# This is suitable when there is no ordinal relationship among the categories.
columns = ["HomePlanet","CryoSleep","Destination","VIP"]
y = train_data
for i in columns:
    name = i
    y = pd.get_dummies(y, columns=[name], prefix=[name])

In [10]:
# Using One Hot Encoding for Making Distinct Data into binaries
# This is suitable when there is no ordinal relationship among the categories.
columns = ["HomePlanet","CryoSleep","Destination","VIP"]
b = test_data
for i in columns:
    name = i
    b = pd.get_dummies(b, columns=[name], prefix=[name])

In [11]:
print("Train Data Optimization - Label Encoding")
columns_to_exclude = ["Cabin","Name"]
filtered_x = x.drop(columns=columns_to_exclude)
filtered_x.head(5)

Train Data Optimization - Label Encoding


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,0001_01,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False
1,0002_01,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True
2,0003_01,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False
3,0003_02,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False
4,0004_01,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True


In [12]:
print("Test Data Optimization - Label Encoding")    
columns_to_exclude = ["Cabin","Name"]
filtered_a = a.drop(columns=columns_to_exclude)
filtered_a.head(5)

Test Data Optimization - Label Encoding


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0013_01,0,1,2,27.0,0,0.0,0.0,0.0,0.0,0.0
1,0018_01,0,0,2,19.0,0,0.0,9.0,0.0,2823.0,0.0
2,0019_01,1,1,0,31.0,0,0.0,0.0,0.0,0.0,0.0
3,0021_01,1,0,2,38.0,0,0.0,6652.0,0.0,181.0,585.0
4,0023_01,0,0,2,20.0,0,10.0,0.0,635.0,0.0,0.0


In [13]:
print("Train Data Optimization - One Hot Encoding")
columns_to_exclude = ["Cabin","Name"]
filtered_y = y.drop(columns=columns_to_exclude)
filtered_y.head(5)

Train Data Optimization - One Hot Encoding


Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_0,HomePlanet_1,...,CryoSleep_0,CryoSleep_1,CryoSleep_2,Destination_0,Destination_1,Destination_2,Destination_3,VIP_0,VIP_1,VIP_2
0,0001_01,39.0,0.0,0.0,0.0,0.0,0.0,False,False,True,...,True,False,False,False,False,True,False,True,False,False
1,0002_01,24.0,109.0,9.0,25.0,549.0,44.0,True,True,False,...,True,False,False,False,False,True,False,True,False,False
2,0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,False,False,True,...,True,False,False,False,False,True,False,False,True,False
3,0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,False,False,True,...,True,False,False,False,False,True,False,True,False,False
4,0004_01,16.0,303.0,70.0,151.0,565.0,2.0,True,True,False,...,True,False,False,False,False,True,False,True,False,False


In [14]:
print("Test Data Optimization - One Hot Encoding")
columns_to_exclude = ["Cabin","Name"]
filtered_b = b.drop(columns=columns_to_exclude)
filtered_b.head(5)

Test Data Optimization - One Hot Encoding


Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_0,HomePlanet_1,HomePlanet_2,...,CryoSleep_0,CryoSleep_1,CryoSleep_2,Destination_0,Destination_1,Destination_2,Destination_3,VIP_0,VIP_1,VIP_2
0,0013_01,27.0,0.0,0.0,0.0,0.0,0.0,True,False,False,...,False,True,False,False,False,True,False,True,False,False
1,0018_01,19.0,0.0,9.0,0.0,2823.0,0.0,True,False,False,...,True,False,False,False,False,True,False,True,False,False
2,0019_01,31.0,0.0,0.0,0.0,0.0,0.0,False,True,False,...,False,True,False,True,False,False,False,True,False,False
3,0021_01,38.0,0.0,6652.0,0.0,181.0,585.0,False,True,False,...,True,False,False,False,False,True,False,True,False,False
4,0023_01,20.0,10.0,0.0,635.0,0.0,0.0,True,False,False,...,True,False,False,False,False,True,False,True,False,False


In [15]:
# Label Encoded Data
print("Label Encoded - Train Data")
print(filtered_x.isnull().sum())
print("Label Encoded - Test Data")
print(filtered_a.isnull().sum())
print("One Hot Encoded - Train Data")
print(filtered_y.isnull().sum())
print("One Hot Encoded - Test Data") 
print(filtered_b.isnull().sum())

Label Encoded - Train Data
PassengerId       0
HomePlanet        0
CryoSleep         0
Destination       0
Age             179
VIP               0
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64
Label Encoded - Test Data
PassengerId       0
HomePlanet        0
CryoSleep         0
Destination       0
Age              91
VIP               0
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64
One Hot Encoded - Train Data
PassengerId        0
Age              179
RoomService      181
FoodCourt        183
ShoppingMall     208
Spa              183
VRDeck           188
Transported        0
HomePlanet_0       0
HomePlanet_1       0
HomePlanet_2       0
HomePlanet_3       0
CryoSleep_0        0
CryoSleep_1        0
CryoSleep_2        0
Destination_0      0
Destination_1      0
Destination_2      0
Destination_3      0
VIP_0              0
VIP_1       

In [16]:
filtered_x.fillna(-1,inplace=True)
filtered_y.fillna(-1,inplace=True)
filtered_a.fillna(-1,inplace=True)
filtered_b.fillna(-1,inplace=True)

In [17]:
# Load your dataset - Label Encoded
data = filtered_x

# Separate features and target variable
X = data.drop(columns=['Transported'], axis=1)
Y = data['Transported']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor model
model = RandomForestRegressor()

# Train the model
model.fit(X_train, Y_train)

# Make predictions on the test set
label_encoded_trained = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(Y_test, label_encoded_trained)
print("Label Encoded Trained Model")
print("Mean Squared Error:", mse)

with open('model_titanic_spaceship_label_encoded.pkl', 'wb') as file:
    pickle.dump(model, file)

Label Encoded Trained Model
Mean Squared Error: 0.15943778033352501


In [18]:
# Load your dataset - One Hot Encoded
data = filtered_y

# Separate features and target variable
X = data.drop(columns=['Transported'], axis=1)
y = data['Transported']

# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor model
model = RandomForestRegressor()

# Train the model
model.fit(X_train, Y_train)

# Make predictions on the test set
one_hot_trained = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(Y_test, one_hot_trained)
print("One Hot Encoded Trained Model")
print("Mean Squared Error:", mse)

with open('model_titanic_spaceship_one_hot_encoded.pkl', 'wb') as file:
    pickle.dump(model, file)

One Hot Encoded Trained Model
Mean Squared Error: 0.16018625646923518


In [19]:
# Running on Test Data - Label Encoded
# Load the new dataset
new_data = filtered_a
model = joblib.load('model_titanic_spaceship_label_encoded.pkl')

# Make predictions on the new dataset
predictions = model.predict(new_data)

# Making a Threshold
threshold = 0.5

# Convert predictions to binary outcomes based on the threshold
binary_predictions = [True if pred >= threshold else False for pred in predictions]

In [20]:
all_ids = test_data["PassengerId"]
data = ({
    "PassengerId": all_ids,
    "Transported": binary_predictions
})
if len(all_ids) == len(binary_predictions):
    df = pd.DataFrame(data)
    df.to_csv("submission-titanic-spaceship-competition.csv",index=False)
else:
    print("Mismatching Data",len(all_ids),len(binary_predictions))

out = pd.read_csv("submission-titanic-spaceship-competition.csv")
out

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True


In [21]:
# Running on Test Data - Label Encoded
# Load the new dataset
new_data = filtered_b
model = joblib.load('model_titanic_spaceship_one_hot_encoded.pkl')

# Make predictions on the new dataset
predictions = model.predict(new_data)

# Making a Threshold
threshold = 0.5

# Convert predictions to binary outcomes based on the threshold
binary_predictions1 = [True if pred >= threshold else False for pred in predictions]

In [22]:
all_ids1 = test_data["PassengerId"]
data = ({
    "PassengerId": all_ids1,
    "Transported": binary_predictions1
})
if len(all_ids1) == len(binary_predictions1):
    df1 = pd.DataFrame(data)
    df1.to_csv("submission-titanic-spaceship-competition-1.csv",index=False)
else:
    print("Mismatching Data",len(all_ids1),len(binary_predictions1))
    
out1 = pd.read_csv("submission-titanic-spaceship-competition-1.csv")
out1

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
