In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [16]:
# Load the training and test tables from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Keep the test passenger ids separately: the id column is dropped from the
# feature frame later, but it is needed again for the submission file.
test_ids = test["PassengerId"]

In [17]:
# Preview the first rows of the raw training frame (head() defaults to five rows).
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [18]:
# Remove the identifier / free-text columns from both frames so that only
# model features (plus the target in `train`) remain.
# NOTE(review): the second printed label says "test_transaction" although the
# frame is `test` -- looks like a leftover from another notebook; confirm
# before renaming.
identifier_columns = ["PassengerId", "Name"]

train = train.drop(columns=identifier_columns)
print("train_data columns: ", len(train.columns))

test = test.drop(columns=identifier_columns)
print("test_transaction columns: ", len(test.columns))

# Dtypes and non-null counts of the remaining training columns.
train.info()

train_data columns:  12
test_transaction columns:  11
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(5)
memory usage: 755.7+ KB


In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
train.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [20]:
# Print the per-column count of missing values, first for train and then for test.
for report_title, frame in (
    ("Training set missing values", train),
    ("Test set missing values", test),
):
    print(report_title)
    print(frame.isna().sum())

Training set missing values
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Transported       0
dtype: int64
Test set missing values
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
dtype: int64


In [21]:
# Impute missing values in the categorical columns.
#
# * Fill values are derived from the TRAINING frame only and then applied to
#   both frames, so train and test are imputed identically and nothing is
#   estimated from the test set.
# * The frames are re-assigned from DataFrame.fillna instead of calling
#   `frame[col].fillna(..., inplace=True)`. That form is chained assignment on
#   a temporary Series: it raises a FutureWarning in pandas 2.x and no longer
#   updates the frame once Copy-on-Write is active (pandas 3.0).

# HomePlanet keeps the original strategy (one randomly drawn observed value is
# used for every gap), but the draw is seeded -- with the same seed value the
# split and model use -- so Restart & Run All reproduces the same result.
random_value = np.random.default_rng(42).choice(
    train["HomePlanet"].dropna().to_numpy()
)

categorical_fill_values = {
    "HomePlanet": random_value,
    "CryoSleep": train["CryoSleep"].mode()[0],    # most frequent value
    "Cabin": "Unknown",                           # explicit placeholder category
    "Destination": train["Destination"].mode()[0],
    "VIP": train["VIP"].mode()[0],
}

# A dict passed to fillna fills only the listed columns; others are untouched.
train = train.fillna(value=categorical_fill_values)
test = test.fillna(value=categorical_fill_values)

# Guard against silently un-imputed columns.
assert not train[list(categorical_fill_values)].isna().any().any()
assert not test[list(categorical_fill_values)].isna().any().any()

In [24]:
# Integer-encode the categorical feature columns. The encoder is fitted on
# train and test stacked together so both frames share one code per category.
columns_to_label_encode = ["HomePlanet", "CryoSleep", "Cabin", "Destination", "VIP"]
label_encoder = LabelEncoder()

combined_data = pd.concat([train, test], axis=0)
# The encoder is re-fitted for every column; values are cast to str first so
# mixed-type columns sort consistently inside LabelEncoder.
combined_data = combined_data.assign(
    **{
        column: label_encoder.fit_transform(combined_data[column].astype(str))
        for column in columns_to_label_encode
    }
)

# The first len(train) stacked rows belong to train, the remainder to test.
# Each slice still carries its frame's original row labels, so the
# index-aligned assignment below lines up row by row.
n_train_rows = len(train)
encoded_columns = combined_data[columns_to_label_encode]
train[columns_to_label_encode] = encoded_columns.iloc[:n_train_rows]
test[columns_to_label_encode] = encoded_columns.iloc[n_train_rows:]

# Encode the target through its string form with a separate encoder.
target_as_text = train["Transported"].astype(str)
train["Transported"] = LabelEncoder().fit_transform(target_as_text)

In [25]:
# Impute missing values in the numeric columns: median for Age, most frequent
# value for each spending column.
#
# * Statistics are computed on the TRAINING frame only and applied to both
#   frames, so train and test are imputed identically and nothing is estimated
#   from the test set.
# * The frames are re-assigned from DataFrame.fillna instead of calling
#   `frame[col].fillna(..., inplace=True)`. That form is chained assignment on
#   a temporary Series: it raises a FutureWarning in pandas 2.x and no longer
#   updates the frame once Copy-on-Write is active (pandas 3.0).
spending_columns = ["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]

numeric_fill_values = {"Age": train["Age"].median()}
for spending_column in spending_columns:
    numeric_fill_values[spending_column] = train[spending_column].mode()[0]

# A dict passed to fillna fills only the listed columns; others are untouched.
train = train.fillna(value=numeric_fill_values)
test = test.fillna(value=numeric_fill_values)

# Guard against silently un-imputed columns before the model is fitted.
assert not train[list(numeric_fill_values)].isna().any().any()
assert not test[list(numeric_fill_values)].isna().any().any()

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Separate the feature columns from the target column.
X = train.drop(columns=["Transported"])
y = train["Transported"]

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=2)
rf_model.fit(X_train, y_train)

# The hold-out split was previously created but never used, so the model's
# quality was never checked. Report its accuracy before predicting on the
# unlabelled test set.
validation_accuracy = rf_model.score(X_test, y_test)
print(f"Validation accuracy: {validation_accuracy:.4f}")

In [29]:
# Class probabilities for every test row; column 1 is the model's second class.
predictions = rf_model.predict_proba(test)

# Rows whose second-class probability reaches the threshold are labelled True.
threshold = 0.5
result_df = pd.DataFrame(
    {
        "PassengerId": test_ids,
        "Transported": predictions[:, 1] >= threshold,
    }
)

# Write the submission without the row index, then display it.
result_df.to_csv("submission.csv", index=False)
result_df

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
