In [73]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer 

In [74]:
# Load the training and test CSV files from the working directory into DataFrames.
data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [75]:
# Preview the first five rows of the training frame.
data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [76]:

# Count the missing values in each column of the test frame.
test_data.isna().sum()

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64

In [77]:
# Names of the columns that contain at least one missing value.
# Training data:
missing_cols = data.columns[data.isnull().any()].tolist()
# Test data: inspect test_data itself. The original looked up each test column
# name in `data`, so the resulting list described the training frame instead.
test_missing_cols = test_data.columns[test_data.isnull().any()].tolist()
test_missing_cols

['HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Name']

In [78]:
# Fill missing values in the numeric columns with the column mean.
# NOTE(review): 'VIP' is a True/False flag in the raw data, so a mean fill
# presumably gives it a fractional value where it was missing; a
# most-frequent fill may be a better match for that column.
num_cols = ['Age','VIP','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
imputer = SimpleImputer(strategy = 'mean')
# Train set: learn the per-column means here and fill the gaps.
data[num_cols] = imputer.fit_transform(data[num_cols])
# Test set: reuse the means learned on the training data so both frames are
# filled with the same statistics (the original refit a second imputer on the
# test set, giving it different fill values).
test_data[num_cols] = imputer.transform(test_data[num_cols])

In [79]:
# Fill missing categorical values in both frames with the most frequent
# value of the corresponding training column.
cate_cols = ['HomePlanet','CryoSleep','Cabin','Destination','Name']
for i in cate_cols:
    # mode() returns every tied most-frequent value; [0] takes the first one.
    most_frequent = data[i].mode()[0]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: that chained in-place form is deprecated and leaves
    # the frame unchanged when pandas Copy-on-Write is active.
    data[i] = data[i].fillna(most_frequent)
    # The test set is filled with the training mode, exactly as before.
    test_data[i] = test_data[i].fillna(most_frequent)

# Remaining missing values per column in the test frame.
test_data.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
dtype: int64

In [80]:
# Take a copy of the test frame and keep a handle on its PassengerId column,
# then preview the test frame.
for_display = test_data.copy(deep=True)
passID_test = for_display.loc[:, 'PassengerId']
test_data.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,0.0,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,0.0,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,0.0,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [81]:
# Data preprocessing.
# Remove PassengerId, Cabin and Name from the training frame: the author
# treats PassengerId and Name as irrelevant to the model and Cabin as too
# complex in format to use.
new_ds = data.drop(columns=['PassengerId','Cabin','Name'])
# Same columns removed from the test set.
new_testSet = test_data.drop(columns=['PassengerId','Cabin','Name'])


In [82]:
# Preview the first five rows of the reduced test frame.
new_testSet.head(n=5)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,Earth,True,TRAPPIST-1e,27.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Earth,False,TRAPPIST-1e,19.0,0.0,0.0,9.0,0.0,2823.0,0.0
2,Europa,True,55 Cancri e,31.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Europa,False,TRAPPIST-1e,38.0,0.0,0.0,6652.0,0.0,181.0,585.0
4,Earth,False,TRAPPIST-1e,20.0,0.0,10.0,0.0,635.0,0.0,0.0


In [83]:
# One-hot encode the two categorical features of the training frame,
# dropping the first level of each.
homeplanet_dummies = pd.get_dummies(new_ds['HomePlanet'], prefix='HomePlanet', drop_first=True)
Destination_dummies = pd.get_dummies(new_ds['Destination'], prefix='Destination', drop_first=True)
# Test set: encode every level, then keep exactly the training dummy columns.
# Encoding the test frame on its own with drop_first=True only produces the
# same columns when both frames contain the same categories. Reindexing makes
# the layouts identical: a level absent from the test set becomes an all-zero
# column, and a level absent from the training dummies is discarded.
homeplanet_test_dummies = (
    pd.get_dummies(new_testSet['HomePlanet'], prefix='HomePlanet')
    .reindex(columns=homeplanet_dummies.columns, fill_value=0)
)
Destination_test_dummies = (
    pd.get_dummies(new_testSet['Destination'], prefix='Destination')
    .reindex(columns=Destination_dummies.columns, fill_value=0)
)

In [84]:
# Append the HomePlanet dummy columns on the right, then remove the raw column.
new_ds = pd.concat([new_ds, homeplanet_dummies], axis='columns')
new_ds = new_ds.drop(columns='HomePlanet')
# Test set
new_testSet = pd.concat([new_testSet, homeplanet_test_dummies], axis='columns')
new_testSet = new_testSet.drop(columns='HomePlanet')

In [85]:
# Append the Destination dummy columns on the right, then remove the raw column.
new_ds = pd.concat([new_ds, Destination_dummies], axis='columns')
new_ds = new_ds.drop(columns='Destination')
# Test set
new_testSet = pd.concat([new_testSet, Destination_test_dummies], axis='columns')
new_testSet = new_testSet.drop(columns='Destination')

In [86]:
# Report any repeated column labels in the training frame.
train_dup_mask = new_ds.columns.duplicated()
print("Duplicate Columns:\n", new_ds.columns[train_dup_mask])
# Keep only the first occurrence of each column label.
new_ds = new_ds.loc[:, ~train_dup_mask]
# Test set: same clean-up, then preview the result.
test_dup_mask = new_testSet.columns.duplicated()
new_testSet = new_testSet.loc[:, ~test_dup_mask]
new_testSet.head()

Duplicate Columns:
 Index([], dtype='object')


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,True,27.0,0.0,0.0,0.0,0.0,0.0,0.0,False,False,False,True
1,False,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,False,False,False,True
2,True,31.0,0.0,0.0,0.0,0.0,0.0,0.0,True,False,False,False
3,False,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,True,False,False,True
4,False,20.0,0.0,10.0,0.0,635.0,0.0,0.0,False,False,False,True


In [87]:
# Continue on copies of the encoded training and test frames.
copy_ds, copy_test_ds = new_ds.copy(), new_testSet.copy()
# Missing values per column in the test copy.
copy_test_ds.isna().sum()

CryoSleep                    0
Age                          0
VIP                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
HomePlanet_Europa            0
HomePlanet_Mars              0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
dtype: int64

In [88]:
# Cast the one-hot dummy columns to integers in both frames.
dummy_cols = [
    'HomePlanet_Europa',
    'HomePlanet_Mars',
    'Destination_PSO J318.5-22',
    'Destination_TRAPPIST-1e',
]
copy_ds[dummy_cols] = copy_ds[dummy_cols].astype(int)
copy_test_ds[dummy_cols] = copy_test_ds[dummy_cols].astype(int)

In [89]:

# Preview the first five rows of the test copy after the integer cast.
copy_test_ds.head(n=5)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,True,27.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,1
1,False,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,0,0,0,1
2,True,31.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,0
3,False,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,1,0,0,1
4,False,20.0,0.0,10.0,0.0,635.0,0.0,0.0,0,0,0,1


In [90]:
# Separate the target column from the training features; the test copy is
# used as the test feature matrix as-is.
y = copy_ds['Transported']
X = copy_ds.drop(columns=['Transported'])
X_test = copy_test_ds

In [91]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [92]:
# Random forest classifier with 100 trees and a fixed random_state.
model = RandomForestClassifier(
    random_state=2,
    n_estimators=100,
)

In [97]:
# Score the model with 5-fold cross-validation, using accuracy as the metric.
cross_val = cross_val_score(model, X, y, scoring="accuracy", cv=5)
print("Cross-validation scores:", cross_val)
# Average accuracy over the folds.
mean = np.mean(cross_val)
print(mean)

Cross-validation scores: [0.78033353 0.77515814 0.77803335 0.78250863 0.78826237]
0.7808592031053652


In [94]:
# Train on the full training set and predict the test set in one chain
# (fit() returns the fitted estimator); the predictions are cast to bool.
predict = model.fit(X, y).predict(X_test).astype(bool)

In [95]:
# Show the type and length of the test frame copy and of the predictions,
# so the two can be compared before building the output file.
for checked in (for_display, predict):
    print(type(checked), len(checked))

<class 'pandas.core.frame.DataFrame'> 4277
<class 'numpy.ndarray'> 4277


In [98]:
# Assemble the output table: one predicted label per test PassengerId.
df_sum = pd.DataFrame({
    'PassengerId':passID_test,
    'Transported':predict
})
# Write the file in a separate step: to_csv() returns None when given a path,
# so chaining it onto the constructor left df_sum bound to None.
# NOTE(review): 'summision.csv' looks like a misspelling of 'submission.csv';
# the name is kept so the output location does not change.
df_sum.to_csv('summision.csv',index = False)