![il_fullxfull.3860244894_p9az.avif](attachment:b0cdebe8-56df-4c67-a18f-bbd14361328d.avif)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler,MinMaxScaler,OrdinalEncoder,LabelEncoder,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,BaggingClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from scipy.stats.mstats import winsorize
from scipy.stats import trim_mean

In [2]:
data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')

# *Quick peek on the data*

In [3]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
data.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [5]:
data.shape

(8693, 14)

In [6]:
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

# *Dropping unecessary colum*

In [7]:
data.drop(['Name'],axis=1,inplace=True)

# *Handling the Null values*

In [8]:
numerical_col = data.select_dtypes(exclude=['object','bool']).columns.tolist()
numerical_col

['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [9]:
imuter = SimpleImputer(strategy='mean')
num_without_nulls = pd.DataFrame(imuter.fit_transform(data[numerical_col]),columns=numerical_col)
num_without_nulls.isnull().sum()

Age             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [10]:
data[numerical_col] = num_without_nulls
data.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age               0
VIP             203
RoomService       0
FoodCourt         0
ShoppingMall      0
Spa               0
VRDeck            0
Transported       0
dtype: int64

In [11]:
categorical_col = data.select_dtypes(include=['object','bool']).columns.tolist()
categorical_col

['PassengerId',
 'HomePlanet',
 'CryoSleep',
 'Cabin',
 'Destination',
 'VIP',
 'Transported']

In [12]:
imuter = SimpleImputer(strategy='most_frequent')
cat_without_nulls = pd.DataFrame(imuter.fit_transform(data[categorical_col]),columns=categorical_col)
cat_without_nulls.isnull().sum()

PassengerId    0
HomePlanet     0
CryoSleep      0
Cabin          0
Destination    0
VIP            0
Transported    0
dtype: int64

In [13]:
data[categorical_col] = cat_without_nulls
data.isnull().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
dtype: int64

# *Extracting the group from the PassengerID*

In [14]:
def Group(data):    
    group = []
    for id in data.PassengerId:
        group.append(int(id.split('_')[0]))
    return group

In [15]:
group = Group(data)
group

[1,
 2,
 3,
 3,
 4,
 5,
 6,
 6,
 7,
 8,
 8,
 8,
 9,
 10,
 11,
 12,
 14,
 15,
 16,
 17,
 17,
 20,
 20,
 20,
 20,
 20,
 20,
 22,
 24,
 25,
 26,
 28,
 30,
 31,
 31,
 31,
 34,
 35,
 36,
 38,
 39,
 41,
 43,
 44,
 44,
 44,
 45,
 45,
 50,
 51,
 52,
 53,
 56,
 56,
 56,
 58,
 61,
 62,
 64,
 64,
 66,
 67,
 67,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 76,
 77,
 78,
 81,
 82,
 82,
 82,
 84,
 85,
 86,
 88,
 90,
 91,
 91,
 92,
 92,
 92,
 97,
 98,
 98,
 99,
 99,
 101,
 102,
 103,
 103,
 103,
 105,
 107,
 108,
 108,
 108,
 110,
 110,
 110,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 119,
 119,
 120,
 122,
 123,
 126,
 127,
 128,
 128,
 129,
 133,
 133,
 134,
 136,
 138,
 138,
 139,
 140,
 140,
 141,
 144,
 146,
 147,
 148,
 149,
 151,
 152,
 160,
 163,
 163,
 164,
 164,
 165,
 167,
 167,
 169,
 170,
 171,
 172,
 173,
 174,
 177,
 178,
 179,
 179,
 179,
 181,
 182,
 183,
 186,
 188,
 189,
 190,
 192,
 192,
 192,
 193,
 193,
 193,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 201,
 202,
 202,
 203,
 205,
 2

In [16]:
data.PassengerId = group
data.rename(columns={'PassengerId':'Group'},inplace=True)
data.Group

0          1
1          2
2          3
3          3
4          4
        ... 
8688    9276
8689    9278
8690    9279
8691    9280
8692    9280
Name: Group, Length: 8693, dtype: int64

# *Splitting the Cabines into 3 parts (deck,num,side)*

In [17]:
def Cabines(data):
    deck = []
    num = []
    side = []
    for cabin in data.Cabin:
        deck.append(cabin.split('/')[0])
        num.append(int(cabin.split('/')[1]))
        side.append(cabin.split('/')[-1])
    return deck,num,side

In [18]:
deck,num,side = Cabines(data)
deck,num,side

(['B',
  'F',
  'A',
  'A',
  'F',
  'F',
  'F',
  'G',
  'F',
  'B',
  'B',
  'B',
  'F',
  'G',
  'F',
  'G',
  'F',
  'F',
  'F',
  'G',
  'F',
  'E',
  'E',
  'E',
  'E',
  'E',
  'E',
  'D',
  'C',
  'F',
  'C',
  'F',
  'G',
  'F',
  'F',
  'F',
  'D',
  'D',
  'F',
  'F',
  'G',
  'G',
  'B',
  'G',
  'G',
  'G',
  'F',
  'F',
  'E',
  'E',
  'G',
  'F',
  'A',
  'A',
  'A',
  'G',
  'F',
  'F',
  'F',
  'E',
  'G',
  'G',
  'G',
  'F',
  'E',
  'F',
  'F',
  'F',
  'F',
  'D',
  'C',
  'F',
  'F',
  'C',
  'G',
  'F',
  'F',
  'F',
  'G',
  'C',
  'F',
  'E',
  'G',
  'G',
  'F',
  'G',
  'G',
  'G',
  'A',
  'G',
  'G',
  'F',
  'G',
  'G',
  'F',
  'F',
  'G',
  'G',
  'F',
  'D',
  'G',
  'G',
  'G',
  'G',
  'B',
  'B',
  'B',
  'E',
  'B',
  'F',
  'G',
  'F',
  'D',
  'A',
  'A',
  'F',
  'G',
  'F',
  'F',
  'E',
  'D',
  'E',
  'G',
  'F',
  'D',
  'G',
  'G',
  'G',
  'E',
  'C',
  'G',
  'F',
  'F',
  'F',
  'G',
  'G',
  'F',
  'G',
  'G',
  'F',
  'G',
  'B',
  'B',

In [19]:
cabin_df = pd.DataFrame({'Deck':deck,'Num':num,'Side':side})
cabin_df

Unnamed: 0,Deck,Num,Side
0,B,0,P
1,F,0,S
2,A,0,S
3,A,0,S
4,F,1,S
...,...,...,...
8688,A,98,P
8689,G,1499,S
8690,G,1500,S
8691,E,608,S


In [20]:
data.drop('Cabin',axis=1,inplace=True)
data = pd.concat([data,cabin_df],axis=1)
data.head()

Unnamed: 0,Group,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side
0,1,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,0,P
1,2,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,0,S
2,3,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,0,S
3,3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,0,S
4,4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,1,S


In [21]:
data.shape

(8693, 15)

In [22]:
data.isnull().sum()

Group           0
HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Deck            0
Num             0
Side            0
dtype: int64

# *Defining the final version of numerical,categorical features*

In [23]:
categorical_col.extend(['Deck','Side'])
categorical_col.remove('Cabin')
categorical_col.remove('PassengerId')
numerical_col.extend(['Num','Group'])
print(f'Categorical columns : {categorical_col}, Numerical columns: {numerical_col}')

Categorical columns : ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Transported', 'Deck', 'Side'], Numerical columns: ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num', 'Group']


In [24]:
data.describe()

Unnamed: 0,Group,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,4633.389624,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,603.42678
std,2671.028856,14.339054,659.739364,1594.434978,597.41744,1124.675871,1133.259049,506.36841
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2319.0,20.0,0.0,0.0,0.0,0.0,0.0,173.0
50%,4630.0,27.0,0.0,0.0,0.0,0.0,0.0,448.0
75%,6883.0,37.0,78.0,118.0,45.0,89.0,71.0,983.0
max,9280.0,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,1894.0


# *Standralizing and Encoding teh data*

In [25]:
data[numerical_col] = StandardScaler().fit_transform(data[numerical_col])
data.describe()

Unnamed: 0,Group,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,-2.615595e-17,-1.76144e-16,5.98726e-17,7.356361000000001e-17,7.724179000000001e-17,-5.803351e-17,-6.436816000000001e-17,-1.177018e-16
std,1.000058,1.000058,1.000058,1.000058,1.000058,1.000058,1.000058,1.000058
min,-1.734409,-2.010564,-0.3405899,-0.287314,-0.290817,-0.2766634,-0.2690226,-1.191744
25%,-0.8665285,-0.6156918,-0.3405899,-0.287314,-0.290817,-0.2766634,-0.2690226,-0.8500758
50%,-0.001269106,-0.1274865,-0.3405899,-0.287314,-0.290817,-0.2766634,-0.2690226,-0.3069617
75%,0.8422746,0.5699497,-0.2223546,-0.2133024,-0.2154885,-0.197525,-0.2063679,0.7496421
max,1.739733,3.499182,21.37681,18.41192,39.03403,19.64845,21.02742,2.548831


In [26]:
for col in categorical_col:
    data[col] = LabelEncoder().fit_transform(data[col])

In [27]:
data.dtypes 

Group           float64
HomePlanet        int64
CryoSleep         int64
Destination       int64
Age             float64
VIP               int64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported       int64
Deck              int64
Num             float64
Side              int64
dtype: object

In [28]:
data.select_dtypes(include=['object','bool']).sum()

Series([], dtype: float64)

In [29]:
X_train = data.drop('Transported',axis=1)
y_train = data.Transported

In [30]:
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
test.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,4186.0,4195.0,4171.0,4179.0,4176.0,4197.0
mean,28.658146,219.266269,439.484296,177.295525,303.052443,310.710031
std,14.179072,607.011289,1527.663045,560.821123,1117.186015,1246.994742
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,26.0,0.0,0.0,0.0,0.0,0.0
75%,37.0,53.0,78.0,33.0,50.0,36.0
max,79.0,11567.0,25273.0,8292.0,19844.0,22272.0


In [31]:
print(test.isnull().sum())

PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [32]:
ID  = test.PassengerId

# *Repeat the same steps with the test data*

In [33]:
test.drop(['Name'],axis=1,inplace=True)

numerical_col = test.select_dtypes(exclude=['object','bool']).columns.tolist()

imuter = SimpleImputer(strategy='mean')
num_without_nulls = pd.DataFrame(imuter.fit_transform(test[numerical_col]),columns=numerical_col)
test[numerical_col] = num_without_nulls

categorical_col = test.select_dtypes(include=['object','bool']).columns.tolist()

imuter = SimpleImputer(strategy='most_frequent')
cat_without_nulls = pd.DataFrame(imuter.fit_transform(test[categorical_col]),columns=categorical_col)
test[categorical_col] = cat_without_nulls

group = Group(test)
test.PassengerId = group
test.rename(columns={'PassengerId':'Group'},inplace=True)

deck,num,side = Cabines(test)
cabin_df = pd.DataFrame({'Deck':deck,'Num':num,'Side':side})
test.drop('Cabin',axis=1,inplace=True)
test = pd.concat([test,cabin_df],axis=1)

categorical_col.extend(['Deck','Side'])
categorical_col.remove('Cabin')
categorical_col.remove('PassengerId')
numerical_col.extend(['Num','Group'])

test[numerical_col] = StandardScaler().fit_transform(test[numerical_col])

for col in categorical_col:
    test[col] = LabelEncoder().fit_transform(test[col])

In [34]:
X_test = test

In [35]:
model = GradientBoostingClassifier(loss='exponential')
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [36]:
y_pred.tolist()
y_pred

array([1, 0, 1, ..., 1, 1, 1])

In [37]:
y_predediction = y_pred.astype('bool')

In [38]:
Submission = pd.DataFrame({'PassengerId':ID,'Transported':y_predediction})
Submission.to_csv('Submission.csv',index=False)
Submission

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
