 **Spaceship_Titanic Dataset**


 In this competition your task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To help you make these predictions, you're given a set of personal records recovered from the ship's damaged computer system.

### **Libraries**

In [2]:
import pandas as pd
import re
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer , make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.metrics import classification_report
import warnings
warnings. filterwarnings('ignore')

# **Data**

In [7]:
df_original = pd.read_csv('train.csv')
df_original.sample(1)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
4802,5127_02,Earth,False,F/983/S,TRAPPIST-1e,16.0,False,43.0,0.0,0.0,1893.0,119.0,Faithy Yanton,False


In [None]:
df_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [None]:
df_original.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
CryoSleep,217
ShoppingMall,208
VIP,203
HomePlanet,201
Name,200
Cabin,199
VRDeck,188
FoodCourt,183
Spa,183
Destination,182


In [None]:
for col in df_original:
    print(f'{col}: {df_original[col].unique()}')

PassengerId: ['0001_01' '0002_01' '0003_01' ... '9279_01' '9280_01' '9280_02']
HomePlanet: ['Europa' 'Earth' 'Mars' nan]
CryoSleep: [False True nan]
Cabin: ['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
Destination: ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
Age: [39. 24. 58. 33. 16. 44. 26. 28. 35. 14. 34. 45. 32. 48. 31. 27.  0.  1.
 49. 29. 10.  7. 21. 62. 15. 43. 47.  2. 20. 23. 30. 17. 55.  4. 19. 56.
 nan 25. 38. 36. 22. 18. 42. 37. 13.  8. 40.  3. 54.  9.  6. 64. 67. 61.
 50. 41. 57. 11. 52. 51. 46. 60. 63. 59.  5. 79. 68. 74. 12. 53. 65. 71.
 75. 70. 76. 78. 73. 66. 69. 72. 77.]
VIP: [False True nan]
RoomService: [   0.  109.   43. ... 1569. 8586.  745.]
FoodCourt: [   0.    9. 3576. ... 3208. 6819. 4688.]
ShoppingMall: [   0.   25.  371. ... 1085.  510. 1872.]
Spa: [   0.  549. 6715. ... 2868. 1107. 1643.]
VRDeck: [   0.   44.   49. ... 1164.  971. 3235.]
Name: ['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' ... 'Fayey Connon'
 'Celeon Hontichre' 'Props

In [8]:
df=df_original.copy()

## **preprocessing**

In [9]:

  df[["Deck", "Cabin_num", "Side"]] =df["Cabin"].str.split("/", expand=True)
  X = df.drop('Transported', axis=1)
  y = df.Transported
  X=X.drop(columns=['PassengerId','Name','Cabin'],axis=1)
  name_columns = ['HomePlanet','Deck','Destination','Side']
  boolean_columns= ['CryoSleep','VIP']
  numeric_columns =['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Cabin_num']

  for col in X [numeric_columns]:
     num_Imputer = SimpleImputer()
     X[col] =num_Imputer.fit_transform(X[[col]])

  X[boolean_columns] = SimpleImputer(strategy='most_frequent').fit_transform(X[boolean_columns])
  X[name_columns] = SimpleImputer(strategy='most_frequent').fit_transform(X[name_columns])



In [5]:
X.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
HomePlanet,0
CryoSleep,0
Destination,0
Age,0
VIP,0
RoomService,0
FoodCourt,0
ShoppingMall,0
Spa,0
VRDeck,0


In [None]:
X.head(1)

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Cabin_num,Side
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,B,0.0,P


# **Train the model**

In [10]:
y=y.astype(int)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20, random_state=1223)

In [12]:
numeric_pipeline = Pipeline([('Scaler', MinMaxScaler())])
categorical_pipeline = Pipeline([('OneHot', OneHotEncoder(handle_unknown='ignore'))])
transformer = ColumnTransformer([('num', numeric_pipeline, ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Cabin_num']), ('cat', categorical_pipeline, ['HomePlanet','Deck','Destination','Side'])])

In [19]:
mlpipe = Pipeline([('Transformer',transformer), ('xgb',XGBClassifier())])

In [25]:
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.3.1

Found existing installation: scikit-learn 1.6.0
Uninstalling scikit-learn-1.6.0:
  Successfully uninstalled scikit-learn-1.6.0
Collecting scikit-learn==1.3.1
  Downloading scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m28.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
imbalanced-learn 0.13.0 requires scikit-learn<2,>=1.3.2, but you have scikit-learn 1.3.1 which is incompatible.[0m[31m
[0mSuccessfully installed scikit-learn-1.3.1


In [20]:
mlpipe.fit(X_train,y_train)

In [21]:
y_test.astype(int)

Unnamed: 0,Transported
1211,0
6461,1
1610,0
7786,1
5721,0
...,...
7766,0
7968,1
1690,0
8068,0


# **prediction**

In [22]:
y_hat = mlpipe.predict(X_test)

In [23]:
precision_score(y_test, y_hat)

0.828395061728395

In [24]:
 print("\nClassification Report:")
print(classification_report(y_test, y_hat))


Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.84      0.80       849
           1       0.83      0.75      0.79       890

    accuracy                           0.79      1739
   macro avg       0.80      0.80      0.79      1739
weighted avg       0.80      0.79      0.79      1739



In [None]:
import joblib
joblib.dump(mlpipe, 'xgbpipe.joblib')

['xgbpipe.joblib']

In [None]:
model = joblib.load('xgbpipe.joblib')

In [None]:
test = pd.read_csv('test.csv')
df_test = test.copy()

# **Test_data preprocessing**

In [None]:

  df_test[["Deck", "Cabin_num", "Side"]] =df["Cabin"].str.split("/", expand=True)
  X = df_test
  X=X.drop(columns=['PassengerId','Name','Cabin'],axis=1)
  name_columns = ['HomePlanet','Deck','Destination','Side']
  boolean_columns= ['CryoSleep','VIP']
  numeric_columns =['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Cabin_num']

  for col in X [numeric_columns]:
     num_Imputer = SimpleImputer()
     X[col] =num_Imputer.fit_transform(X[[col]])

  X[boolean_columns] = SimpleImputer(strategy='most_frequent').fit_transform(X[boolean_columns])
  X[name_columns] = SimpleImputer(strategy='most_frequent').fit_transform(X[name_columns])



In [None]:
yhat_new = model.predict(X)
yhat_new

array([1, 0, 1, ..., 1, 1, 1])

In [None]:
 submission = pd.DataFrame({'PassengerId': df_test['PassengerId'],'Transported':yhat_new.astype(bool)})
 submission.to_csv('submission.csv', index=False)

In [None]:
submission.head(5)

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
