In [1]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras import regularizers

In [2]:
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [3]:
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
train_df.keys()

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

In [4]:
train_df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# **Data Processing**

In [5]:
train_df = train_df.drop(columns=['PassengerId', 'Name'])
test_df = test_df.drop(columns=['PassengerId', 'Name'])

In [6]:
# Cabin Column
train_df[["Cabin1","Cabin2","Cabin3"]]= train_df["Cabin"].str.split("/", expand = True)
train_df["Cabin2"] = train_df["Cabin2"].astype(float)

test_df[["Cabin1","Cabin2","Cabin3"]]= test_df["Cabin"].str.split("/", expand = True)
test_df["Cabin2"] = test_df["Cabin2"].astype(float)

train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])

expenses = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


train_df['Expenses'] = train_df[expenses].sum(axis=1)
test_df['Expenses'] = test_df[expenses].sum(axis=1)
train_df = train_df.drop(columns=expenses)
test_df = test_df.drop(columns=expenses)

In [7]:
train_df.isna().sum()

HomePlanet     201
CryoSleep      217
Destination    182
Age            179
VIP            203
Transported      0
Cabin1         199
Cabin2         199
Cabin3         199
Expenses         0
dtype: int64

In [8]:
object_cols = [col for col in train_df.keys() if train_df[col].dtypes == 'object']
float_cols = [col for col in train_df.keys() if train_df[col].dtypes == 'float64']
train_df.dtypes

HomePlanet      object
CryoSleep       object
Destination     object
Age            float64
VIP             object
Transported       bool
Cabin1          object
Cabin2         float64
Cabin3          object
Expenses       float64
dtype: object

In [9]:
# Replace NA Values in Object Datatype Columns with random values
for col in object_cols:
    unique_vals = train_df[col].unique().tolist()
    unique_vals = [item for item in unique_vals if str(item) != 'nan']
    train_df[col] = train_df[col].fillna(unique_vals[np.random.randint(0,len(unique_vals))])
    
# Replace NA Values in Float64 Datatype Columns with random values
for col in float_cols:
    avg_col = train_df[col].mean()
    train_df[col] = train_df[col].fillna(avg_col)   

In [10]:
# Convert Object dtype columns
for col in object_cols:
    test_df[col].replace(train_df[col].unique(), range(len(train_df[col].unique())), inplace=True)
    train_df[col].replace(train_df[col].unique(), range(len(train_df[col].unique())), inplace=True)
    
train_df['Transported'].replace(train_df['Transported'].unique(), range(len(train_df['Transported'].unique())), inplace=True)

In [11]:
# Switch column order to put'Transported' to the end
train_df = train_df[list(test_df.columns.values) + ['Transported']]

In [12]:
train_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Cabin1,Cabin2,Cabin3,Expenses,Transported
0,0,0,0,39.0,0,0,0.0,0,0.0,0
1,1,0,0,24.0,0,1,0.0,1,736.0,1
2,0,0,0,58.0,1,2,0.0,1,10383.0,0
3,0,0,0,33.0,0,2,0.0,1,5176.0,0
4,1,0,0,16.0,0,1,1.0,1,1091.0,1


In [13]:
test_df.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Cabin1,Cabin2,Cabin3,Expenses
0,1.0,1.0,0.0,27.0,0.0,3.0,3.0,1.0,0.0
1,1.0,0.0,0.0,19.0,0.0,1.0,4.0,1.0,2832.0
2,0.0,1.0,2.0,31.0,0.0,6.0,0.0,1.0,0.0
3,0.0,0.0,0.0,38.0,0.0,6.0,1.0,1.0,7418.0
4,1.0,0.0,0.0,20.0,0.0,1.0,5.0,1.0,645.0


In [14]:
train_df = train_df.sample(frac = 1)
SPLIT_SIZE = 0.7

data = train_df.iloc[:,:-1].to_numpy()
labels = train_df.pop('Transported').to_numpy()

train_size = int(data.shape[0]*SPLIT_SIZE)
train_data = data[:train_size]
train_labels = labels[:train_size]

val_data = data[train_size:]
val_labels = labels[train_size:]

# **Model (Fully Connected)**

In [15]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu',kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(128, activation='relu',kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4)),
    tf.keras.layers.Dropout(0.2),
    #tf.keras.layers.Dense(64, activation='relu',kernel_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4)),
    #tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

2022-11-26 12:30:26.034091: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [16]:
model.fit(train_data, train_labels, epochs=40, batch_size=64, validation_data=(val_data, val_labels))

2022-11-26 12:30:26.345276: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7fab586f0990>

# **Predictions**

In [17]:
x = test_df.to_numpy()
#x = np.asarray(x).astype('float64')
predictions = model.predict(x)

In [18]:
predictions = [1 if pred >= 0.5 else 0 for pred in predictions]

In [19]:
id_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

result = pd.DataFrame(columns=['PassengerId', 'Transported'])
result['PassengerId'] = [id for id in id_df['PassengerId']]
result['Transported'] = predictions
result['Transported'].replace([0, 1], ['False', 'True'], inplace=True)

In [20]:
result.to_csv('submission1.csv', index=False)

# **CATBOOST**

In [21]:
from catboost import CatBoostRegressor, CatBoostClassifier

In [22]:
model = CatBoostRegressor()
model.fit(data, labels)

Learning rate set to 0.057619
0:	learn: 0.4916354	total: 60.8ms	remaining: 1m
1:	learn: 0.4841460	total: 65ms	remaining: 32.4s
2:	learn: 0.4775737	total: 67.4ms	remaining: 22.4s
3:	learn: 0.4712108	total: 69.6ms	remaining: 17.3s
4:	learn: 0.4653542	total: 71.8ms	remaining: 14.3s
5:	learn: 0.4600084	total: 73.9ms	remaining: 12.2s
6:	learn: 0.4552037	total: 75.9ms	remaining: 10.8s
7:	learn: 0.4510657	total: 78ms	remaining: 9.68s
8:	learn: 0.4473258	total: 80.2ms	remaining: 8.83s
9:	learn: 0.4439673	total: 82.3ms	remaining: 8.14s
10:	learn: 0.4407493	total: 84.3ms	remaining: 7.58s
11:	learn: 0.4382969	total: 86.4ms	remaining: 7.11s
12:	learn: 0.4354994	total: 88.5ms	remaining: 6.72s
13:	learn: 0.4332983	total: 90.7ms	remaining: 6.39s
14:	learn: 0.4311103	total: 92.8ms	remaining: 6.09s
15:	learn: 0.4285901	total: 94.9ms	remaining: 5.84s
16:	learn: 0.4266362	total: 97ms	remaining: 5.61s
17:	learn: 0.4247085	total: 99.2ms	remaining: 5.41s
18:	learn: 0.4231473	total: 101ms	remaining: 5.23s
19

<catboost.core.CatBoostRegressor at 0x7fab2c3f9bd0>

In [23]:
preds = model.predict(test_df)
predictions = [1 if pred >= 0.5 else 0 for pred in predictions]

In [24]:
id_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

result = pd.DataFrame(columns=['PassengerId', 'Transported'])
result['PassengerId'] = [id for id in id_df['PassengerId']]
result['Transported'] = predictions
result['Transported'].replace([0, 1], ['False', 'True'], inplace=True)

In [25]:
result.to_csv('submission1.csv', index=False)