# Import Libraries and Dataset

In [None]:
# Python basic library

import numpy as np
import pandas as pd

# Visualising
import matplotlib.pyplot as plt
import seaborn as sns

#plt.figure(figsize=(12,18))

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Connect to Google drive
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# SK-Learn
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
# Set Google drive path
#gdrive_path = '/content/drive/MyDrive/Kaggle/spaceship_titanic/'
kdrive_path = '/kaggle/input/spaceship-titanic/'

In [None]:
# Read the training and test CSV files from the Kaggle input folder.
train_set, test_set = (
    pd.read_csv(f'{kdrive_path}{csv_name}') for csv_name in ('train.csv', 'test.csv')
)

# Data Visualization

In [None]:
train_set.head()

In [None]:
test_set.head()

In [None]:
# analysing the categorical features
train_set.describe(include='object')

In [None]:
# analysing the numeric features
train_set.describe(include=np.number)

In [None]:
train_set.dtypes

In [None]:
train_set.count()

In [None]:
# Checking null values
train_set.isna().sum()

# Feature Engineering

In [None]:
train_set.head()

## Checking - RoomService, FoodCourt, ShoppingMall, Spa and VRDeck

In [None]:
train_set[train_set['CryoSleep']==True].isna().sum()

In [None]:
train_set[train_set['CryoSleep']==True].count()

In [None]:
train_set.loc[0,['RoomService','ShoppingMall','FoodCourt','Spa','VRDeck']]

In [None]:
# For passengers flagged as being in CryoSleep, treat missing amenity bills as 0.
# Vectorised replacement for the former row-by-row loop: that loop paired
# positional counters from enumerate() with label-based .loc (only correct on a
# default RangeIndex) and ran one Python iteration per passenger.
spend_cols = ['RoomService','ShoppingMall','FoodCourt','Spa','VRDeck']
cryo_with_gaps = (train_set['CryoSleep'] == True) & train_set[spend_cols].isna().any(axis=1)
# As before, all five amenity columns of a matching row are set to 0.
train_set.loc[cryo_with_gaps, spend_cols] = 0

In [None]:
train_set[train_set['CryoSleep']==True].isna().sum()

In [None]:
# Same for the test set: CryoSleep passengers with any missing amenity bill get
# all five amenity columns set to 0.
# Done with a boolean mask instead of a per-row loop that mixed enumerate()
# positions with label-based .loc lookups.
spend_cols = ['RoomService','ShoppingMall','FoodCourt','Spa','VRDeck']
cryo_with_gaps = (test_set['CryoSleep'] == True) & test_set[spend_cols].isna().any(axis=1)
test_set.loc[cryo_with_gaps, spend_cols] = 0

In [None]:
test_set[test_set['CryoSleep']==True].isna().sum()

## Checking CryoSleep

In [None]:
sns.countplot(x='CryoSleep', data=train_set.fillna('Missing'))

In [None]:
train_set.CryoSleep.isnull().sum()

In [None]:
# Convert the CryoSleep from Boolean to Float
train_set['CryoSleep'] = train_set['CryoSleep'].astype(float)

In [None]:
train_set['CryoSleep'].fillna(2).value_counts()

In [None]:
# Mark missing CryoSleep values with the placeholder 2 (resolved in the next cell).
# Assign the result back: `df[col].fillna(..., inplace=True)` is chained
# assignment, which warns on recent pandas and no longer updates the frame once
# Copy-on-Write is active.
train_set['CryoSleep'] = train_set['CryoSleep'].fillna(2)

In [None]:
# Resolve the placeholder 2 (originally missing CryoSleep) from spending:
# a passenger whose amenity bills sum to 0 is marked asleep (1), otherwise awake (0).
# Vectorised so it no longer depends on enumerate() positions matching .loc labels.
amenity_cols = ['RoomService','ShoppingMall','FoodCourt','Spa','VRDeck']
cryo_unknown = train_set['CryoSleep'] == 2
# The row-wise sum skips NaN, so missing bills count as 0 here (same as the
# per-row Series.sum() used previously).
no_spend = train_set[amenity_cols].sum(axis=1) == 0
train_set.loc[cryo_unknown, 'CryoSleep'] = no_spend[cryo_unknown].astype(float)

In [None]:
sns.countplot(x='CryoSleep', data=train_set.fillna(2))

In [None]:
# Convert the CryoSleep from Boolean to Float
test_set['CryoSleep'] = test_set['CryoSleep'].astype(float)

In [None]:
# Test Set: mark missing CryoSleep values with the placeholder 2.
# Assigned back explicitly because chained `fillna(..., inplace=True)` on a
# selected column does not update the frame under pandas Copy-on-Write.
test_set['CryoSleep'] = test_set['CryoSleep'].fillna(2)

In [None]:
# Resolve the placeholder 2 in the test set: zero total amenity spend -> asleep (1),
# any spend -> awake (0). Mask-based instead of a per-row loop that relied on
# enumerate() positions lining up with .loc labels.
amenity_cols = ['RoomService','ShoppingMall','FoodCourt','Spa','VRDeck']
cryo_unknown = test_set['CryoSleep'] == 2
# NaN bills are skipped by the row-wise sum, i.e. they count as 0.
no_spend = test_set[amenity_cols].sum(axis=1) == 0
test_set.loc[cryo_unknown, 'CryoSleep'] = no_spend[cryo_unknown].astype(float)

In [None]:
sns.countplot(x='CryoSleep', data=test_set.fillna(2))

## Checking Transported

In [None]:
train_set.head()

In [None]:
# Converting Boolean to float
train_set['Transported'] = train_set['Transported'].astype(float)

In [None]:
sns.countplot(x='Transported',data=train_set.fillna(2))

In [None]:
# Test Set
test_set.head()

## Checking VIP

In [None]:
# Converting boolean to float
train_set['VIP'] = train_set['VIP'].astype(float)

In [None]:
# Check the distribution of VIP
sns.countplot(x='VIP', data=train_set.fillna(3))

In [None]:
# Determine how many VIPs are in CryoSleep
train_set.VIP[train_set['CryoSleep'] == 1].value_counts()

In [None]:
# Determine how many VIPs are not in CryoSleep
train_set.VIP[train_set['CryoSleep'] == 0].value_counts()

In [None]:
train_set.dtypes

In [None]:
# Total amenity spend per passenger, accumulated column by column.
# Because the columns are combined with `+`, a missing value in any of the five
# bills makes the total NaN as well.
spend_total = train_set['RoomService']
for amenity in ['ShoppingMall', 'FoodCourt', 'Spa', 'VRDeck']:
  spend_total = spend_total + train_set[amenity]
train_set['Total_Spending'] = spend_total

In [None]:
# Determine the average Spending of VIP
train_set['Total_Spending'].mean()

In [None]:
train_set.head()

In [None]:
# Finding the mean spending according to VIP and NON VIP
grp_df = train_set[['Total_Spending','VIP']].groupby('VIP').mean()

In [None]:
grp_df

In [None]:
# The value 2 represents a null value (resolved in the next cell).
# Assigned back explicitly: chained `fillna(..., inplace=True)` on a selected
# column warns on recent pandas and is a no-op under Copy-on-Write.
train_set['VIP'] = train_set['VIP'].fillna(2)

In [None]:
# Resolve the placeholder 2 (unknown VIP status): passengers whose total spend
# exceeds the threshold are marked VIP (1), everyone else non-VIP (0).
# A NaN total compares False, so those rows become 0 — same as before.
# Vectorised to avoid pairing enumerate() positions with .loc labels.
vip_spend_threshold = 4000
vip_unknown = train_set['VIP'] == 2
train_set.loc[vip_unknown, 'VIP'] = (
    train_set.loc[vip_unknown, 'Total_Spending'] > vip_spend_threshold
).astype(float)

In [None]:
# Check the distribution of VIP
sns.countplot(x='VIP', data=train_set.fillna(2))

In [None]:
# Test Dataset: same VIP clean-up as applied to the training set.

# Converting boolean to float (missing values stay NaN)
test_set['VIP'] = test_set['VIP'].astype(float)

# Calculate total spendings; NaN whenever any of the five bills is missing
test_set['Total_Spending'] = test_set['RoomService'] + test_set['ShoppingMall'] + test_set['FoodCourt'] + test_set['Spa'] + test_set['VRDeck']

# Fill unknown VIP status from spending: total above the threshold -> VIP (1),
# otherwise (including a NaN total) -> non-VIP (0).
# Uses a missing-value mask directly instead of the former `fillna(2, inplace=True)`
# sentinel, which is chained assignment and does not update the frame under
# pandas Copy-on-Write, and instead of a per-row enumerate()/.loc loop.
vip_spend_threshold = 4000
vip_unknown = test_set['VIP'].isna()
test_set.loc[vip_unknown, 'VIP'] = (
    test_set.loc[vip_unknown, 'Total_Spending'] > vip_spend_threshold
).astype(float)

In [None]:
# Check the distribution of VIP
sns.countplot(x='VIP', data=test_set.fillna(2))

## Checking Spendings

In [None]:
train_set.isna().sum()

In [None]:
# Recompute the test-set total spend as the plain sum of the five amenity columns
# (a missing value in any of them makes the total NaN).
# NOTE(review): this repeats the identical assignment made in the test-set VIP cell
# earlier in the notebook — presumably redundant; confirm before removing.
test_set['Total_Spending'] = test_set['RoomService'] + test_set['ShoppingMall'] + test_set['FoodCourt'] + test_set['Spa'] + test_set['VRDeck']


In [None]:
# Finding Average spending on RoomService
grp_df = train_set[['VIP','RoomService']].groupby('VIP').mean()
grp_df

In [None]:
# Impute missing RoomService bills by VIP group: 215 for non-VIP (VIP == 0),
# 523 for everyone else (hard-coded values, presumably the group means shown
# in the previous cell — confirm if the data changes).
# Uses an index-aligned fill instead of the former -1 sentinel written through
# chained `fillna(..., inplace=True)` (a no-op under pandas Copy-on-Write) and
# a per-row loop that paired enumerate() positions with .loc labels.
room_fill = pd.Series(np.where(train_set['VIP'] == 0, 215, 523), index=train_set.index)
train_set['RoomService'] = train_set['RoomService'].fillna(room_fill)

In [None]:
# Impute missing RoomService bills in the test set with the same VIP-group
# constants used for training: 215 for non-VIP (VIP == 0), 523 otherwise.
# Index-aligned fill replaces the -1 sentinel + chained inplace fillna
# (ineffective under pandas Copy-on-Write) and the per-row loop.
room_fill = pd.Series(np.where(test_set['VIP'] == 0, 215, 523), index=test_set.index)
test_set['RoomService'] = test_set['RoomService'].fillna(room_fill)

In [None]:
train_set.isna().sum()

In [None]:
test_set.isna().sum()

In [None]:
# Finding Average spending on ShoppingMall
grp_df = train_set[['VIP','ShoppingMall']].groupby('VIP').mean()
grp_df

In [None]:
# Impute missing ShoppingMall bills by VIP group: 170 for non-VIP (VIP == 0),
# 240 for everyone else (hard-coded, presumably the group means from the
# previous cell — confirm if the data changes).
# Index-aligned fill replaces the -1 sentinel written via chained
# `fillna(..., inplace=True)` (a no-op under pandas Copy-on-Write) and the
# per-row enumerate()/.loc loop.
mall_fill = pd.Series(np.where(train_set['VIP'] == 0, 170, 240), index=train_set.index)
train_set['ShoppingMall'] = train_set['ShoppingMall'].fillna(mall_fill)

In [None]:
# Impute missing ShoppingMall bills in the test set with the training constants:
# 170 for non-VIP (VIP == 0), 240 otherwise.
# Index-aligned fill replaces the -1 sentinel + chained inplace fillna
# (ineffective under pandas Copy-on-Write) and the per-row loop.
mall_fill = pd.Series(np.where(test_set['VIP'] == 0, 170, 240), index=test_set.index)
test_set['ShoppingMall'] = test_set['ShoppingMall'].fillna(mall_fill)

In [None]:
# Finding Average spending on FoodCourt
grp_df = train_set[['VIP','FoodCourt']].groupby('VIP').mean()
grp_df

In [None]:
# Impute missing FoodCourt bills by VIP group: 416 for non-VIP (VIP == 0),
# 1959 for everyone else (hard-coded, presumably the group means from the
# previous cell — confirm if the data changes).
# Index-aligned fill replaces the -1 sentinel written via chained
# `fillna(..., inplace=True)` (a no-op under pandas Copy-on-Write) and the
# per-row enumerate()/.loc loop.
food_fill = pd.Series(np.where(train_set['VIP'] == 0, 416, 1959), index=train_set.index)
train_set['FoodCourt'] = train_set['FoodCourt'].fillna(food_fill)

In [None]:
# Impute missing FoodCourt bills in the test set with the training constants:
# 416 for non-VIP (VIP == 0), 1959 otherwise.
# Index-aligned fill replaces the -1 sentinel + chained inplace fillna
# (ineffective under pandas Copy-on-Write) and the per-row loop.
food_fill = pd.Series(np.where(test_set['VIP'] == 0, 416, 1959), index=test_set.index)
test_set['FoodCourt'] = test_set['FoodCourt'].fillna(food_fill)

In [None]:
# Finding average spending on Spa, grouped by VIP status
grp_df = train_set[['VIP','Spa']].groupby('VIP').mean()
grp_df

In [None]:
# Impute missing Spa bills by VIP group: 295 for non-VIP (VIP == 0),
# 832 for everyone else (hard-coded, presumably the group means from the
# previous cell — confirm if the data changes).
# Index-aligned fill replaces the -1 sentinel written via chained
# `fillna(..., inplace=True)` (a no-op under pandas Copy-on-Write) and the
# per-row enumerate()/.loc loop.
spa_fill = pd.Series(np.where(train_set['VIP'] == 0, 295, 832), index=train_set.index)
train_set['Spa'] = train_set['Spa'].fillna(spa_fill)

In [None]:
# Impute missing Spa bills in the test set with the training constants:
# 295 for non-VIP (VIP == 0), 832 otherwise.
# Index-aligned fill replaces the -1 sentinel + chained inplace fillna
# (ineffective under pandas Copy-on-Write) and the per-row loop.
spa_fill = pd.Series(np.where(test_set['VIP'] == 0, 295, 832), index=test_set.index)
test_set['Spa'] = test_set['Spa'].fillna(spa_fill)

In [None]:
# Finding average spending on VRDeck, grouped by VIP status
grp_df = train_set[['VIP','VRDeck']].groupby('VIP').mean()
grp_df

In [None]:
# Impute missing VRDeck bills by VIP group: 276 for non-VIP (VIP == 0),
# 1323 for everyone else (hard-coded, presumably the group means from the
# previous cell — confirm if the data changes).
# Index-aligned fill replaces the -1 sentinel written via chained
# `fillna(..., inplace=True)` (a no-op under pandas Copy-on-Write) and the
# per-row enumerate()/.loc loop.
vr_fill = pd.Series(np.where(train_set['VIP'] == 0, 276, 1323), index=train_set.index)
train_set['VRDeck'] = train_set['VRDeck'].fillna(vr_fill)

In [None]:
# Impute missing VRDeck bills in the test set with the training constants:
# 276 for non-VIP (VIP == 0), 1323 otherwise.
# Index-aligned fill replaces the -1 sentinel + chained inplace fillna
# (ineffective under pandas Copy-on-Write) and the per-row loop.
vr_fill = pd.Series(np.where(test_set['VIP'] == 0, 276, 1323), index=test_set.index)
test_set['VRDeck'] = test_set['VRDeck'].fillna(vr_fill)

In [None]:
train_set['Total_Spending'] = train_set.RoomService + train_set.FoodCourt + train_set.ShoppingMall + train_set.Spa + train_set.VRDeck

In [None]:
train_set.isna().sum()

In [None]:
test_set['Total_Spending'] = test_set.RoomService + test_set.FoodCourt + test_set.ShoppingMall + test_set.Spa + test_set.VRDeck

In [None]:
test_set.isna().sum()

In [None]:
train_set['RoomService'].values

In [None]:
# Standardise each amenity-spend column: one scaler per column is fitted on the
# training values only, and that same fit is applied to the matching test column.
for spend_col in ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']:
  # StandardScaler expects 2-D input, hence the single-column reshape
  train_column = train_set[spend_col].values.reshape(-1, 1)
  test_column = test_set[spend_col].values.reshape(-1, 1)

  scaling_SS = StandardScaler().fit(train_column)

  # Store the scaled values next to the originals as '<column>_norm'
  train_set[spend_col + '_norm'] = scaling_SS.transform(train_column)
  test_set[spend_col + '_norm'] = scaling_SS.transform(test_column)


In [None]:
train_set.head()

In [None]:
test_set.head()

## Checking Age

In [None]:
train_set.head()

In [None]:
sns.histplot(data=train_set.Age)

In [None]:
train_set.Age.mean()

In [None]:
# Replace missing ages with the mean age of the training set.
# Assigned back explicitly: `train_set.Age.fillna(..., inplace=True)` operates on
# a selected column (chained assignment) and does not update the frame under
# pandas Copy-on-Write.
train_set['Age'] = train_set['Age'].fillna(train_set['Age'].mean())

In [None]:
train_set.Age.isna().sum()

In [None]:
train_set['Age'] = train_set.Age.round()

In [None]:
sns.histplot(train_set.Age)

In [None]:
# Standardise Age: fit the scaler on the training ages and transform them in one
# step. `scalar` stays fitted so the test-set cell can reuse it.
scalar = StandardScaler()

# StandardScaler expects a 2-D array, so Age becomes a single-column matrix
age_val = train_set['Age'].to_numpy().reshape(-1, 1)

norm_age = scalar.fit_transform(age_val)

In [None]:
norm_age[:5]

In [None]:
sns.histplot(norm_age)

In [None]:
train_set.isna().sum()

In [None]:
# Inserting the normalised value to Train_set
train_set['Age_norm'] = norm_age

In [None]:
# Test Set
# Fill missing ages with the rounded mean age of the test set.
# Assigned back explicitly because chained `fillna(..., inplace=True)` on a
# selected column does not update the frame under pandas Copy-on-Write.
# NOTE(review): the fill value comes from the test set itself, whereas the
# scaler applied afterwards is fitted on training data — consider using the
# training mean here for consistency.
test_set['Age'] = test_set['Age'].fillna(round(test_set['Age'].mean()))

In [None]:
# Standard Scaling of the test ages with the scaler already fitted on the
# training ages (no re-fitting on test data).
test_age = test_set['Age'].to_numpy().reshape(-1, 1)
test_norm = scalar.transform(test_age)
test_set['Age_norm'] = test_norm

In [None]:
test_set.isna().sum()

## Checking Cabin

In [None]:
grouped = train_set.groupby('Cabin').agg({'Cabin':'count'})

In [None]:
grouped.rename(columns={'Cabin':'Cabin_count'}, inplace=True)

In [None]:
grouped.sort_values(by='Cabin_count',ascending=False)

In [None]:
train_set.Cabin.str.split('/',expand=True)

In [None]:
train_set[['Cabin_Code_0','Cabin_Code_1','Cabin_Code_2']] = train_set.Cabin.str.split('/',expand=True)

In [None]:
train_set.isnull().sum()

In [None]:
# check the composition of cabin codes
group_cabin0 = train_set.groupby(by='Cabin_Code_0').agg({'Cabin_Code_0':'count'})
group_cabin0

In [None]:
group_cabin1 = train_set.groupby(by='Cabin_Code_1').agg({'Cabin_Code_1':'count'})
group_cabin1

In [None]:
group_cabin2 = train_set.groupby(by='Cabin_Code_2').agg({'Cabin_Code_2':'count'})
group_cabin2

In [None]:
# Since there is a shortage on the port side, adding all the null to the Port Side.
# Assigned back explicitly: `train_set.Cabin_Code_2.fillna(..., inplace=True)` is
# chained assignment and does not update the frame under pandas Copy-on-Write.
train_set['Cabin_Code_2'] = train_set['Cabin_Code_2'].fillna('P')

# Since most passengers are on deck G, assigning all the nulls to deck G
train_set['Cabin_Code_0'] = train_set['Cabin_Code_0'].fillna('G')

# Removing the Cabin Number and the original Cabin column
train_set = train_set.drop(columns=['Cabin_Code_1','Cabin'])

In [None]:
train_set.isna().sum()

In [None]:
# Test Set
# Split the Cabin column into its three '/'-separated parts
test_set[['Cabin_Code_0','Cabin_Code_1','Cabin_Code_2']] = test_set.Cabin.str.split('/', expand=True)

# Since there is a shortage on the port side, adding all the null to the Port Side.
# Assigned back explicitly: chained `fillna(..., inplace=True)` on a selected
# column does not update the frame under pandas Copy-on-Write.
test_set['Cabin_Code_2'] = test_set['Cabin_Code_2'].fillna('P')

# Since most passengers are on deck G, assigning all the nulls to deck G
test_set['Cabin_Code_0'] = test_set['Cabin_Code_0'].fillna('G')

# Removing the Cabin Number and the original Cabin column
test_set = test_set.drop(columns=['Cabin_Code_1','Cabin'])

In [None]:
test_set.isnull().sum()

In [None]:
# Vectorising the Cabin_Code_0 (deck letter) into one indicator column per deck.
# Both frames are encoded against the same fixed list of deck levels, so the
# eight indicator columns always exist and mean the same thing in train and
# test. Encoding each frame independently breaks (length mismatch) or silently
# shifts columns whenever a deck is absent from one of the frames.
deck_levels = ['A','B','C','D','E','F','G','T']
deck_columns = ['Cabin_0_' + deck for deck in deck_levels]
deck_dtype = pd.CategoricalDtype(categories=deck_levels)
for frame in (train_set, test_set):
  # A deck outside `deck_levels` becomes NaN and therefore an all-zero row
  frame[deck_columns] = pd.get_dummies(frame['Cabin_Code_0'].astype(deck_dtype))

In [None]:
train_set.head()

In [None]:
# Vectorising the Cabin_Code_2 (ship side) into a single indicator for 'S';
# 'P' is the dropped reference level.
# The two levels are fixed up front so the encoding yields exactly one column
# for both frames even if one side is absent from one of them.
side_dtype = pd.CategoricalDtype(categories=['P','S'])
for frame in (train_set, test_set):
  frame[['Cabin_2_S']] = pd.get_dummies(frame['Cabin_Code_2'].astype(side_dtype), drop_first=True)

In [None]:
train_set.head()

## Checking Home Planet

In [None]:
home_planet_agg = train_set[train_set.CryoSleep==0].groupby('HomePlanet').agg({'HomePlanet':'count',
                                                                               'Total_Spending':'max'})
home_planet_agg

In [None]:
home_planet_agg2 = train_set.groupby(['HomePlanet','CryoSleep','Destination',]).agg({'Destination':'count',
                                                                                    'Total_Spending':'mean'})
home_planet_agg2

In [None]:
# Based on the above table, fill the missing HomePlanet values from total spend:
#   0 <= spend < 1000     -> Earth
#   1000 <= spend < 2000  -> Mars
#   anything else         -> Europa
# Index-aligned fill replaces the 'Unknown' placeholder written via chained
# `fillna(..., inplace=True)` (a no-op under pandas Copy-on-Write) and the
# per-row loop that paired enumerate() positions with .loc labels.
planet_spend = train_set['Total_Spending']
planet_guess = pd.Series(
    np.select(
        [(planet_spend >= 0) & (planet_spend < 1000),
         (planet_spend >= 1000) & (planet_spend < 2000)],
        ['Earth', 'Mars'],
        default='Europa',
    ),
    index=train_set.index,
)
train_set['HomePlanet'] = train_set['HomePlanet'].fillna(planet_guess)

In [None]:
train_set.groupby('HomePlanet').agg({'HomePlanet':'count'})

In [None]:
# Fill the missing HomePlanet values in the test set with the same spend rule
# used for training:
#   0 <= spend < 1000     -> Earth
#   1000 <= spend < 2000  -> Mars
#   anything else         -> Europa
# Index-aligned fill replaces the 'Unknown' placeholder + chained inplace
# fillna (ineffective under pandas Copy-on-Write) and the per-row loop.
planet_spend = test_set['Total_Spending']
planet_guess = pd.Series(
    np.select(
        [(planet_spend >= 0) & (planet_spend < 1000),
         (planet_spend >= 1000) & (planet_spend < 2000)],
        ['Earth', 'Mars'],
        default='Europa',
    ),
    index=test_set.index,
)
test_set['HomePlanet'] = test_set['HomePlanet'].fillna(planet_guess)

In [None]:
test_set.groupby('HomePlanet').agg({'HomePlanet':'count'})

In [None]:
# One Hot Encoding of HomePlanet.
# get_dummies orders the levels and drop_first removes the first one, so
# HomePlanet_0 / HomePlanet_1 are the indicators for the two remaining levels
# (assigned positionally). This requires exactly three distinct planets.
train_set[['HomePlanet_0','HomePlanet_1']] = pd.get_dummies(train_set.HomePlanet,drop_first=True)

In [None]:
train_set.head()

In [None]:
# Test Set: one-hot encode HomePlanet against the levels seen in the training
# set, so HomePlanet_0 / HomePlanet_1 refer to the same planets in both frames
# even if a planet is absent from the test data. Encoding the test set on its
# own derives the levels independently and can misalign or fail.
planet_dtype = pd.CategoricalDtype(sorted(train_set['HomePlanet'].dropna().unique()))
test_set[['HomePlanet_0','HomePlanet_1']] = pd.get_dummies(test_set['HomePlanet'].astype(planet_dtype), drop_first=True)

In [None]:
test_set.head()

## Checking Destination

In [None]:
destination_agg = train_set.groupby(['Destination','CryoSleep']).agg({'Destination':'count',
                                                                      'Total_Spending':'mean',
                                                                      'Age':'min'})
destination_agg

In [None]:
# Since most people are travelling to TRAPPIST-1e, use it as the destination
# for unknown values.
# Assigned back explicitly: `frame.Destination.fillna(..., inplace=True)` is
# chained assignment and does not update the frame under pandas Copy-on-Write.
train_set['Destination'] = train_set['Destination'].fillna('TRAPPIST-1e')
test_set['Destination'] = test_set['Destination'].fillna('TRAPPIST-1e')

In [None]:
train_set.groupby('Destination').agg({'Destination':'count'})

In [None]:
test_set.groupby('Destination').agg({'Destination':'count'})

In [None]:
# One Hot Encoding of Destination (first level dropped as the reference).
# Both frames are encoded against the levels found in the training set so that
# Destination_0 / Destination_1 mean the same thing in train and test; encoding
# each frame independently can misalign or fail when a level is missing.
dest_dtype = pd.CategoricalDtype(sorted(train_set['Destination'].dropna().unique()))
for frame in (train_set, test_set):
  frame[['Destination_0','Destination_1']] = pd.get_dummies(frame['Destination'].astype(dest_dtype), drop_first=True)

In [None]:
train_set.isnull().sum()

In [None]:
test_set.isnull().sum()

## Dropping unwanted values

In [None]:
train_set.columns

In [None]:
# Drop the raw columns that were replaced by scaled or encoded versions, plus
# Name and the helper Total_Spending; the same list is applied to both frames.
unwanted_columns = [
    'Total_Spending', 'Name',
    'VRDeck', 'Spa', 'ShoppingMall', 'FoodCourt', 'RoomService',
    'Age', 'Destination', 'HomePlanet',
    'Cabin_Code_0', 'Cabin_Code_2',
]
train_set_select = train_set.drop(columns=unwanted_columns)
test_set_select = test_set.drop(columns=unwanted_columns)

# Splitting X and Y

In [None]:
train_set_select.columns

In [None]:
# Separate the passenger ids, the target and the feature matrix.
id_train = train_set['PassengerId']
y_train = train_set_select['Transported']
X_train = train_set_select.drop(columns=['PassengerId','Transported'])

In [None]:
X_train.head()

In [None]:
X_train_val = X_train.values
y_train_val = y_train.values

In [None]:
X_train_val.shape

In [None]:
y_train_val.shape

In [None]:
X_tr_split_val, X_va_split_val, y_tr_split_val, y_va_split_val = train_test_split(X_train_val, y_train_val, test_size=0.30, random_state=1, shuffle=True)

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

In [None]:
X_test = test_set_select.drop(columns=['PassengerId'], axis=1)

In [None]:
id_test = pd.DataFrame(test_set_select['PassengerId'])

In [None]:
id_test.head()

In [None]:
X_test_val = X_test.values

In [None]:
print('X Test shape = ', X_test_val.shape)

# Building ML Models

## Model 1 - Logistic Regression

### Model Building

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

In [None]:
# Initialising LR
# NOTE(review): max_iter=10 is well below scikit-learn's default of 100, so the
# solver may stop before converging (ConvergenceWarning) — confirm this cap is intended.
calssifer_LR_1 = LogisticRegression(random_state=1, max_iter=10)

# Train the model on the training split
calssifer_LR_1.fit(X_tr_split_val, y_tr_split_val)

### Prediction

In [None]:
# predict function under LogisticRegression
y_pred_1 = calssifer_LR_1.predict(X_va_split_val)

### Evaluation

In [None]:
print('Accuracy Score =',accuracy_score(y_va_split_val,y_pred_1))
print('classification report =\n',classification_report(y_va_split_val,y_pred_1))

## Model 2 - Decision Tree

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

### Model Building

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Initialise the DT Classifier
classifier_DT_1 = DecisionTreeClassifier(random_state=1)

# Train the model
classifier_DT_1.fit(X_tr_split_val,y_tr_split_val)

### Prediction

In [None]:
# Predict the value
y_pred_DT_1 = classifier_DT_1.predict(X_va_split_val)

### Evaluation

In [None]:
print('Accuracy score = ', accuracy_score(y_va_split_val,y_pred_DT_1))
print('Classification Report =\n', classification_report(y_va_split_val,y_pred_DT_1))

## Model 3 - Decision Tree (Entropy Criterion)

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

### Model Building

In [None]:
# Initialise the DT Classifier
classifier_DT_2 = DecisionTreeClassifier(random_state=1,criterion='entropy')

# Train the model
classifier_DT_2.fit(X_tr_split_val,y_tr_split_val)

### Prediction

In [None]:
# Predict the value
y_pred_DT_2 = classifier_DT_2.predict(X_va_split_val)

### Evaluation

In [None]:
print('Accuracy score = ', accuracy_score(y_va_split_val,y_pred_DT_2))
print('Classification Report =\n', classification_report(y_va_split_val,y_pred_DT_2))

## Model 4 - Decision Tree with Pruning

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

### Model Building

In [None]:
# Initialise the DT Classifier
classifier_DT_3 = DecisionTreeClassifier(random_state=1,criterion='entropy', max_depth=5)

# Train the model
classifier_DT_3.fit(X_tr_split_val,y_tr_split_val)

### Prediction

In [None]:
# Predict the value
y_pred_DT_3 = classifier_DT_3.predict(X_va_split_val)

### Evaluation

In [None]:
print('Accuracy score = ', accuracy_score(y_va_split_val,y_pred_DT_3))
print('Classification Report =\n', classification_report(y_va_split_val,y_pred_DT_3))

## Model 5 - Random Forest

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Model Building

In [None]:
# Initialise the RF Classifier
classifier_RF_1 = RandomForestClassifier(n_estimators=500,random_state=1,criterion='entropy')

# Train the model
classifier_RF_1.fit(X_tr_split_val,y_tr_split_val)

### Prediction

In [None]:
# Predict the value
y_pred_RF_1 = classifier_RF_1.predict(X_va_split_val)

### Evaluation

In [None]:
print('Accuracy score = ', accuracy_score(y_va_split_val,y_pred_RF_1))
print('Classification Report =\n', classification_report(y_va_split_val,y_pred_RF_1))

## Model 6 - Neural Networks

In [None]:
y_tr_split_val = y_tr_split_val.reshape(len(y_tr_split_val),1)

In [None]:
y_va_split_val = y_va_split_val.reshape(len(y_va_split_val),1)

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

In [None]:
X_tr_split_val = np.asarray(X_tr_split_val).astype('float32')
y_tr_split_val = np.asarray(y_tr_split_val).astype('float32')
X_va_split_val = np.asarray(X_va_split_val).astype('float32')
y_va_split_val = np.asarray(y_va_split_val).astype('float32')

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout

In [None]:
from keras import Input

### Model Building

In [None]:
tf.keras.backend.clear_session()

# Fully connected binary classifier: three identical hidden stages
# (Dense 100 + ReLU -> Dropout 0.2 -> BatchNormalization) followed by a single
# sigmoid output unit.
n_features = X_tr_split_val.shape[1]
classifier_DNN_1 = Sequential([
    # Hidden stage 1 (also declares the input width)
    Dense(100, input_shape=(n_features,), activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    # Hidden stage 2
    Dense(100, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    # Hidden stage 3
    Dense(100, activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    # Output
    Dense(units=1, activation='sigmoid'),
])

In [None]:
# Compile for binary classification.
# The metric must be the lowercase string 'accuracy': Keras resolves that to
# binary accuracy for this loss/output, whereas 'Accuracy' selects the
# exact-match metric, which compares labels with raw sigmoid probabilities and
# reports a meaningless near-zero score.
classifier_DNN_1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
classifier_DNN_1.summary()

### Training

In [None]:
# fit network
history = classifier_DNN_1.fit(X_tr_split_val, y_tr_split_val, epochs=50, batch_size=32,
                               validation_split=0.25,
                               #callbacks=[model_checkpoint_callback],
                               verbose=2, shuffle=True)

### Prediction

In [None]:
# Predict the value
y_pred_proba_DNN_1 = classifier_DNN_1.predict(X_va_split_val)
y_pred_DNN_1 = y_pred_proba_DNN_1 > 0.5

In [None]:
y_pred_proba_DNN_1[:5]

In [None]:
y_pred_DNN_1[:5]

In [None]:
y_pred_DNN_1_fl = y_pred_DNN_1.astype(float)

### Evaluation

In [None]:
print('Accuracy score = ', accuracy_score(y_va_split_val,y_pred_DNN_1_fl))
print('Classification Report =\n', classification_report(y_va_split_val,y_pred_DNN_1_fl))

### Submission

In [None]:
id_test.head()

In [None]:
X_test_val = np.asarray(X_test_val).astype('float32')

In [None]:
# Predict the value
y_pred_proba_DNN_1 = classifier_DNN_1.predict(X_test_val)
y_pred_DNN_1 = y_pred_proba_DNN_1 > 0.5

In [None]:
id_test['Transported'] = y_pred_DNN_1

In [None]:
id_test.to_csv('/kaggle/working/submission.csv',  index=False)

## Model 7 - Neural Networks

In [None]:
y_tr_split_val = y_tr_split_val.reshape(len(y_tr_split_val),1)

In [None]:
y_va_split_val = y_va_split_val.reshape(len(y_va_split_val),1)

In [None]:
print('X Train shape = ', X_tr_split_val.shape)
print('y Train shape = ', y_tr_split_val.shape)
print('X Validation shape = ', X_va_split_val.shape)
print('y Validation shape = ', y_va_split_val.shape)

### Model Building

In [None]:
tf.keras.backend.clear_session()

# Funnel-shaped binary classifier: five hidden stages of shrinking width, each
# Dense + ReLU -> Dropout 0.3 -> BatchNormalization, then one sigmoid output unit.
classifier_DNN_2 = Sequential()
hidden_widths = [100, 80, 60, 40, 20]
for position, width in enumerate(hidden_widths):
  if position == 0:
    # The first layer also declares the input width
    classifier_DNN_2.add(Dense(width, input_shape=(X_tr_split_val.shape[1],), activation='relu'))
  else:
    classifier_DNN_2.add(Dense(width, activation='relu'))
  classifier_DNN_2.add(Dropout(0.3))
  classifier_DNN_2.add(BatchNormalization())

# Layer - Output
classifier_DNN_2.add(Dense(units=1,activation='sigmoid'))

In [None]:
# Compile for binary classification.
# Use the lowercase 'accuracy' identifier so Keras picks binary accuracy;
# 'Accuracy' selects the exact-match metric, which compares labels against raw
# sigmoid probabilities and reports a meaningless near-zero score.
classifier_DNN_2.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
classifier_DNN_2.summary()

### Training

In [None]:
# fit network
history = classifier_DNN_2.fit(X_tr_split_val, y_tr_split_val, epochs=50, batch_size=32,
                               validation_split=0.25,
                               #callbacks=[model_checkpoint_callback],
                               verbose=2, shuffle=True)

In [None]:
# fit network
history = classifier_DNN_2.fit(X_tr_split_val, y_tr_split_val, initial_epoch=50, epochs=100, batch_size=16,
                               validation_split=0.25,
                               #callbacks=[model_checkpoint_callback],
                               verbose=2, shuffle=True)

### Prediction

In [None]:
# Predict the value
y_pred_proba_DNN_2 = classifier_DNN_2.predict(X_va_split_val)
y_pred_DNN_2 = y_pred_proba_DNN_2 > 0.5

In [None]:
y_pred_proba_DNN_2[:5]

In [None]:
y_pred_DNN_2[:5]

In [None]:
y_pred_DNN_2_fl = y_pred_DNN_2.astype(float)

### Evaluation

In [None]:
print('Accuracy score = ', accuracy_score(y_va_split_val,y_pred_DNN_2_fl))
print('Classification Report =\n', classification_report(y_va_split_val,y_pred_DNN_2_fl))

### Submission file

In [None]:
print('X Test shape = ', X_test_val.shape)

In [None]:
# Predict the value
y_test_proba_DNN_2 = classifier_DNN_2.predict(X_test_val)
y_test_DNN_2 = y_test_proba_DNN_2 > 0.5

In [None]:
y_test_DNN_2[:5]

In [None]:
id_test.head()

In [None]:
id_test['Transported'] = y_test_DNN_2

In [None]:
id_test.head()

In [None]:
# Write the Model 7 predictions as the submission file.
# Saved to the writable working directory (the Kaggle input folder is
# read-only) and without the DataFrame index, so the file contains only the
# PassengerId and Transported columns. This replaces the file written for
# Model 6 at the same path.
id_test.to_csv('/kaggle/working/submission.csv', index=False)

## Model 8 - AdaBoostClassifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

In [None]:
# Initialise the AB Classifier: AdaBoost over a 200-tree entropy random forest.
# NOTE(review): every boosting round fits a full forest, so this cell is slow.
classifier_AB_1 = AdaBoostClassifier(
    estimator=RandomForestClassifier(random_state=1, criterion='entropy', n_estimators=200),
    random_state=1,
)

# Train the model.
# The target was reshaped to a column vector (n, 1) for the neural networks
# above; scikit-learn expects a 1-D target, so flatten it here (a no-op if it
# is already 1-D).
classifier_AB_1.fit(X_tr_split_val, np.ravel(y_tr_split_val))

In [None]:
# Predict the value
y_pred_AB_1 = classifier_AB_1.predict(X_va_split_val)

In [None]:
print('Accuracy score = ', accuracy_score(y_va_split_val,y_pred_AB_1))
print('Classification Report =\n', classification_report(y_va_split_val,y_pred_AB_1))