## Import module and dataset

In [None]:
import pandas as pd
import numpy as np
import os as os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
#warnings.filterwarnings("default")

In [None]:
df_train = pd.read_csv('./spaceship-titanic/train.csv')
df_test = pd.read_csv('./spaceship-titanic/test.csv')

## 1 Check Dataset

### 1.1 Preview Dataset

In [None]:
print('Train dataset preview:\n')
df_train.head()

In [None]:
print('Test dataset preview:\n')
df_test.head()

### 1.2 Dimensions of Data

In [None]:
df_train.shape

In [None]:
df_test.shape

### 1.3 Types of Data

In [None]:
df_train.dtypes

In [None]:
col = df_train.columns[:-1].tolist()

In [None]:
col

In [None]:
col_label = df_train.columns[-1]

In [None]:
col_label

In [None]:
col_numeric = df_train._get_numeric_data().columns.tolist()

In [None]:
col_numeric.remove(col_label)

In [None]:
col_numeric 

In [None]:
# Categorical features = all feature columns that are not numeric.
# `{col_label}` is a one-element set holding the label name; `set(col_label)`
# would instead split the label string into a set of single characters.
col_categorical = list(set(col) - set(col_numeric) - {col_label})

In [None]:
col_categorical

### 1.4 Statistics of Data

In [None]:
df_train.describe()

In [None]:
df_test.describe()

### 1.5 Missing Value Ratio

In [None]:
df_missing_train = pd.DataFrame(data = df_train.isna().sum(), columns = ['# of Missing Value'])
df_missing_train['% of Missing Value'] = round(df_train.isna().sum() /df_train.shape[0] *100,2)
print('train dataset missing count/rate:\n')
display(df_missing_train)

In [None]:
df_missing_test = pd.DataFrame(data = df_test.isna().sum(), columns = ['# of Missing Value'])
df_missing_test['% of Missing Value'] = round(df_test.isna().sum() /df_test.shape[0] *100,2)
print('test dataset missing count/rate:\n')
display(df_missing_test)

### 1.6 Cardinality of Data

In [None]:
# Categorical Cardinality
df_train[col_categorical].nunique()

In [None]:
pd.unique(df_train['HomePlanet'])

In [None]:
pd.unique(df_train['Destination'])

In [None]:
df_test[col_categorical].nunique()

## 2 EDA

### 2.1 Balance of Label

In [None]:
df_label = df_train.groupby(col_label, as_index =True).size()

In [None]:
df_label

In [None]:
plt.figure(figsize=(10,6))
plt.pie(df_train[col_label].value_counts(),labels=df_train[col_label].value_counts().keys(),autopct="%1.1f%%",
       textprops={"fontsize":20,"fontweight":"black"},colors=sns.color_palette("Set2"))
plt.title("Label Distribution: {}".format(col_label))

### 2.2 Numeric Variables vs. Label

In [None]:
col_numeric

In [None]:
plt.figure(figsize=(14,10))
l = len(col_numeric)
n_bins = 30

for idx,f in enumerate(col_numeric):
    #print(idx, f)
    plt.subplot(3,l//3+1,idx+1)
    sns.histplot(x=f,hue=col_label,data=df_train,bins =n_bins,kde=True,multiple="dodge")
    plt.title("Feature Distribution: {}".format(f))
    plt.ylim(0, df_train.shape[0]/n_bins*3)
    plt.tight_layout()

In [None]:
# Plot individual boxplots for each variable
plt.figure(figsize=(12, 8))

for idx,column in enumerate(col_numeric):
    #print(idx, f)
    plt.subplot(3,l//3+1,idx+1)
    #plt.subplot(2, 3, df_train.columns.get_loc(column) + 1)
    sns.boxplot(x=df_train[column])
    plt.title(column)

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()

### 2.3 Categorical Variables vs. Label

In [None]:
df_cardinalitiy = df_train[col_categorical].nunique() 

In [None]:
df_cardinalitiy

In [None]:
col_card_less_than10 = df_cardinalitiy[df_cardinalitiy<=10].index.tolist()

In [None]:
col_categorical_less10 = list(set(col_categorical) & set(col_card_less_than10))

In [None]:
col_categorical

In [None]:
col_categorical_less10

In [None]:
l = len(col_categorical_less10)

plt.figure(figsize=(12,8))
for idx,column in enumerate(col_categorical_less10):
    plt.subplot(l//2+1,2,idx+1)
    sns.countplot(x=column, hue=col_label, data=df_train)
    plt.title("Feature Distribution: {}".format(column))
    plt.tight_layout()

### 2.4 Correlation

In [None]:
# Correlation heatmap over the numeric and boolean columns only.
# String columns are excluded explicitly because pandas >= 2.0 no longer drops
# them silently in DataFrame.corr() and raises on non-numeric data instead.
plt.figure(figsize=(10, 6))
corr_matrix = df_train.select_dtypes(include=['number', 'bool']).corr()
heatmap = sns.heatmap(corr_matrix)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12},pad=12)

## 3 Feature Engineering

### 3.0 Preprocessing data: Cabin

In [None]:
# Work on a copy of the training frame and split 'Cabin' on '/' into three
# new columns. Non-string entries (e.g. missing values) become NaN in each one.
df_train_c = df_train.copy()
# First '/'-separated part, kept as a string.
df_train_c['Cabin_1'] = df_train_c['Cabin'].map(
    lambda cabin: cabin.split('/')[0] if isinstance(cabin, str) else np.nan
)
# Second part, converted to an integer.
df_train_c['Cabin_2'] = df_train_c['Cabin'].map(
    lambda cabin: int(cabin.split('/')[1]) if isinstance(cabin, str) else np.nan
)
# Third part, kept as a string.
df_train_c['Cabin_3'] = df_train_c['Cabin'].map(
    lambda cabin: cabin.split('/')[2] if isinstance(cabin, str) else np.nan
)

In [None]:
df_train_c

In [None]:
## Additional Visuals

plt.figure(figsize=(8,4))
for idx,column in enumerate(['Cabin_1','Cabin_3']):
    plt.subplot(1,2,idx+1)
    sns.countplot(x=column, hue=col_label, data=df_train_c)
    plt.title("Feature Distribution: {}".format(column))
    plt.tight_layout()

In [None]:
plt.figure(figsize=(5,3))

n_bins = 30

for idx,f in enumerate(['Cabin_2']):
    sns.histplot(x=f,hue=col_label,data=df_train_c,bins =n_bins,kde=True)
    plt.title("Feature Distribution: {}".format(f))
    # plt.ylim(0, df_train.shape[0]/n_bins)
    plt.tight_layout()

In [None]:
# Same 'Cabin' split as for the training frame, applied to a copy of the test
# frame. Non-string entries (e.g. missing values) become NaN in each column.
df_test_c = df_test.copy()
# First '/'-separated part, kept as a string.
df_test_c['Cabin_1'] = df_test_c['Cabin'].map(
    lambda cabin: cabin.split('/')[0] if isinstance(cabin, str) else np.nan
)
# Second part, converted to an integer.
df_test_c['Cabin_2'] = df_test_c['Cabin'].map(
    lambda cabin: int(cabin.split('/')[1]) if isinstance(cabin, str) else np.nan
)
# Third part, kept as a string.
df_test_c['Cabin_3'] = df_test_c['Cabin'].map(
    lambda cabin: cabin.split('/')[2] if isinstance(cabin, str) else np.nan
)

In [None]:
col_c = df_train_c.columns.tolist()

In [None]:
col_label_c = col_label

In [None]:
col_numeric_c = df_train_c._get_numeric_data().columns.tolist()
col_numeric_c.remove(col_label_c)

In [None]:
# Categorical columns = all columns that are neither numeric nor the label.
# `{col_label_c}` removes the label by name; the previous `set(col_label_c)`
# was a set of single characters that removed nothing, which is why a separate
# `.remove(col_label_c)` call used to be needed afterwards.
col_categorical_c = list(set(col_c) - set(col_numeric_c) - {col_label_c})

In [None]:
df_train_c.to_csv('dataset_train_cabin.csv',index=False)

In [None]:
df_test_c.to_csv('dataset_test_cabin.csv',index=False)

### 3.1 Fill in Null Value (Model Method, revisit)

In [None]:
## Numeric variable: 
## Categorical variable: mode

In [None]:
def fillInNull(df_train_tmp, df_test_tmp,col_num, col_cat):
    """Impute missing values in both frames using statistics of the training frame.

    Numeric columns are split by the absolute skewness measured on the
    training frame: columns with |skew| <= 3 are filled with the training
    mean, columns with |skew| > 3 with the training median. Columns whose
    skewness is NaN match neither rule and are left untouched. Categorical
    columns are filled with the most frequent training value.

    Both frames are modified in place (callers keep using the objects they
    passed in) and are also returned.

    Parameters
    ----------
    df_train_tmp : pandas.DataFrame
        Training frame; the source of every fill value.
    df_test_tmp : pandas.DataFrame
        Test frame; filled with the values computed on the training frame.
    col_num : list of str
        Names of the numeric columns to impute.
    col_cat : list of str
        Names of the categorical columns to impute.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_tmp, df_test_tmp)``, the same objects that were passed in.
    """
    def _fill_both(column, value):
        # Assign the filled column back instead of calling
        # `df[column].fillna(..., inplace=True)`: the chained in-place form
        # acts on a temporary object under pandas copy-on-write and would
        # leave the frame unchanged.
        df_train_tmp[column] = df_train_tmp[column].fillna(value)
        df_test_tmp[column] = df_test_tmp[column].fillna(value)

    skewness = df_train_tmp[col_num].skew()
    cols_mean = skewness[skewness.abs() <= 3].index.tolist()
    cols_median = skewness[skewness.abs() > 3].index.tolist()

    # Roughly symmetric columns: the mean is a reasonable central value.
    for column in cols_mean:
        _fill_both(column, df_train_tmp[column].mean())

    # Heavily skewed columns: use the median (this branch used to call
    # .mean(), which made the skewness split a no-op).
    for column in cols_median:
        _fill_both(column, df_train_tmp[column].median())

    # Categorical columns: most frequent non-null training value.
    for column in col_cat:
        counts = df_train_tmp[column].value_counts()
        if counts.empty:
            # Column has no non-null training value, so there is nothing to
            # impute with; leave it as is instead of raising IndexError.
            continue
        _fill_both(column, counts.index[0])

    return df_train_tmp, df_test_tmp

In [None]:
df_train_opt, df_test_opt = fillInNull(df_train_c, df_test_c,col_numeric_c, col_categorical_c )

In [None]:
# Check how many missing value
df_train_opt.isna().sum().sum()

In [None]:
df_train_opt

In [None]:
col_numeric_c

In [None]:
col_categorical_c

### 3.2 Get 0/1 for True/False Variable

In [None]:
df_train_c.head()

In [None]:
df_train_c.replace({True: 1, False: 0}, inplace = True)

In [None]:
df_test_c.replace({True: 1, False: 0}, inplace = True)

In [None]:
df_train_c.head()

### 3.3 One-hot Encoding for Multi-class Variables

In [None]:
col_categorical_less10 = col_categorical_less10 + ['Cabin_1','Cabin_3']
col_categorical_less10

In [None]:
df_train_c_encoding = df_train_c.copy()
df_test_c_encoding = df_test_c.copy()

In [None]:
# One-hot encode every low-cardinality categorical column in train and test.
# The dummy columns are derived from the training data and the test frame is
# aligned to them, so both frames end up with identical encoded columns even
# if a category is missing from (or only present in) the test data.
for col in col_categorical_less10:
    # dtype=int keeps the dummies as 0/1 integers regardless of pandas version
    # (newer pandas defaults to bool).
    df_tmp = pd.get_dummies(df_train_c_encoding[col], prefix = col, dtype = int)
    # Drop the last level: it is implied by the others (reference category).
    kept_columns = df_tmp.columns[:-1]
    df_train_c_encoding = pd.concat(
        [df_train_c_encoding.drop(columns = col), df_tmp[kept_columns]], axis = 1)

    df_tmp = pd.get_dummies(df_test_c_encoding[col], prefix = col, dtype = int)
    # Keep exactly the training columns; a level unseen in test becomes all 0.
    df_tmp = df_tmp.reindex(columns = kept_columns, fill_value = 0)
    df_test_c_encoding = pd.concat(
        [df_test_c_encoding.drop(columns = col), df_tmp], axis = 1)
    

In [None]:
col_categorical_less10

In [None]:
df_train_c_encoding.columns

In [None]:
df_test_c_encoding.columns

### 3.4 Feature Scaling for Numeric Variables

In [None]:
col_numeric_c

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [None]:
df_train_c_encoding_scale = df_train_c_encoding.copy()
df_test_c_encoding_scale = df_test_c_encoding.copy()

In [None]:
df_train_c_encoding_scale[col_numeric_c] = sc.fit_transform(df_train_c_encoding_scale[col_numeric_c])


In [None]:
df_test_c_encoding_scale[col_numeric_c] = sc.transform(df_test_c_encoding_scale[col_numeric_c])

In [None]:
col_numeric_c

In [None]:
col_categorical_c

In [None]:
df_train_c_encoding_scale.columns

In [None]:
col_label = col_label
col_drop = ['PassengerId', 'Cabin','Name','Transported']

In [None]:
# df_train_c_encoding_scale.to_csv('dataset_train_final.csv', index =False)
# df_test_c_encoding_scale.to_csv('dataset_test_final.csv', index =False)

In [None]:
df_train_c_encoding_scale = pd.read_csv('dataset_train_final.csv')
df_test_c_encoding_scale = pd.read_csv('dataset_test_final.csv')

In [None]:
df_sample = df_train_c.copy()

In [None]:
df_sample.head()

In [None]:
df_sample.groupby('Cabin_1', as_index = False).agg({'Age':'mean','FoodCourt':['sum','mean']})

In [None]:
# NOTE(review): in the visible notebook `X` is only assigned later, in the
# "3.5 Train/Validation/Test Split" section, so this cell raises NameError on a
# fresh top-to-bottom run and only works once that later cell has been executed.
X.columns

In [None]:
# NOTE(review): in the visible notebook `X_report` is only assigned later, in
# the "3.5 Train/Validation/Test Split" section, so this cell raises NameError
# on a fresh top-to-bottom run.
X_report.columns

### 3.5 Train/Validation/Test Split

In [None]:
col_drop = ['PassengerId', 'Cabin','Name','Transported']
col_label = 'Transported'

In [None]:
# Target vector and feature matrix for training, plus the feature matrix of
# the competition test set (which has no label column to drop).
# `columns=` is used because the positional `axis` argument of DataFrame.drop
# was removed in pandas 2.0.
y = df_train_c_encoding_scale[col_label]
X = df_train_c_encoding_scale.drop(columns=col_drop)
X_report = df_test_c_encoding_scale.drop(columns=['PassengerId','Cabin','Name'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test \
    = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val \
    = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [None]:
X_train

In [None]:
y_train

## 4 Model Training

In [None]:
from sklearn import metrics 
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import uniform, randint

In [None]:
col_output = ['accuracy','precision','recall','f1score','auc','ckscore']

In [None]:
def mlEvaluationMetrics(y_true, y_pred, y_proba):
    """Score one set of binary predictions with six sklearn metrics.

    Parameters
    ----------
    y_true : true labels.
    y_pred : predicted hard labels.
    y_proba : predicted scores, used only for ROC AUC.

    Returns
    -------
    list
        ``[accuracy, precision, recall, f1score, auc, ckscore]``, the same
        order as the ``col_output`` column names used with this function.
    """
    # Metrics computed from the hard labels, in output order.
    label_scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    scores = [scorer(y_true, y_pred) for scorer in label_scorers]
    # ROC AUC is the only metric that uses the scores instead of the labels.
    scores.append(metrics.roc_auc_score(y_true, y_proba))
    scores.append(metrics.cohen_kappa_score(y_true, y_pred))
    return scores

### 4.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
LR = LogisticRegression()

In [None]:
LR.fit(X_train, y_train)

In [None]:
y_val_pred = LR.predict(X_val)
y_val_proba = LR.predict_proba(X_val)[:, 1]

y_train_pred = LR.predict(X_train)
y_train_proba = LR.predict_proba(X_train)[:, 1]

y_test_pred = LR.predict(X_test)
y_test_proba = LR.predict_proba(X_test)[:, 1]

In [None]:
output_train = mlEvaluationMetrics(y_train,y_train_pred,y_train_proba)

output_val = mlEvaluationMetrics(y_val,y_val_pred,y_val_proba)

output_test = mlEvaluationMetrics(y_test,y_test_pred,y_test_proba)

arr = np.array([output_train,output_val,output_test])

df_output = pd.DataFrame(data = arr, columns = col_output)

df_output

### 4.2 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
RF = RandomForestClassifier(n_estimators = 100)
# max_depth, max_features, n_estimators

In [None]:
RF.fit(X_train, y_train)

In [None]:
y_val_pred = RF.predict(X_val)
y_val_proba = RF.predict_proba(X_val)[:, 1]

y_train_pred = RF.predict(X_train)
y_train_proba = RF.predict_proba(X_train)[:, 1]

y_test_pred = RF.predict(X_test)
y_test_proba = RF.predict_proba(X_test)[:, 1]

In [None]:
RF.feature_importances_

In [None]:
RF.feature_names_in_

In [None]:
lst_importance = list(zip(RF.feature_importances_.round(2), RF.feature_names_in_))

In [None]:
sorted(lst_importance, key = lambda x : x[0] ,reverse = True)

In [None]:
output_train = mlEvaluationMetrics(y_train,y_train_pred,y_train_proba)

output_val = mlEvaluationMetrics(y_val,y_val_pred,y_val_proba)

output_test = mlEvaluationMetrics(y_test,y_test_pred,y_test_proba)

arr = np.array([output_train,output_val,output_test])

df_output = pd.DataFrame(data = arr, columns = col_output)

df_output

#### 4.2.1 Parameter Tuning, Random Search

In [None]:
# Define the random forest model and the hyperparameter ranges to sample from
RF_model = RandomForestClassifier()

# Candidate values for each hyperparameter; RandomizedSearchCV samples
# n_iter combinations from these.
param_distributions = {
    'n_estimators': np.arange(100, 1000, 100),
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': np.arange(2, 11),
    'min_samples_leaf': np.arange(1, 11)
}
random_search = RandomizedSearchCV(
    RF_model,
    param_distributions,
    n_iter=50,  # Number of parameter settings that are sampled
    cv=5,       # 5-fold cross-validation
    n_jobs=-1,  # Use all available CPU cores
    random_state=42
)

# Perform random search cross-validation to find the best hyperparameters
#random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42, scoring='accuracy')
random_search.fit(X_train, y_train)

In [None]:
# Print the best parameters and best score
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)

In [None]:

# Evaluate the best random-search model on the validation split (X_val / y_val)
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
accuracy = metrics.accuracy_score(y_val, y_val_pred)
confusion_mat = metrics.confusion_matrix(y_val, y_val_pred)

print("Best hyperparameters: ", random_search.best_params_)
# The score is computed on the validation split, so it is labelled as such.
print("Validation accuracy: ", accuracy)
print("Confusion matrix: ", confusion_mat)

In [None]:
y_val_pred = best_model.predict(X_val)
y_val_proba = best_model.predict_proba(X_val)[:, 1]

y_train_pred = best_model.predict(X_train)
y_train_proba = best_model.predict_proba(X_train)[:, 1]

y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]

In [None]:
output_train = mlEvaluationMetrics(y_train,y_train_pred,y_train_proba)

output_val = mlEvaluationMetrics(y_val,y_val_pred,y_val_proba)

output_test = mlEvaluationMetrics(y_test,y_test_pred,y_test_proba)

arr = np.array([output_train,output_val,output_test])

df_output = pd.DataFrame(data = arr, columns = col_output)

df_output

In [None]:
lst_importance = list(zip(best_model.feature_importances_.round(2), best_model.feature_names_in_))
sorted(lst_importance, key = lambda x:x[0], reverse =True)

In [None]:
# Predict on the competition test set and write the submission file.
y_pred_report = best_model.predict(X_report)
df_output = df_test_c_encoding_scale.copy()
# Convert only the predicted label to booleans. A frame-wide
# replace({1: True, 0: False}) would also rewrite every 0/1 value in the
# feature columns, which is unnecessary for the two columns written below.
df_output[col_label] = np.asarray(y_pred_report).astype(bool)
df_output[['PassengerId',col_label]].to_csv('mysubmission_rf.csv',index =False)

### 4.3 XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
XGB = XGBClassifier()
XGB.fit(X_train, y_train)

In [None]:
y_val_pred = XGB.predict(X_val)
y_val_proba = XGB.predict_proba(X_val)[:, 1]

y_train_pred = XGB.predict(X_train)
y_train_proba = XGB.predict_proba(X_train)[:, 1]

y_test_pred = XGB.predict(X_test)
y_test_proba = XGB.predict_proba(X_test)[:, 1]

In [None]:
output_train = mlEvaluationMetrics(y_train,y_train_pred,y_train_proba)

output_val = mlEvaluationMetrics(y_val,y_val_pred,y_val_proba)

output_test = mlEvaluationMetrics(y_test,y_test_pred,y_test_proba)

arr = np.array([output_train,output_val,output_test])

df_output = pd.DataFrame(data = arr, columns = col_output)

df_output

#### 4.3.1 Parameter Tuning, Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Define XGBoost model and hyperparameters to tune
xgb_model = XGBClassifier()
param_grid = {
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 150],
    'min_child_weight': [1, 3, 5]
}

# Perform grid search cross-validation to find the best hyperparameters
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

In [None]:

# Evaluate the best grid-search model on the validation split (X_val / y_val)
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
accuracy = metrics.accuracy_score(y_val, y_val_pred)
confusion_mat = metrics.confusion_matrix(y_val, y_val_pred)

print("Best hyperparameters: ", grid_search.best_params_)
# The score is computed on the validation split, so it is labelled as such.
print("Validation accuracy: ", accuracy)
print("Confusion matrix: ", confusion_mat)

#### 4.3.2 Parameter Tuning, Random Search

In [None]:
# Define XGBoost model and hyperparameter distributions to sample from
xgb_model = XGBClassifier()
param_dist = {
    # scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], so
    # uniform(0.1, 0.2) covers 0.1-0.3, the same range as the grid search
    # above. The previous uniform(0.1, 0.3) sampled from 0.1-0.4.
    'learning_rate': uniform(0.1, 0.2),
    # scipy.stats.randint(low, high) excludes `high`: 3..5, 50..149, 1..5.
    'max_depth': randint(3, 6),
    'n_estimators': randint(50, 150),
    'min_child_weight': randint(1, 6)
}

# Perform random search cross-validation to find the best hyperparameters
random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=10, cv=5, random_state=42, scoring='accuracy')
random_search.fit(X_train, y_train)

In [None]:

# Evaluate the best random-search model on the validation split (X_val / y_val)
best_model = random_search.best_estimator_
y_val_pred = best_model.predict(X_val)
accuracy = metrics.accuracy_score(y_val, y_val_pred)
confusion_mat = metrics.confusion_matrix(y_val, y_val_pred)

print("Best hyperparameters: ", random_search.best_params_)
# The score is computed on the validation split, so it is labelled as such.
print("Validation accuracy: ", accuracy)
print("Confusion matrix: ", confusion_mat)

In [None]:
lst_importance = list(zip(best_model.feature_importances_.round(2), best_model.feature_names_in_))
sorted(lst_importance, key = lambda x:x[0], reverse =True)

In [None]:
y_val_pred = best_model.predict(X_val)
y_val_proba = best_model.predict_proba(X_val)[:, 1]

y_train_pred = best_model.predict(X_train)
y_train_proba = best_model.predict_proba(X_train)[:, 1]

y_test_pred = best_model.predict(X_test)
y_test_proba = best_model.predict_proba(X_test)[:, 1]

In [None]:
output_train = mlEvaluationMetrics(y_train,y_train_pred,y_train_proba)

output_val = mlEvaluationMetrics(y_val,y_val_pred,y_val_proba)

output_test = mlEvaluationMetrics(y_test,y_test_pred,y_test_proba)

arr = np.array([output_train,output_val,output_test])

df_output = pd.DataFrame(data = arr, columns = col_output)

df_output

In [None]:
# Predict on the competition test set and write the submission file.
y_pred_report = best_model.predict(X_report)
df_output = df_test_c_encoding_scale.copy()
# Convert only the predicted label to booleans. A frame-wide
# replace({1: True, 0: False}) would also rewrite every 0/1 value in the
# feature columns, which is unnecessary for the two columns written below.
df_output[col_label] = np.asarray(y_pred_report).astype(bool)
df_output[['PassengerId',col_label]].to_csv('mysubmission_xgboost.csv',index =False)

### 4.4 KNN

In [None]:
from sklearn.neighbors import (NeighborhoodComponentsAnalysis,KNeighborsClassifier)

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
knn.fit(X_train, y_train)

In [None]:
y_val_pred = knn.predict(X_val)
y_val_proba = knn.predict_proba(X_val)[:, 1]

y_train_pred = knn.predict(X_train)
y_train_proba = knn.predict_proba(X_train)[:, 1]

y_test_pred = knn.predict(X_test)
y_test_proba = knn.predict_proba(X_test)[:, 1]

In [None]:
output_train = mlEvaluationMetrics(y_train,y_train_pred,y_train_proba)

output_val = mlEvaluationMetrics(y_val,y_val_pred,y_val_proba)

output_test = mlEvaluationMetrics(y_test,y_test_pred,y_test_proba)

arr = np.array([output_train,output_val,output_test])

df_output = pd.DataFrame(data = arr, columns = col_output)

df_output

### 4.5 MLP

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split


# Define the MLP model
class MLP(nn.Module):
    """Feed-forward network with one hidden layer and sigmoid activations.

    The input passes through ``fc1``, a sigmoid, ``fc2`` and a final sigmoid,
    so every output value lies in (0, 1).

    Parameters
    ----------
    input_size : int
        Number of input features.
    hidden_size : int
        Number of units in the hidden layer.
    output_size : int
        Number of output units.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Layers are created in the order fc1, fc2 so that parameter
        # initialisation (and the state_dict layout) stays the same.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid-activated output of the two-layer network for ``x``."""
        hidden = self.sigmoid(self.fc1(x))
        return self.sigmoid(self.fc2(hidden))

# Define the hyperparameters
input_size = len(X_train.columns)
hidden_size = 10
output_size = 1
learning_rate = 0.01
num_epochs = 100

# Instantiate the model, loss function, and optimizer
model = MLP(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Convert the training data to tensors once: the data does not change between
# epochs, so repeating the conversion inside the loop is wasted work.
inputs = torch.Tensor(X_train.values)
labels = torch.Tensor(y_train.values).view(-1, 1)

# Train the model; each epoch is one update over the entire training set
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    # Backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, loss.item()))

# Evaluate the model on the test set
with torch.no_grad():
    inputs = torch.Tensor(X_test.values)
    labels = torch.Tensor(y_test.values).view(-1, 1)
    outputs = model(inputs)
    # Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions
    predicted = (outputs > 0.5).float()
    accuracy = (predicted == labels).float().mean()
    print('Test Accuracy: {:.2f}%'.format(accuracy.item() * 100))


In [None]:
torch.save(model.state_dict(), 'model_weights_1.pth')

In [None]:
model1 = MLP(input_size, hidden_size, output_size)
model1.load_state_dict(torch.load('model_weights_1.pth'))

In [None]:
input_val = torch.Tensor(X_val.values)
ouptut_val = model1(input_val)
y_val_proba = ouptut_val.float().detach().numpy()
y_val_pred = (ouptut_val > 0.5).float().detach().numpy()

In [None]:
input_test = torch.Tensor(X_test.values)
ouptut_test = model1(input_test)
y_test_proba = ouptut_test.float().detach().numpy()
y_test_pred = (ouptut_test > 0.5).float().detach().numpy()

In [None]:
input_train = torch.Tensor(X_train.values)
ouptut_train = model1(input_train)
y_train_proba = ouptut_train.float().detach().numpy()
y_train_pred = (ouptut_train > 0.5).float().detach().numpy()

In [None]:
y_train_proba

In [None]:
output_train = mlEvaluationMetrics(y_train,y_train_pred,y_train_proba)

output_val = mlEvaluationMetrics(y_val,y_val_pred,y_val_proba)

output_test = mlEvaluationMetrics(y_test,y_test_pred,y_test_proba)

arr = np.array([output_train,output_val,output_test])

df_output = pd.DataFrame(data = arr, columns = col_output)

df_output

## 5 Prediction and Evaluation

In [None]:
## Check RF vs. Xgboost
df_RF_pred = pd.read_csv('mysubmission_rf.csv')

In [None]:

df_XGB_pred = pd.read_csv('mysubmission_xgboost.csv')

In [None]:
df_XGB_pred

In [None]:
df_RF_pred

In [None]:
df_RF_XGB_pred = df_RF_pred.merge(df_XGB_pred,  on = 'PassengerId',
                 how='inner', suffixes=('_RF', '_XGB'))

In [None]:
df_RF_XGB_pred.groupby(by = ['Transported_RF','Transported_XGB'], as_index =False).size()