In [1]:
# importing python libraries, that gives extra functionalities to to import data, clean the data, format the data, build and evaluate the XGBoost model.

import pandas as pd # for loading and manipulating data and for One-Hot Encoding
import numpy as np # calculating the mean and standard deviation
import matplotlib.pyplot as plt # for visualisation
import seaborn as sns  # for visualisation
import plotly.express as px  # for visualisation
from sklearn import metrics

import xgboost as xgb # import XGBoost 
#from xgboost import XGBClassifier
from sklearn.model_selection import  cross_val_score
from sklearn.metrics import plot_roc_curve
from sklearn.model_selection import train_test_split # for spliting data into training and testing sets
from sklearn.metrics import  balanced_accuracy_score, roc_auc_score, make_scorer # for scoring 
from sklearn.model_selection import GridSearchCV # for cross validation
from sklearn.metrics import confusion_matrix # to create a confusion matrix
from sklearn.metrics import plot_confusion_matrix # for drawing a confusion matrix
from sklearn.preprocessing import scale # for scaling and centring data
from sklearn.decomposition import PCA # to perform PCA for plotting the data
from sklearn.svm import SVC  # for making support vector machine for classification
from sklearn.utils import resample # for downsampling the dataset

%matplotlib inline

ImportError: cannot import name 'plot_roc_curve' from 'sklearn.metrics' (C:\Users\User2\anaconda3\lib\site-packages\sklearn\metrics\__init__.py)

In [None]:
# import sys
# !{sys.executable} -m pip install xgboost

In [None]:
# importing training data and looking at the data
train = pd.read_csv('train.csv')
train

In [None]:
# importing test data and looking at the data
test = pd.read_csv('/content/test.csv')
test

In [None]:
# checking the type of data in each column

train.dtypes

In [None]:
train.info()

In [None]:
train['Cabin'].unique

In [None]:
# dropping columns not usefull

train.drop(['Cabin','Name', 'PassengerId'], axis=1, inplace=True)

In [None]:
test.drop(['Cabin','Name'], axis=1, inplace=True)

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
train.describe()

In [None]:
train['HomePlanet'].fillna(train['HomePlanet'].mode()[0],inplace = True)
train['Age'].fillna(train['Age'].median(), inplace = True)
train['CryoSleep'].fillna(train['CryoSleep'].mode()[0],inplace = True)
train['Destination'].fillna(train['Destination'].mode()[0],inplace = True)
train['VIP'].fillna(train['VIP'].mode()[0],inplace = True)
train['RoomService'].fillna(train['RoomService'].median(), inplace = True)
train['FoodCourt'].fillna(train['FoodCourt'].median(), inplace = True)
train['ShoppingMall'].fillna(train['ShoppingMall'].median(), inplace = True)
train['Spa'].fillna(train['Spa'].median(), inplace = True)
train['VRDeck'].fillna(train['VRDeck'].median(), inplace = True)

In [None]:
test['HomePlanet'].fillna(test['HomePlanet'].mode()[0],inplace = True)
test['Age'].fillna(test['Age'].median(), inplace = True)
test['CryoSleep'].fillna(test['CryoSleep'].mode()[0],inplace = True)
test['Destination'].fillna(test['Destination'].mode()[0],inplace = True)
test['VIP'].fillna(test['VIP'].mode()[0],inplace = True)
test['RoomService'].fillna(test['RoomService'].median(), inplace = True)
test['FoodCourt'].fillna(test['FoodCourt'].median(), inplace = True)
test['ShoppingMall'].fillna(test['ShoppingMall'].median(), inplace = True)
test['Spa'].fillna(test['Spa'].median(), inplace = True)
test['VRDeck'].fillna(test['VRDeck'].median(), inplace = True)

In [None]:
train.isnull().sum()

In [None]:
test.isnull().sum()

In [None]:
train['Transported'].value_counts().plot.pie(autopct='%0.2f%%')

In [None]:
sns.countplot(y=train.HomePlanet)

In [None]:
sns.countplot(y=train.CryoSleep)

In [None]:
sns.countplot(y=train.Destination)

In [None]:
sns.countplot(y=train.VIP)

In [None]:
sns.histplot(x=train.Age,kde=True)

In [None]:
# checking the correlation of the features
train.corr()

In [None]:
# visualizing the correlation of the features
plt.figure(figsize=(12, 6))
sns.heatmap(train.corr(), annot=True, cmap='viridis')

In [None]:
# spliting the data into Dependent and independent variables

X = train.drop('Transported', axis=1)
y = train['Transported']

In [None]:
# Format data with one-hot encoding
X_encoded = pd.get_dummies(X, columns=['HomePlanet',
                                               'CryoSleep',
                                               'Destination',
                                               'VIP',
])

X_encoded.head()

In [None]:
X_encoded.shape

In [None]:
# spliting the encoded dataset

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, random_state=42, test_size=0.3, stratify=y)

# Support Vector Machine (SVM)

In [None]:
# scalling the dataset

X_train_scaled = scale(X_train)
X_test_scaled = scale(X_test)

In [None]:
X_train_scaled.shape

In [None]:
X_test_scaled.shape

In [None]:
clf_svm = SVC(random_state=42)
clf_svm.fit(X_train_scaled, y_train)
clf_svm.score(X_train_scaled, y_train)

In [None]:
plot_confusion_matrix(clf_svm,
                      X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=["Not Transported", "Transported"])

In [None]:
# param_grid = [
#     {'C':[0.5, 1, 10, 100],
#      'gamma': ['scale', 1, 0.1, 0.001, 0.0001],
#      'kernel':['rbf']},
    
# ]

# optimal_params = GridSearchCV(
#     SVC(),
#     param_grid,
#     cv=5,
#     scoring='accuracy',
#     verbose=0
# )

# optimal_params.fit(X_train_scaled, y_train)
# print(optimal_params.best_params_)

In [None]:
clf_svm = SVC(random_state=42, C=1, gamma=0.1)
clf_svm.fit(X_train_scaled, y_train)
clf_svm.score(X_train_scaled, y_train)

In [None]:
plot_confusion_matrix(clf_svm,
                      X_test_scaled,
                      y_test,
                      values_format='d',
                      display_labels=["Not Transported", "Transported"])

# XGBOOST

In [None]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
clf_xgb = xgb.XGBClassifier(objective='binary:logistic', missing=None, seed=42)
clf_xgb.fit(X_train,
            y_train,
            verbose=False,
            early_stopping_rounds=10,
            eval_metric='aucpr',
            eval_set=[(X_test, y_test)])

In [None]:
plot_confusion_matrix(clf_xgb,
                      X_test,
                      y_test,
                      values_format='d',
                      display_labels=["Not Transported", "Transported"])

In [None]:
#param_grid = {
#    'max_depth': [3, 4, 5],
#    'learning_rate': [0.1, 0.01, 0.05],
#    'gamma': [0,0.25, 1.0],
#    'reg_lambda': [0, 1.0, 10.0],
#}

# param_grid = {
#    'max_depth': [5],
#     'learning_rate': [0.1],
#     'gamma': [0],
#     'reg_lambda': [10],
           
# }

# #colsample_bytree = 0.5),
# optimal_params = GridSearchCV(estimator=xgb.XGBClassifier(objective='binary:logistic',
#                                                           seed=42,
#                                                           subsample=0.9,
#                                                           colsample_bytree = 0.5),
                                                          
#                               param_grid=param_grid,
#                               scoring='roc_auc',
#                               verbose=0,
#                               n_jobs=10,
#                               cv = 3)

# optimal_params.fit(X_train,
#                    y_train,
#                    early_stopping_rounds=10,
#                    eval_metric='auc',
#                    eval_set=[(X_test, y_test)],
#                    verbose=False)
        
# print(optimal_params.best_params_)

# Round 1  {'gamma': 0.25, 'learning_rate': 0.1, 'max_depth': 3, 'reg_lambda': 1.0}
# Round 2  {'gamma': 0.25, 'learning_rate': 0.25, 'max_depth': 4, 'reg_lambda': 1.0}


In [None]:
clf_xgb = xgb.XGBClassifier(seed=42,
                            objective='binary:logistic',
                            gamma=0,
                            learning_rate=0.1,
                            max_depth=5,
                            reg_lambda=10,
                            subsample=0.9,
                            colsample_bytree=0.5)
clf_xgb.fit(X_train,
             y_train,
             verbose=False,
             early_stopping_rounds=10,
             eval_metric='aucpr',
             eval_set=[(X_test, y_test)])
clf_xgb.score(X_train, y_train)   

In [None]:
plot_confusion_matrix(clf_xgb,
                      X_test,
                      y_test,
                      values_format='d',
                      display_labels=["Not Transported", "Transported"])

In [None]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

logreg.fit(X_train_scaled, y_train)

logreg.score(X_train_scaled, y_train)

In [None]:
plot_confusion_matrix(logreg,
                      X_test,
                      y_test,
                      values_format='d',
                      display_labels=["Not Transported", "Transported"])

In [None]:
predictions=logreg.predict(X_test)
predictions[0:10]

In [None]:
#output = pd.DataFrame({'PassengerId': y_test.index, 'Transported': predictions})
#output.to_csv('submission.csv', index=False)

In [None]:
#output['Transported'] =output['Transported'].apply(lambda x:True if x==1 else False)

In [None]:
#output.to_csv('submission3.csv', index=False)

In [None]:
output = pd.DataFrame({'PassengerId': y_test.index, 'Transported': predictions})