In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors  import KNeighborsClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier

# **Reading the training and the testing data**

In [None]:
train=pd.read_csv("../input/spaceship-titanic/train.csv")
pred=pd.read_csv("../input/spaceship-titanic/test.csv")

# **Displaying the first 5 rows of the training and testing data**

In [None]:
train.head()

In [None]:
pred.head()

In [None]:
#number of rows in test and train
print("Number of rows in Train dataset: ",train.shape[0], " Number of rows in Test dataset: ", pred.shape[0])

In [None]:
# Number of columns in the train and test datasets.
# The train figure must come from `train`; it previously printed pred.shape[1] twice.
print("Number of columns in Train dataset: ",train.shape[1], " Number of columns in Test dataset: ", pred.shape[1])

In [None]:
#Information about the training dataset
train.describe(include="all")

# **Cleaning the dataset:**
1.     Removing the Null values
2.     Dropping the unwanted columns
3.     Adding New columns if needed
4.     Formatting the columns

In [None]:
# Summarise how many values are missing in each column
def percentagenull(df):
    """Return the null count and null percentage of every column of ``df``.

    The result is a DataFrame indexed by column name with two columns,
    'the Count' (number of nulls) and 'Percentage of null' (share of rows that
    are null, in percent), ordered from the most to the least nulls.
    """
    null_counts = df.isna().sum().sort_values(ascending=False)
    # Every column has len(df) entries, so the share is simply count / rows.
    null_share = null_counts / len(df) * 100
    return pd.concat(
        {'the Count': null_counts, 'Percentage of null': null_share},
        axis=1,
    )
    

In [None]:
percentagenull(train)

In [None]:
percentagenull(pred)

**We can see that every column has an equal percentage of null values**

In [None]:
# Mark every missing value with the string "null" in both datasets
# before the columns are split into new ones
for frame in (train, pred):
    frame.replace(np.nan, "null", inplace=True)

**We can see that the cabin has the format Deck/Num/Side, so we split it and turn each part into its own column**

In [None]:
# Split the "Deck/Num/Side" cabin string into three separate columns
def addcolumns(df):
    """Return a new frame equal to ``df`` plus ``Deck``, ``Num`` and ``Side`` columns.

    Each ``Cabin`` value is expected to look like ``"Deck/Num/Side"``
    (for example ``"B/0/P"``) and is split on ``"/"``.

    * A missing cabin (a real NaN or the ``"null"`` placeholder string) yields
      ``"null"`` in all three new columns.
    * A cabin with fewer than three parts is padded with ``"null"``.

    The new columns are created on ``df``'s own index so they line up
    row-for-row with ``df``. Building them on a fresh 0..n-1 index would make
    ``pd.concat(axis=1)`` align on mismatching labels and add NaN-filled rows
    whenever ``df`` does not have a default index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Cabin`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    placeholder = "null"
    decks, nums, sides = [], [], []
    for value in df["Cabin"].tolist():
        if pd.isna(value) or value == placeholder:
            deck = num = side = placeholder
        else:
            # maxsplit=2 keeps everything after the second "/" in the last part.
            parts = str(value).split("/", 2)
            parts += [placeholder] * (3 - len(parts))
            deck, num, side = parts
        decks.append(deck)
        nums.append(num)
        sides.append(side)

    cabin_parts = pd.DataFrame(
        {"Deck": decks, "Num": nums, "Side": sides},
        index=df.index,
    )
    return pd.concat([df, cabin_parts], axis=1)

In [None]:
train=addcolumns(train)
pred=addcolumns(pred)

In [None]:
#Seeing the new columns added
train.head()

In [None]:
# Cabin is not required from here onwards, so drop it from both datasets
for frame in (train, pred):
    frame.drop(columns='Cabin', inplace=True)

In [None]:
# PassengerId and Name are dropped from both datasets: they are unique values
# and don't have any pattern to learn from
for frame in (train, pred):
    frame.drop(columns=['PassengerId', 'Name'], inplace=True)


In [None]:
train.dtypes

Converting the object columns to numeric so the models can be fitted

In [None]:
#Deck distribution with respect to transportation
sns.countplot(x='Deck', data=train, hue='Transported')

In [None]:
#Using label encoder to convert category to numeric (Other methods: get_dummies, map)
le=LabelEncoder()

In [None]:
# HomePlanet and Destination: "null" (missing) is kept as a category of its own.
train["HomePlanet"] = le.fit_transform(train["HomePlanet"])
train["Destination"] = le.fit_transform(train["Destination"])

# VIP, CryoSleep and Transported: fill "null" with the most frequent value,
# then label-encode.
# The result is assigned back to the frame. Calling
# train[col].replace(..., inplace=True) mutates a temporary Series and does not
# update `train` when pandas Copy-on-Write is active.
for column in ("VIP", "CryoSleep", "Transported"):
    most_frequent = train[column].mode()[0]
    train[column] = le.fit_transform(train[column].replace('null', most_frequent))

# Age: turn the "null" marker back into NaN before the float conversion, so
# missing ages stay distinguishable from a real age of 0 and can be filled
# with the mean of the known ages.
train_age = train["Age"].replace('null', np.nan).astype("float")
train["Age"] = train_age.fillna(train_age.mean())


In [None]:
# Missing spending amounts ("null") are treated as 0.0 and the columns are
# converted to float. The result is assigned back to the frame rather than
# relying on train[col].replace(..., inplace=True), which does not update
# `train` when pandas Copy-on-Write is active.
#
# NOTE: Age is already float here, so replacing the string '0.0' on it can never
# match; that no-op has been removed. A mean imputation for Age has to be done
# before the float cast, while the "null" marker is still present.
for column in ("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"):
    train[column] = train[column].replace('null', '0.0').astype("float")

In [None]:
# Replace "null" with the column's mode and convert to numeric using LabelEncoder.
# The result is assigned back to the frame rather than relying on
# train[col].replace(..., inplace=True), which does not update `train` when
# pandas Copy-on-Write is active.
# NOTE(review): Num holds digit strings, so its label codes presumably follow
# string order rather than numeric order; confirm this is intended.
for column in ("Deck", "Num", "Side"):
    most_frequent = train[column].mode()[0]
    train[column] = le.fit_transform(train[column].replace('null', most_frequent))

In [None]:
# Apply the same cleaning steps to the prediction dataset.
# Every result is assigned back to the frame. pred[col].replace(..., inplace=True)
# mutates a temporary Series and does not update `pred` when pandas
# Copy-on-Write is active.
# NOTE(review): the encoder is refitted on `pred`, so a value gets the same code
# as in `train` only if both frames contain the same set of values in that
# column; verify, especially for Num.

# HomePlanet and Destination: "null" (missing) is kept as a category of its own.
for column in ("HomePlanet", "Destination"):
    pred[column] = le.fit_transform(pred[column])

# Fill "null" with the most frequent value, then label-encode.
for column in ("VIP", "CryoSleep", "Deck", "Num", "Side"):
    most_frequent = pred[column].mode()[0]
    pred[column] = le.fit_transform(pred[column].replace('null', most_frequent))

# Age: turn "null" back into NaN before the float conversion so missing ages
# stay distinguishable from a real age of 0, then fill them with the mean of
# the known ages. (Replacing the string '0.0' after the cast never matched.)
pred_age = pred["Age"].replace('null', np.nan).astype("float")
pred["Age"] = pred_age.fillna(pred_age.mean())

# Missing spending amounts are treated as 0.0.
for column in ("RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"):
    pred[column] = pred[column].replace('null', '0.0').astype("float")

In [None]:
#Viewing information about the training dataset after the cleaning
train.info()

In [None]:
#Viewing information about the prediction dataset after the cleaning
pred.info()

# **Feature Importance:**
1.     Using a heatmap to detect the correlation between features

In [None]:
# Columns used as model inputs; Transported is the target
feature_columns = [
    'HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VIP',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'Deck', 'Num', 'Side',
]
x = train[feature_columns]
y = train['Transported']

In [None]:
plt.figure(figsize=(40, 25))
sns.heatmap(train.corr(), annot=True)

In [None]:
#Distribution of Age
plt.figure(figsize=(20,8))
sns.histplot(train.Age)
plt.show()

In [None]:
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments.
sns.regplot(x='CryoSleep', y='Transported', data=train)

**From the plot above we can see that CryoSleep is positively correlated with Transported**

In [None]:
#Try this merge function for a chance of increasing the score. If it helps, please leave a comment ;)
def merges(df):
    """Return a new frame equal to ``df`` plus a 'Price Spent' total column.

    'Price Spent' is the row-wise sum
    ``FoodCourt + ShoppingMall + Spa + VRDeck + RoomService``.

    The total is computed as a Series on ``df``'s own index, so it lines up
    row-for-row with ``df``. Building it on a fresh 0..n-1 index would make
    ``pd.concat(axis=1)`` align on mismatching labels and add NaN-filled rows
    whenever ``df`` does not have a default index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the five spending columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    total_spent = (
        df["FoodCourt"]
        + df["ShoppingMall"]
        + df["Spa"]
        + df["VRDeck"]
        + df["RoomService"]
    )
    return pd.concat([df, total_spent.rename('Price Spent')], axis=1)

In [None]:
#pred=merges(pred)
#train=merges(train)

#train.drop(['RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck'], axis=1, inplace=True)
#pred.drop(['RoomService', 'FoodCourt','ShoppingMall','Spa','VRDeck'], axis=1, inplace=True)

#train.drop(['VIP', 'ShoppingMall'], axis=1, inplace=True)
#pred.drop(['VIP', 'ShoppingMall'], axis=1, inplace=True)

# **Preprocessing the data**

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Fit the scaler on the training split only and reuse it for the hold-out split
# and the prediction set. Fitting a separate scaler on each of them would scale
# every set by its own mean and standard deviation, so the model would see
# features on a different scale than the one it was trained on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Select the columns by name so the prediction features are in the training order.
# `pred` becomes a NumPy array here, so this cell must not be run a second time
# without re-running the cleaning cells first.
pred = scaler.transform(pred[x.columns])
train.dtypes

# **DIFFERENT ML MODELS AND THEIR ACCURACY SCORE**

# **Logistic Regression:**

In [None]:
# The default lbfgs solver only supports the l2 penalty, so every "l1" and
# "elasticnet" candidate failed to fit and was scored NaN by GridSearchCV.
# saga supports all three penalties; elasticnet additionally needs an l1_ratio,
# which is why it gets its own sub-grid.
lr = LogisticRegression(solver="saga", max_iter=1000)
shared_grid = {"tol": (0.1, 0.01, 0.001, 0.0001), "C": (10.0, 1.0, 0.1, 0.01)}
params = [
    {"penalty": ("l1", "l2"), **shared_grid},
    {"penalty": ("elasticnet",), "l1_ratio": (0.5,), **shared_grid},
]
modelLR = GridSearchCV(lr, params, cv=10)
modelLR.fit(X_train, y_train)

In [None]:
print(accuracy_score(modelLR.predict(X_test),y_test))

# **Cat Boost:**

In [None]:
modelCT=CatBoostClassifier(verbose = 0)
modelCT.fit(X_train,y_train)

In [None]:
print(accuracy_score(modelCT.predict(X_test),y_test))

# **Voting Classifier:**

In [None]:
# Candidate base models, keyed by a short name
models = {
    'catboost': CatBoostClassifier(verbose = 0),
    'gbc': GradientBoostingClassifier(),
    'ridge': RidgeClassifier(),
    'lr': LogisticRegression(),
}

# Soft voting uses three of the candidates ('ridge' is left out), weighted equally.
# VotingClassifier fits clones, so the instances in `models` stay unfitted.
estimators = [(name, models[name]) for name in ('catboost', 'gbc', 'lr')]
modelVC = VotingClassifier(estimators=estimators, voting='soft', weights=[1, 1, 1])
modelVC.fit(X_train, y_train)

In [None]:
print(accuracy_score(modelVC.predict(X_test),y_test))

# **LGBM Classifier:**

In [None]:
# `silent` is no longer an LGBMClassifier argument; verbose=-1 is the supported
# way to suppress LightGBM's training output.
modelLGBM = LGBMClassifier(max_depth=6, random_state=314, verbose=-1, metric='None', n_jobs=6)

modelLGBM.fit(X_train, y_train)

In [None]:
print(accuracy_score(modelLGBM.predict(X_test),y_test))

# **SVM:**

In [None]:
modelSVM=svm.SVC(kernel='rbf')
modelSVM.fit(X_train, y_train)

In [None]:
print(accuracy_score(modelSVM.predict(X_test),y_test))

# **Decision Tree Classifier:**

In [None]:
modelDTC=DecisionTreeClassifier(criterion="entropy")
modelDTC.fit(X_train, y_train)

In [None]:
print(accuracy_score(modelDTC.predict(X_test),y_test))

# **KNeighbors Classifier:**

In [None]:
n=KNeighborsClassifier(n_neighbors=3)
n.fit(X_train, y_train)

In [None]:
print(accuracy_score(n.predict(X_test),y_test))

In [None]:
# Fitted models in the order they appear in the report
fitted_models = {
    "Logistic Regression": modelLR,
    "Cat Boost": modelCT,
    "Voting Classifier": modelVC,
    "LGBM Classifier": modelLGBM,
    "SVM": modelSVM,
    "Decision Tree Classifier": modelDTC,
    "KNeighborsClassifier": n,
}

# One row per model with its accuracy on the hold-out split, lowest score first
report = pd.DataFrame({
    "Model": list(fitted_models),
    "Accuracy score": [
        accuracy_score(model.predict(X_test), y_test)
        for model in fitted_models.values()
    ],
})
report.sort_values(by="Accuracy score")

In [None]:
# Predict with the SVM model and turn the 1/0 labels into the strings
# "True"/"False" for the submission file
transportedpred = modelSVM.predict(pred)

transportedpredframe = pd.DataFrame(transportedpred, columns=['Transported'])
transportedpredframe['Transported'] = transportedpredframe['Transported'].replace(
    {1: "True", 0: "False"}
)

In [None]:
transportedpredframe

In [None]:
# Re-read the raw test file to get PassengerId back (it was dropped from `pred`)
pred1 = pd.read_csv("../input/spaceship-titanic/test.csv")

passe = pred1["PassengerId"]

# One-column frame holding the ids, ready to be joined with the predictions
passee = passe.to_frame()

In [None]:
sub=pd.concat([passee,transportedpredframe], axis=1)

In [None]:
sub.to_csv("submission.csv", index=False)

# **THANK YOU SO MUCH FOR VIEWING MY NOTEBOOK. YOUR FEEDBACK IS MORE IMPORTANT FOR MY IMPROVEMENT!**