In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Show the full path of every file available under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in file_names]
    if full_paths:
        print('\n'.join(full_paths))

In [None]:
# Load the training split of the Spaceship Titanic competition data.
train_data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train_data.info()
train_data.head()

In [None]:
# Load the test split of the Spaceship Titanic competition data.
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
test_data.info()
test_data.head()

In [None]:
# Pull the target out of the frame, then remove it together with the
# identifier columns so only feature columns remain.
y = train_data['Transported']
train_data = train_data.drop(columns=['PassengerId', 'Name', 'Transported'])

# Missing-value count per remaining column, largest first.
train_data.isna().sum().sort_values(ascending=False)

In [None]:
# Columns whose missing entries are replaced by 0.
zero_fill_cols = ['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_data[zero_fill_cols] = train_data[zero_fill_cols].fillna(0)

# Re-check the remaining missing values, largest count first.
train_data.isna().sum().sort_values(ascending=False)

In [None]:
# Represent the target and the two flag columns as integers.
y = y.astype(int)
train_data = train_data.astype({'VIP': int, 'CryoSleep': int})

In [None]:
# Split "Cabin" on "/" into its three components and drop the original column.
# The guard wraps the whole step: previously only the drop was protected, but a
# missing "Cabin" column (e.g. when this cell is run a second time) already
# raised KeyError on the split line, so the fallback message was unreachable.
if 'Cabin' in train_data.columns:
    train_data[["Deck", "Cabin_num", "Side"]] = train_data["Cabin"].str.split("/", expand=True)
    train_data = train_data.drop(columns='Cabin')
else:
    print("Field does not exist")

train_data.head(5)

In [None]:
# Missing-value count per column after the Cabin split, largest first.
train_data.isna().sum().sort_values(ascending=False)

In [None]:
# Summary of the cabin-number component that was split out of "Cabin".
train_data.Cabin_num.describe()

In [None]:
# The cabin number is not used as a feature; remove it.
train_data = train_data.drop(columns='Cabin_num')

In [None]:
from sklearn.model_selection import train_test_split
# Split features and target into a training and a validation part; the fixed
# random_state makes the split the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    y,
    random_state=1,
)

In [None]:
# Numeric feature columns. select_dtypes(include='number') accepts every
# integer/float width; the previous hard-coded ['int64', 'float64'] check
# silently dropped columns on platforms where astype(int) produces int32.
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()

# Text-valued (object dtype) feature columns.
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Categorical columns whose validation values all occur in the training split.
# These are the columns handed to the one-hot encoder downstream.
good_label_cols = [col for col in object_cols if
                   set(X_valid[col]).issubset(set(X_train[col]))]

print('Categorical columns that will be encoded:', good_label_cols)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score

In [None]:
# Numeric columns: fill missing values with the imputer's default constant.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, good_label_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [None]:
# Baseline: preprocessing followed by a random forest.
model = RandomForestClassifier(n_estimators=100, random_state=0)
baseline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
clf = Pipeline(steps=baseline_steps)

# 3-fold cross-validation over the full training frame.
clf_score = cross_val_score(clf, train_data, y, cv=3)

# Hold-out evaluation: fit on the training part, score on the validation part.
clf.fit(X_train, y_train)
score = clf.score(X_valid, y_valid)

print('Score:', score)
print('Cross Val Score:', clf_score.mean())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn.svm import SVC

# Candidate classifiers compared in the following cells.
lr = LogisticRegression(random_state=0)

rfc = RandomForestClassifier(n_estimators=50, random_state=0)

# Seeded like lr and rfc so the tree (and therefore its reported accuracy)
# is the same on every run; it was previously left unseeded.
dtc = DecisionTreeClassifier(random_state=0)

gnb = GaussianNB()

knn = neighbors.KNeighborsClassifier()

svc = SVC(kernel='linear')

In [None]:
# Inspect the hyper-parameters of the SVC instance (nested parameters included).
svc.get_params(deep=True)

In [None]:
# Preview the first five rows of the training features.
X_train.head(5)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Candidate estimators and, by position, the label printed for each of them.
models = [lr, dtc, rfc, gnb, knn, svc]
pipe_dict = {0: 'Logistic Regression ', 1: 'Decision Tree Classifier ', 2: 'Random Forest Classifier ', 3: 'GaussianNB '
             , 4: 'KNN ', 5: 'SVC '}

# The loop variable is deliberately not called `model`, so the global `model`
# defined with the random-forest baseline is not rebound by this cell.
for i, candidate in enumerate(models):
    pipeline = Pipeline([('preprocessor', preprocessor),
                        ('scalar', StandardScaler()),
                        ('model', candidate)])

    # 3-fold cross-validation over the full training frame.
    cv_score = cross_val_score(pipeline, train_data, y, cv=3)
    # Hold-out evaluation on the validation part.
    pipeline.fit(X_train, y_train)

    print("{}Test Accuracy {}".format(pipe_dict[i], pipeline.score(X_valid, y_valid)))
    # This line reports the mean cross-validation score; it used to be
    # labelled "Test Accuracy" as well, making the two numbers indistinguishable.
    print("{}Cross Val Accuracy {}\n".format(pipe_dict[i], np.mean(cv_score)))

In [None]:
from sklearn.model_selection import GridSearchCV

# Parameter grid for the SVC search, split per kernel family so that only
# parameters a kernel actually uses are varied:
#   * 'degree' applies to the polynomial kernel only, which is not searched,
#     so it is left out entirely;
#   * 'gamma' is not used by the linear kernel, so it is varied for
#     'rbf' and 'sigmoid' only.
# The previous single grid (C x gamma x degree x kernel) evaluated 240
# candidates, most of them duplicates that differed only in an ignored
# parameter; this grid covers the same distinct models with 36 candidates.
svc_params = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 100],
    },
    {
        'kernel': ['rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 0.1, 0.01, 0.001],
    },
]

# Exhaustive search with 3-fold cross-validation, ranked by accuracy.
svc_search = GridSearchCV(estimator=svc,
                          param_grid=svc_params,
                          cv=3, verbose=2, scoring="accuracy")

In [None]:
# Preprocess and scale the full training frame, then run the SVC grid search
# as the final pipeline step.
# NOTE(review): with the search as the last step, the preprocessor and scaler
# are fitted on all rows before the search makes its own CV splits - confirm
# this is acceptable, or search over the whole pipeline instead.
svc_steps = [
    ('preprocessor', preprocessor),
    ('scalar', StandardScaler()),
    ('model', svc_search),
]
svc_pipe = Pipeline(steps=svc_steps)
svc_pipe.fit(train_data, y)

In [None]:
# Best mean cross-validated score found by the search, followed by the
# parameter combination that produced it.
best_cv_score = svc_search.best_score_
print('Score: ', best_cv_score)
svc_search.best_params_

In [None]:
# Use the best estimator found by the SVC grid search as the final model.
final_model = svc_search.best_estimator_

In [None]:
# Disabled random-forest grid search, kept for reference. It is written as
# comments rather than a bare triple-quoted string, which the notebook would
# otherwise echo as this cell's output.
#
# rfc_params = {
#     'n_estimators': [5, 10, 30, 50, 100, 130 , 150],
#     'max_depth': [10, 20, 30, 50],
#     'min_samples_leaf': [2, 5, 10, 20],
#     'bootstrap': [False],
#     'max_features': ['sqrt', 'log2'],
#     'min_samples_split': [2, 10, 15, 20, 30]
# }
#
# rfc_search = GridSearchCV(estimator=rfc,
#                            param_grid=rfc_params,
#                            cv=3, verbose=1, scoring = "accuracy")

In [None]:
# rfc_pipe = Pipeline([('preprocessor', preprocessor),
#                    ('scalar', StandardScaler()),
#                       ('model', rfc_search)])
# rfc_pipe.fit(X_train, y_train)

In [None]:
# print('Score: ', rfc_search.best_score_)
# rfc_search.best_params_

In [None]:
# Final pipeline: preprocessing, scaling, then the selected model.
final_steps = [
    ('preprocessor', preprocessor),
    ('scalar', StandardScaler()),
    ('model', final_model),
]
final_pipe = Pipeline(steps=final_steps)

# Fit on the training part and report accuracy on the validation part.
final_pipe.fit(X_train, y_train)
final_pipe.score(X_valid, y_valid)

In [None]:
# Keep the passenger ids for the submission before removing them from the features.
Id = test_data['PassengerId']
test_data = test_data.drop(columns=['PassengerId', 'Name'])

# Apply the same cleaning steps that were applied to the training frame:
# zero-fill, integer flags, Cabin split, and removal of Cabin / Cabin_num.
test_zero_fill_cols = ['VIP', 'CryoSleep', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_data[test_zero_fill_cols] = test_data[test_zero_fill_cols].fillna(0)
test_data = test_data.astype({'VIP': int, 'CryoSleep': int})

test_data[["Deck", "Cabin_num", "Side"]] = test_data["Cabin"].str.split("/", expand=True)
test_data = test_data.drop(columns=['Cabin', 'Cabin_num'])

# Refit on the complete training frame, then predict the test set as booleans.
final_pipe.fit(train_data, y)
predictions = final_pipe.predict(test_data).astype(bool)
print(predictions)

In [None]:
# Assemble the submission table (one row per test passenger) and write it to disk.
submission_columns = {
    'PassengerId': Id,
    'Transported': predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")