# Daten-Analyse Spaceship Titanic

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score



# Read the training and test CSV files from the data folder into DataFrames.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df_test = pd.read_csv(filepath_or_buffer='data/test.csv')

In [None]:
# Preview the first five rows of the training data.
print(df.head(n=5))

In [None]:
# Print the (rows, columns) dimensions of the training data.
print((len(df.index), len(df.columns)))

In [None]:
# Print summary statistics using describe()'s default column selection.
print(df.describe(include=None, exclude=None))

In [None]:
# Show column dtypes and non-null counts.
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

In [None]:
# One-hot encode every object-typed column and wrap the dense result in a
# DataFrame whose column names come from the fitted encoder.
one_hot = OneHotEncoder()
df_cats = pd.DataFrame(
    one_hot.fit_transform(df.select_dtypes(include='object')).toarray(),
    columns=one_hot.get_feature_names_out(),
)
df_cats

In [None]:
# Replace the original object-typed columns with their one-hot encoded
# counterparts by dropping them and appending df_cats column-wise.
df = pd.concat(
    [df.drop(columns=df.select_dtypes(include='object').columns), df_cats],
    axis=1,
)
df

In [None]:
# Lift pandas' column display limit so previews show every column,
# then preview the first five rows.
pd.set_option("display.max_columns", None)
df.head(n=5)


In [None]:
# List every column that contains at least one NaN value.
# df.any() tests for truthy values, not for missing ones, so the mask must be
# built from df.isna() to actually select the columns with NaN.
df.columns[df.isna().any()].tolist()


In [None]:
# Fill the NaN values of each affected column with that column's mean.
# The columns are selected with df.isna().any(); the previous df.any() mask
# picked columns containing truthy values and would skip a column that holds
# only zeros and NaN.
for col in df.columns[df.isna().any()]:
    df[col] = df[col].fillna(df[col].mean())



In [None]:
# Report the number of missing values per column.
print(df.isna().sum())

In [None]:
# Separate the feature matrix from the "Transported" target column.
X = df.drop(columns='Transported')
y = df['Transported']

# Hold out 20% of the rows for evaluation, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
    stratify=y,
)

In [None]:
# Fit a random forest (50 trees, at most 50 leaves each) and score it on the
# held-out split. fit() returns the estimator itself, so it can be chained.
rf = RandomForestClassifier(
    n_estimators=50,
    max_leaf_nodes=50,
    random_state=42,
).fit(X_train, y_train)
y_pred = rf.predict(X_test)
# Store the (label, accuracy) pair and print it as a tuple.
Random_Forest = 'Accuracy score: ', accuracy_score(y_test, y_pred)
print(Random_Forest)

In [None]:
# Fit an AdaBoost classifier with 250 estimators and score it on the
# held-out split.
ada = AdaBoostClassifier(n_estimators=250, random_state=42)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)
# Keep the accuracy itself in ADA_Boost: print() returns None, so assigning
# its result stored None and the follow-up print emitted a stray "None".
ADA_Boost = accuracy_score(y_test, y_pred)
print('Accuracy score: ', ADA_Boost)

In [None]:
# Hard-voting ensemble built from the random forest and AdaBoost estimators,
# fitted on the training split and scored on the held-out split.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('rf', rf), ('ada', ada)],
    voting='hard')
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)
# Keep the accuracy itself in Voting_Classifier: print() returns None, so
# assigning its result stored None and the follow-up print emitted "None".
Voting_Classifier = accuracy_score(y_test, y_pred)
print('Accuracy score: ', Voting_Classifier)


In [None]:
# SVM classifier tuned with a 5-fold grid search.
# Features are standardized inside the pipeline so scaling is re-fitted on
# each cross-validation training fold.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
pipe = make_pipeline(StandardScaler(), SVC(random_state=42))
# Two sub-grids: linear kernels vary only C, RBF kernels vary C and gamma.
param_grid = [{'svc__C': [1, 10, 100, 1000], 'svc__kernel': ['linear']},
              {'svc__C': [1, 10, 100, 1000], 'svc__gamma': [0.001, 0.0001],
               'svc__kernel': ['rbf']}]
# The `iid` argument was removed from GridSearchCV in scikit-learn 0.24;
# passing it raises a TypeError on current releases, so it is omitted.
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
# Keep the accuracy itself in SVM_Classifier: print() returns None, so
# assigning its result stored None and the follow-up print emitted "None".
SVM_Classifier = accuracy_score(y_test, y_pred)
print('Accuracy score: ', SVM_Classifier)


In [None]:
# Prepare the test data so the trained classifiers can predict on it.
# One-hot encode the object-typed columns of the test set.
df_test_cats = df_test.select_dtypes(include='object')
one_hot = OneHotEncoder()
df_test_cats = one_hot.fit_transform(df_test_cats).toarray()
df_test_cats = pd.DataFrame(df_test_cats, columns=one_hot.get_feature_names_out())

# Replace the object-typed columns with their one-hot encoded counterparts.
df_test = df_test.drop(columns=df_test.select_dtypes(include='object').columns)
df_test = pd.concat([df_test, df_test_cats], axis=1)

# The encoder above is fitted on the test set alone, so its output columns do
# not match the training features. Align to the training feature layout:
# columns unseen during training are dropped, and training columns missing
# from the test set are added as all-zero indicators.
df_test = df_test.reindex(columns=X.columns, fill_value=0)
# Impute NaNs left in the test features with the corresponding training
# column means.
df_test = df_test.fillna(X.mean())
df_test

# NOTE(review): after the alignment above df_test holds only the training
# feature columns; `df_test.Id` below presumably needs an identifier column
# captured before encoding. Verify before re-enabling.
# # predict test data based on voting classifier
# y_pred = voting_clf.predict(df_test)
# print(y_pred)

# # create submission file
# submission = pd.DataFrame({'Id': df_test.Id, 'Transported': y_pred})
# submission.to_csv('submission.csv', index=False)
# submission.head()
