In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
# Load the Spaceship Titanic training split into the working frame.
dataset = pd.read_csv(filepath_or_buffer='/kaggle/input/spaceship-titanic/train.csv')

In [None]:
# Full per-column summary (dtype and non-null count) of the raw training frame.
# `verbose` is a boolean flag; the string 'True' only worked because any
# non-empty string is truthy (so would 'False').
dataset.info(verbose=True)

In [None]:
# Percentage of missing values per column, as a two-column frame:
# 'index' holds the column name, 'percentage' the share of null rows.
missing = (
    (dataset.isnull().sum() * 100 / dataset.shape[0])
    .rename('percentage')
    .reset_index()
)

plt.figure(figsize=(16, 5))
# seaborn >= 0.12 accepts x/y as keyword arguments only; passing them
# positionally raises a TypeError there.
sns.pointplot(x='index', y='percentage', data=missing)
plt.xticks(rotation=90, fontsize=7)
plt.title("Percentages of the missing values")
plt.ylabel("Percentage")
plt.show()

In [None]:
# Discard the PassengerId and Name columns from the training frame.
dataset = dataset.drop(columns=['PassengerId', 'Name'])

In [None]:
# Column summary after PassengerId and Name were removed.
dataset.info()

In [None]:
# NOTE(review): this repeats the previous cell's summary with no change to
# `dataset` in between; it looks redundant.
dataset.info()

In [None]:
# Keep only rows that are complete in the columns used further on.
# 'VIP' is deliberately not listed: that column is dropped before any model
# is trained, so discarding rows just because VIP is missing would throw
# away training data for nothing.
required_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'Age', 'VRDeck', 'Cabin']
dataset = dataset.dropna(subset=required_cols)

In [None]:
# Column summary after rows with missing values were removed.
dataset.info()

In [None]:
# One line plot per spending column, laid out on a 2x2 grid
# (row-major: Room Service, Food Court / Shopping Mall, Spa).
fig, axs = plt.subplots(2, 2)
plt.subplots_adjust(wspace=0.5, hspace=0.5)

panels = [
    ('RoomService', 'Room Service'),
    ('FoodCourt', 'Food Court'),
    ('ShoppingMall', 'Shopping Mall'),
    ('Spa', 'Spa'),
]
for panel_ax, (column, panel_title) in zip(axs.flat, panels):
    panel_ax.plot(dataset[column])
    panel_ax.set_title(panel_title)

plt.show()

In [None]:
# Count how many training rows have exactly 0 in each spending column.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa']

# Boolean masks (True where the value equals 0), one per column.
mask1, mask2, mask3, mask4 = (dataset[col].eq(0) for col in spend_cols)

# Summing a boolean mask counts its True entries.
count1, count2, count3, count4 = (
    mask.sum() for mask in (mask1, mask2, mask3, mask4)
)

print(count1, count2, count3, count4)

In [None]:
# Count plot, split by the Transported label, for every column that remains
# once the numeric/spending columns, Cabin and the label itself are set aside.
categorical_frame = dataset.drop(
    columns=['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
             'VRDeck', 'Age', 'Transported', 'Cabin']
)
for fig_num, column in enumerate(categorical_frame.columns):
    plt.figure(fig_num)
    sns.countplot(data=dataset, x=column, hue='Transported')

In [None]:
# VIP is not used as a feature; remove it from the training frame.
dataset = dataset.drop(columns=['VIP'])

In [None]:
# Preview the first rows of the training frame.
dataset.head()

In [None]:
# Column summary of the training frame at this stage.
dataset.info()

In [None]:
# Treat a missing amount in these spending columns as "nothing spent" (0).
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa']
dataset[spend_cols] = dataset[spend_cols].fillna(0)

In [None]:
# Column summary after the spending columns were zero-filled.
dataset.info()

In [None]:
# Map each home planet name to a fixed integer code.
home_planet_codes = {"Earth": 0, "Europa": 1, "Mars": 2}
dataset['HomePlanet'] = dataset['HomePlanet'].map(home_planet_codes)

# Replace each distinct Cabin string by its pandas category code.
# NOTE(review): these codes are derived from the training rows only; the test
# frame builds its own codes separately, so the same number may stand for
# different cabins in train and test — confirm this is acceptable.
categories = dataset['Cabin'].unique()
dataset['Cabin'] = dataset['Cabin'].astype("category").cat.codes

In [None]:
# Cast the CryoSleep feature and the Transported label to integers.
for flag_col in ('CryoSleep', 'Transported'):
    dataset[flag_col] = dataset[flag_col].astype(int)

In [None]:
# Map each destination name to a fixed integer code.
categories = {
    "55 Cancri e": 0,
    "PSO J318.5-22": 1,
    "TRAPPIST-1e": 2,
}
dataset['Destination'] = dataset['Destination'].map(categories)

In [None]:
# Preview the first 15 rows after the categorical columns were encoded.
dataset.head(15)

In [None]:
from sklearn.preprocessing import MinMaxScaler

LABEL = 'Transported'
y = dataset[LABEL]

# Build the feature frame under its own name instead of rebinding `dataset`.
# Overwriting `dataset` without the label made this cell fail with a KeyError
# on `dataset[LABEL]` whenever it was run a second time.
features = dataset.drop(columns=LABEL)

# Rescale every feature column to the [0, 1] range seen in the training rows,
# then wrap the resulting array back into a frame with the original column
# names and row index.
scaler = MinMaxScaler()
scaled_train = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
    index=features.index,
)


In [None]:
# Preview the scaled feature frame.
scaled_train.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# split — and with it every score reported below — is the same on each run.
x_train, x_test, y_train, y_test = train_test_split(
    scaled_train, y, test_size=0.2, random_state=42
)
print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

# Seeded forest on all features; n_jobs=-1 only parallelises tree building
# and does not change the fitted model.
clf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)

clf.fit(x_train, y_train)

In [None]:
# Rank the features by the fitted forest's importance scores, highest first.
feature_imp = (
    pd.Series(clf.feature_importances_, index=scaled_train.columns)
    .sort_values(ascending=False)
)

# Horizontal bar chart: one bar per feature, length = importance score.
fig, importance_ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=feature_imp, y=feature_imp.index, ax=importance_ax)
importance_ax.set_xlabel('Feature Importance Score')
importance_ax.set_ylabel('Features')
importance_ax.set_title("Visualizing Important Features")
plt.tight_layout()

In [None]:
# Predict labels for the held-out split with the fitted classifier.
y_pred=clf.predict(x_test)

In [None]:
# Share of held-out rows whose predicted label matches the true one.
holdout_accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {holdout_accuracy}")

In [None]:
# Second model: the same forest configuration, trained without the
# Destination and HomePlanet features.
new_train = scaled_train.drop(columns=['Destination', 'HomePlanet'])

# Fixed seeds make the split and the fitted forest reproducible, so the
# accuracy printed below does not drift from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(
    new_train, y, test_size=0.2, random_state=42
)
# n_jobs=-1 only parallelises tree building; it does not change the model.
clf = RandomForestClassifier(n_estimators=1000, random_state=42, n_jobs=-1)

clf.fit(X_train, Y_train)

In [None]:
# Evaluate the reduced-feature model on its held-out split.
Y_pred = clf.predict(X_test)
reduced_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print(f"   Accuracy: {reduced_accuracy}")

In [None]:
# Per-class precision / recall / F1 for the reduced-feature model.
print(classification_report(Y_test, Y_pred))

# Rows are the true labels, columns the predicted labels.
conf_matrix = confusion_matrix(Y_test, Y_pred)

plt.figure(figsize=(8, 5))
# fmt='d' prints the integer counts as they are; the default '.2g' format
# renders any count of 100 or more in scientific notation (e.g. 6.5e+02).
sns.heatmap(conf_matrix, annot=True, fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix')
plt.tight_layout()


In [None]:
# Load the Spaceship Titanic test split that predictions are submitted for.
test_dataset = pd.read_csv(filepath_or_buffer='/kaggle/input/spaceship-titanic/test.csv')

In [None]:
# Column summary of the raw test frame.
test_dataset.info()

In [None]:
# Preview the first rows of the raw test frame.
test_dataset.head()

In [None]:
# Remove every column that the final (reduced-feature) model was not
# trained on, so the test frame has the same feature columns.
unused_test_cols = ['PassengerId', 'HomePlanet', 'Destination', 'VIP', 'Name']
test_dataset = test_dataset.drop(columns=unused_test_cols)

In [None]:
# Preview the test frame after the unused columns were removed.
test_dataset.head()


In [None]:
# Replace each distinct Cabin string in the test frame by its pandas
# category code (a missing Cabin gets pandas' sentinel code -1).
# NOTE(review): the codes are computed from the test rows alone, independent
# of the codes assigned to the training cabins, so the model sees differently
# numbered cabins at prediction time — confirm or switch to a shared encoding.
categories = test_dataset['Cabin'].unique()
test_dataset['Cabin'] = test_dataset['Cabin'].astype("category").cat.codes

In [None]:
# Preview the test frame after Cabin was replaced by integer codes.
test_dataset.head()

In [None]:
# Count plot for every test column other than Cabin, Age and the
# spending/amenity columns.
remaining_frame = test_dataset.drop(
    columns=['Cabin', 'Age', 'RoomService', 'Spa',
             'FoodCourt', 'ShoppingMall', 'VRDeck']
)
for fig_num, column in enumerate(remaining_frame.columns):
    plt.figure(fig_num)
    sns.countplot(data=test_dataset, x=column)

In [None]:
# Treat a missing CryoSleep flag as False.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the selection `test_dataset['CryoSleep']` is chained assignment, which is
# deprecated and leaves `test_dataset` untouched under pandas copy-on-write.
test_dataset['CryoSleep'] = test_dataset['CryoSleep'].fillna(False)

In [None]:
# Cast CryoSleep to integers (missing values were filled with False in the
# preceding step, so the cast has no NaN to trip over).
test_dataset['CryoSleep'] = test_dataset['CryoSleep'].astype(int)

In [None]:
# Preview the test frame after CryoSleep was converted to integers.
test_dataset.head()

In [None]:
# Column summary of the test frame; shows which columns still have nulls.
test_dataset.info()

In [None]:
# Count how many test rows have exactly 0 in each spending/amenity column.
test_spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Boolean masks (True where the value equals 0), one per column.
mask1, mask2, mask3, mask4, mask5 = (
    test_dataset[col].eq(0) for col in test_spend_cols
)

# Summing a boolean mask counts its True entries.
count1, count2, count3, count4, count5 = (
    mask.sum() for mask in (mask1, mask2, mask3, mask4, mask5)
)

print(count1, count2, count3, count4, count5)

In [None]:
# Treat a missing amount in the spending/amenity columns as 0.
test_spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_dataset[test_spend_cols] = test_dataset[test_spend_cols].fillna(0)

In [None]:
# Column summary after the spending/amenity columns were zero-filled.
test_dataset.info()

In [None]:
# Impute missing ages with the mean age of the test rows.
# The filled column is assigned back explicitly: fillna(inplace=True) on the
# selection `test_dataset['Age']` is chained assignment, which is deprecated
# and does not modify `test_dataset` under pandas copy-on-write.
mean_c = test_dataset['Age'].mean()
test_dataset['Age'] = test_dataset['Age'].fillna(mean_c)

In [None]:
# Final column summary of the test frame before prediction.
test_dataset.info()

In [None]:
# The classifier was trained on MinMax-scaled features, so the raw test
# features have to go through the same fitted scaling before prediction;
# feeding them unscaled puts almost every value outside the range the trees
# were split on.
# The scaler was fitted on more columns than the final model uses, so pick
# its per-column parameters by name and apply the MinMaxScaler transform
# directly: X_scaled = X * scale_ + min_.
feature_cols = list(clf.feature_names_in_)
fitted_cols = list(scaler.feature_names_in_)
col_idx = [fitted_cols.index(col) for col in feature_cols]
test_scaled = test_dataset[feature_cols] * scaler.scale_[col_idx] + scaler.min_[col_idx]

Test_pred = clf.predict(test_scaled)

In [None]:
# Turn the predicted labels into booleans and show them.
Test_pred = np.asarray(Test_pred, dtype=bool)
print(Test_pred)

In [None]:
# Number of predictions produced (one per test row).
Test_pred.shape

In [None]:
# Load the sample submission file to reuse its layout.
submission_dataset = pd.read_csv(
    filepath_or_buffer='/kaggle/input/spaceship-titanic/sample_submission.csv'
)

In [None]:
# Preview the sample submission layout.
submission_dataset.head()

In [None]:
# Remove the sample file's Transported column; the model's predictions are
# attached in its place further down.
submission_dataset = submission_dataset.drop(columns=['Transported'])

In [None]:
# Wrap the prediction array in a single-column frame named 'Transported'.
pred_df = pd.Series(Test_pred, name='Transported').to_frame()

In [None]:
# Put the predictions next to the remaining submission columns; rows are
# matched on the index of the two frames.
submission_dataset = pd.concat(
    [submission_dataset, pred_df],
    axis='columns',
)

In [None]:
# Preview the submission frame with the predicted Transported column.
submission_dataset.head()

In [None]:
# Full per-column summary of the submission frame (checks for nulls/dtypes).
# `verbose` is a boolean flag; the string 'True' only worked because any
# non-empty string is truthy.
submission_dataset.info(verbose=True)

In [None]:
# Write the submission file without the row index.
# `index` is a boolean option: False is the documented way to omit the index,
# whereas None merely happened to be treated as falsy.
submission_dataset.to_csv('submission.csv', index=False)