In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the training and test sets from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
train_df.info()
test_df.info()

# Visualise where the missing values are: one heatmap per data set, where each
# highlighted cell marks a missing entry.
for frame, title in (
    (train_df, 'Missing Values in Train Data'),
    (test_df, 'Missing Values in Test Data'),
):
    sns.heatmap(frame.isnull(), cbar=False, cmap='viridis')
    plt.title(title)
    plt.show()

# Impute missing values.
#
# Two rules are applied throughout:
#   * The result of fillna() is assigned back to the column. Calling
#     `df[col].fillna(..., inplace=True)` acts on an intermediate object; it is
#     deprecated in pandas 2.x and leaves the frame untouched under
#     Copy-on-Write.
#   * Every fill value is computed on the training set and reused for the test
#     set, so both frames are preprocessed identically and no test-set
#     statistic feeds into the pipeline.
#
# The spend columns are included because they are summed into a derived
# feature later on; for a column without missing values fillna() is a no-op.
NUMERIC_COLS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
CATEGORICAL_COLS = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

# Numeric features: fill with the training median.
for column in NUMERIC_COLS:
    fill_value = train_df[column].median()
    train_df[column] = train_df[column].fillna(fill_value)
    test_df[column] = test_df[column].fillna(fill_value)

# Categorical features: fill with the training mode (most frequent value).
for column in CATEGORICAL_COLS:
    fill_value = train_df[column].mode()[0]
    train_df[column] = train_df[column].fillna(fill_value)
    test_df[column] = test_df[column].fillna(fill_value)

In [None]:
# Derive a total-spend feature from the five amenity columns.
# DataFrame.sum(axis=1) skips NaN, so a single missing amount no longer turns
# the whole total into NaN the way chained `+` does; a row whose amounts are
# all missing gets a total of 0.
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train_df['TotalSpend'] = train_df[spend_cols].sum(axis=1)
test_df['TotalSpend'] = test_df[spend_cols].sum(axis=1)

# One-hot encode the categorical features.
# NOTE(review): 'Cabin' is encoded as-is; if it is high-cardinality this
# creates one column per distinct value — consider splitting it into coarser
# parts before encoding.
encoded_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
train_df = pd.get_dummies(train_df, columns=encoded_cols)
test_df = pd.get_dummies(test_df, columns=encoded_cols)

# Encoding the two frames separately produces different dummy columns whenever
# a category occurs in only one of them. Re-align the test frame to the
# training layout (minus the target): dummies missing from the test set are
# added as all-zero columns, and dummies seen only in the test set are dropped.
test_df = test_df.reindex(
    columns=[col for col in train_df.columns if col != 'Transported'],
    fill_value=0,
)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, roc_auc_score

# Build the feature matrix and target.
# Besides the target and the free-text 'Name' column, every remaining
# non-numeric column is excluded: the forest can only be fit on numeric or
# boolean features, and a leftover string column (e.g. a row identifier)
# would make fit() raise a conversion error.
X = (
    train_df
    .drop(columns=['Transported', 'Name'])
    .select_dtypes(include=['number', 'bool'])
)
y = train_df['Transported']

# Hold out 20% of the training data as a validation set (fixed seed so the
# split is reproducible).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a baseline random forest with default hyperparameters.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predict hard labels and positive-class probabilities on the validation set.
y_pred = rf.predict(X_val)
y_pred_proba = rf.predict_proba(X_val)[:, 1]

# Evaluate the baseline: per-class precision/recall/F1 and ROC AUC.
print(classification_report(y_val, y_pred))
print('AUC-ROC:', roc_auc_score(y_val, y_pred_proba))

# Hyperparameter tuning: exhaustive search over 3 x 4 x 3 = 36 combinations,
# each scored by ROC AUC with 3-fold cross-validation on the training split.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}
# n_jobs=-1 spreads the candidate fits over all CPU cores; the seeded
# estimator keeps the selected model the same as in a single-threaded run.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best model, refit by GridSearchCV on the whole training split.
best_rf = grid_search.best_estimator_

# Predict on the test set.
# The model must receive exactly the columns it was trained on, in the same
# order. Reindexing to X_train's columns drops everything that was not a
# training feature and adds any feature absent from the test frame as an
# all-zero column (the "category not present" value for a one-hot dummy).
test_features = test_df.reindex(columns=X_train.columns, fill_value=0)
test_pred = best_rf.predict(test_features)