In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory.
# Uncomment the snippet below (then click run or press Shift+Enter) to list
# every file under the input directory.
# It is kept as `#` comments rather than a bare triple-quoted string: a string
# literal that ends up as the last expression of a cell is echoed back as the
# cell's output.
#
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All".
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session.

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
# NOTE(review): the path is relative to the current working directory —
# confirm it matches where the dataset is actually mounted.
train_df = pd.read_csv(filepath_or_buffer='./kaggle/input/dataset-a/train.csv')
train_df.head(n=5)

In [None]:
# Read the test CSV into a DataFrame and preview its first five rows.
# NOTE(review): the path is relative to the current working directory —
# confirm it matches where the dataset is actually mounted.
test_df = pd.read_csv(filepath_or_buffer='./kaggle/input/dataset-a/test.csv')
test_df.head(n=5)

In [None]:
# Per-column count of missing entries in each frame.
missing_values_train = train_df.isna().sum()
missing_values_test = test_df.isna().sum()

# Last expression of the cell: display both count Series together.
(missing_values_train, missing_values_test)

In [None]:
# End-to-end baseline: load the raw CSVs, impute missing values, one-hot encode
# the categorical columns, evaluate a random forest (5-fold CV + hold-out),
# refit it on all labelled rows, and write the submission file.
from pathlib import Path

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score

# --- Configuration -----------------------------------------------------------
TRAIN_CSV = './kaggle/input/dataset-a/train.csv'
TEST_CSV = './kaggle/input/dataset-a/test.csv'
SUBMISSION_CSV = Path('./kaggle/working/submission.csv')

ID_COLUMN = 'PassengerId'      # row key written to the submission file
TARGET_COLUMN = 'Transported'  # label column, present in the training CSV only
NUMERIC_COLUMNS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
CATEGORICAL_COLUMNS = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
SEED = 42

# --- Load --------------------------------------------------------------------
# Re-read the raw files so this cell can be re-run safely even though it
# overwrites columns of train_df / test_df below. 'Name' is never used as a
# feature, so it is dropped up front instead of being imputed and then dropped.
train_df = pd.read_csv(TRAIN_CSV).drop(columns=['Name'])
test_df = pd.read_csv(TEST_CSV).drop(columns=['Name'])

# --- Impute ------------------------------------------------------------------
# Imputers learn their fill values from the training data only and are then
# applied unchanged to the test data.
num_imputer = SimpleImputer(strategy='median')
train_df[NUMERIC_COLUMNS] = num_imputer.fit_transform(train_df[NUMERIC_COLUMNS])
test_df[NUMERIC_COLUMNS] = num_imputer.transform(test_df[NUMERIC_COLUMNS])

cat_imputer = SimpleImputer(strategy='most_frequent')
train_df[CATEGORICAL_COLUMNS] = cat_imputer.fit_transform(train_df[CATEGORICAL_COLUMNS])
test_df[CATEGORICAL_COLUMNS] = cat_imputer.transform(test_df[CATEGORICAL_COLUMNS])

# --- Encode ------------------------------------------------------------------
# Train and test rows are stacked so they share one column layout; the first
# len(train_df) rows of combined_df are the training rows.
combined_df = pd.concat([train_df.drop(columns=[TARGET_COLUMN]), test_df], ignore_index=True)

# The encoder learns its categories from the training rows only. Categories
# that appear solely in the test rows encode as all zeros thanks to
# handle_unknown='ignore', so no test-set information shapes the feature space.
# NOTE(review): 'Cabin' is one-hot encoded as-is; if it has many distinct
# values this yields a very wide dense matrix — consider a coarser encoding.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(train_df[CATEGORICAL_COLUMNS])
encoded_columns = ohe.transform(combined_df[CATEGORICAL_COLUMNS])
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=ohe.get_feature_names_out(),
    index=combined_df.index,
)

combined_df = pd.concat([combined_df.drop(columns=CATEGORICAL_COLUMNS), encoded_df], axis=1)

# --- Feature matrices --------------------------------------------------------
# The row identifier is excluded from the predictors: it is the key written to
# the submission, and letting the forest split on it invites memorising rows.
feature_df = combined_df.drop(columns=[ID_COLUMN])
n_train_rows = len(train_df)
X_train = feature_df.iloc[:n_train_rows]
X_test = feature_df.iloc[n_train_rows:]
y_train = train_df[TARGET_COLUMN]
assert len(X_train) == len(y_train), 'features and labels are misaligned'
assert len(X_test) == len(test_df), 'test features and test rows are misaligned'

# --- Evaluate ----------------------------------------------------------------
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=SEED
)

rf = RandomForestClassifier(n_estimators=100, random_state=SEED)

# cross_val_score fits clones of `rf`, so it neither needs nor alters a fitted model.
cv_scores = cross_val_score(rf, X_train_split, y_train_split, cv=5)
print(f'Cross-validation accuracy: {np.mean(cv_scores):.4f}')

rf.fit(X_train_split, y_train_split)
val_predictions = rf.predict(X_val)
val_accuracy = np.mean(val_predictions == y_val)
print(f'Validation accuracy: {val_accuracy:.4f}')

# --- Final fit & submission --------------------------------------------------
# Refit on every labelled row before predicting the test set, so the submitted
# model does not throw away the 20% that was held out for validation.
rf.fit(X_train, y_train)
test_predictions = rf.predict(X_test)

submission_df = pd.DataFrame({'PassengerId': test_df[ID_COLUMN], 'Transported': test_predictions})
submission_df['Transported'] = submission_df['Transported'].astype(bool)

# to_csv does not create missing directories, so make sure the target exists.
SUBMISSION_CSV.parent.mkdir(parents=True, exist_ok=True)
submission_df.to_csv(SUBMISSION_CSV, index=False)