### Import Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

### Load Data

In [None]:
# Read the training CSV into a DataFrame; the bare name on the last line
# makes the notebook render the frame as the cell output.
train = pd.read_csv(
    filepath_or_buffer='/share/dutta/eyao/dataset/kaggle/spaceship-titanic/train.csv',
)
train

In [None]:
# Read the test CSV into a DataFrame; the bare name on the last line
# makes the notebook render the frame as the cell output.
test = pd.read_csv(
    filepath_or_buffer='/share/dutta/eyao/dataset/kaggle/spaceship-titanic/test.csv',
)
test

### Data Preprocessing

In [None]:
# Learn the median of the training 'Age' column, then use it to replace the
# missing ages. The fitted imputer stays in scope so the same training median
# can be applied to the test set.
imputer = SimpleImputer(strategy='median')
imputer.fit(train[['Age']])
train[['Age']] = imputer.transform(train[['Age']])

### Fill missing values

In [None]:
# Series.fillna returns a new Series and leaves the column untouched, so each
# result must be assigned back for the fill to actually take effect.
# 'Age' has already been median-imputed; this line is only a safeguard.
train['Age'] = train['Age'].fillna(train['Age'].median())
# Missing home planets get their own explicit 'Unknown' category.
train['HomePlanet'] = train['HomePlanet'].fillna('Unknown')
# Missing flags are treated as False.
train['CryoSleep'] = train['CryoSleep'].fillna(False)
# 'VIP' is filled the same way as in the test-set preprocessing so that both
# frames are encoded consistently.
train['VIP'] = train['VIP'].fillna(False)

### One-Hot Encoding for Categorical Features

In [None]:
# Columns to one-hot encode; the same list is reused for the test set.
categorical_cols = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
]
# Replace each listed column with one indicator column per category value.
train = pd.get_dummies(data=train, columns=categorical_cols)

### Feature Engineering

In [None]:
# Row-wise sum of the five listed spending columns (NaN entries are skipped
# by pandas' default skipna=True, so they contribute nothing to the total).
train['Total_Billed'] = train[
    ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
].sum(axis='columns')

### Features and Target

In [None]:
# Feature matrix: everything except the identifier/text columns and the label.
X = train.drop(columns=['PassengerId', 'Name', 'Cabin', 'Transported'])
# Target vector: the 'Transported' label cast to integers.
y = train['Transported'].astype(int)

### Check for any remaining NaN values and handle them

In [None]:
# Replace any NaN still present in the feature matrix with 0.
X = X.fillna(0)

### Split Data

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Model Training

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training split.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X=X_train, y=y_train)

### Validation

In [None]:
# Predict on the held-out validation rows and report plain accuracy.
y_pred = model.predict(X_val)
print(
    f"Validation Accuracy: {accuracy_score(y_val, y_pred)}"
)

### Test Prediction

### Data Preprocessing for Test Data

In [None]:
# Reuse the imputer fitted on the training data so test ages are filled with
# the training median rather than a statistic computed from the test set.
test[['Age']] = imputer.transform(test[['Age']])
# Series.fillna returns a new Series and leaves the column untouched, so each
# result must be assigned back for the fill to actually take effect.
test['HomePlanet'] = test['HomePlanet'].fillna('Unknown')
test['CryoSleep'] = test['CryoSleep'].fillna(False)
test['VIP'] = test['VIP'].fillna(False)

### One-Hot Encoding for Categorical Features (Test Data)

In [None]:
# One-hot encode the same categorical columns that were encoded for training.
test = pd.get_dummies(data=test, columns=categorical_cols)

### Feature Engineering for Test Data

In [None]:
# Row-wise sum of the five listed spending columns (NaN entries are skipped
# by pandas' default skipna=True, so they contribute nothing to the total).
test['Total_Billed'] = test[
    ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
].sum(axis='columns')

### Align test data with train data columns

In [None]:
# Build the test feature matrix in three steps:
#   1. drop the identifier/text columns that were also excluded from X,
#   2. reindex to X's columns so the layout matches what the model was fitted
#      on (columns absent from the test frame are created and filled with 0,
#      columns not in X are discarded),
#   3. replace any NaN values that remain with 0.
X_test = (
    test.drop(columns=['PassengerId', 'Name', 'Cabin'])
    .reindex(columns=X.columns, fill_value=0)
    .fillna(0)
)

### Test predictions

In [None]:
# Predicted labels for the aligned test features.
test_preds = model.predict(X=X_test)

### Create Submission

In [None]:
# Assemble the submission: one row per test passenger with its predicted label.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    # The model was fitted on 'Transported' cast to int, so predict() yields
    # 0/1. Cast back to bool so the output uses the same True/False form as
    # the source column.
    # NOTE(review): the competition's sample submission presumably uses
    # True/False as well - confirm against sample_submission.csv.
    'Transported': test_preds.astype(bool),
})
# index=False keeps the DataFrame's row index out of the CSV.
submission.to_csv('submission.csv', index=False)