In [3]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import os

# Check the current working directory
print("Current Working Directory:", os.getcwd())

# Load train.csv
train_data = pd.read_csv("C:/Users/kaurs/Downloads/hackathon/summer-school-24-25-hackathon/train.csv")

# Load test.csv
test_data = pd.read_csv("C:/Users/kaurs/Downloads/hackathon/summer-school-24-25-hackathon/test.csv")

# Handle missing values
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
train_data['CryoSleep'].fillna(False, inplace=True)
test_data['CryoSleep'].fillna(False, inplace=True)
train_data['VIP'].fillna(False, inplace=True)
test_data['VIP'].fillna(False, inplace=True)

# Convert boolean columns to numeric
train_data['CryoSleep'] = train_data['CryoSleep'].astype(int)
test_data['CryoSleep'] = test_data['CryoSleep'].astype(int)
train_data['VIP'] = train_data['VIP'].astype(int)
test_data['VIP'] = test_data['VIP'].astype(int)

# One-hot encode categorical columns: 'HomePlanet' and 'Cabin'
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit and transform on train_data
train_encoded = encoder.fit_transform(train_data[['HomePlanet', 'Cabin']])
train_encoded_df = pd.DataFrame(train_encoded, columns=encoder.get_feature_names_out(['HomePlanet', 'Cabin']))
train_data = pd.concat([train_data, train_encoded_df], axis=1)

# Transform on test_data
test_encoded = encoder.transform(test_data[['HomePlanet', 'Cabin']])
test_encoded_df = pd.DataFrame(test_encoded, columns=encoder.get_feature_names_out(['HomePlanet', 'Cabin']))
test_data = pd.concat([test_data, test_encoded_df], axis=1)

# Drop unnecessary columns
train_data.drop(columns=['Name', 'HomePlanet', 'Cabin'], inplace=True)
test_data.drop(columns=['Name', 'HomePlanet', 'Cabin'], inplace=True)

# One-hot encode categorical columns (if needed) in train_data and test_data for 'Destination' column
train_data = pd.get_dummies(train_data, columns=['Destination'])
test_data = pd.get_dummies(test_data, columns=['Destination'])

# Separate features (X) and target (y) for training data
X_train = train_data.drop(columns=['PassengerId', 'Transported'])
y_train = train_data['Transported']

# Separate features (X) for test data (assuming we have PassengerId for prediction)
X_test = test_data.drop(columns=['PassengerId'])

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_imputed, y_train)

# Make predictions on the training set to evaluate the model
y_train_pred = clf.predict(X_train_imputed)

# Evaluate the model's performance
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred))

# Make predictions on the test set
test_predictions = clf.predict(X_test_imputed)


submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Transported': test_predictions
})

# Convert boolean predictions to 'True' and 'False' strings
submission['Transported'] = submission['Transported'].astype(bool)

# Save the submission file
try:
    submission.to_excel("C:/Users/kaurs/Downloads/hackathon/summer-school-24-25-hackathon/submission.xlsx", index=False)
    print("\nSubmission file created successfully!")
except Exception as e:
    print(f"Error saving the file: {e}")

print("\nProcessed Training Data:")
print(X_train.head())

print("\nProcessed Testing Data:")
print(X_test.head())


Current Working Directory: C:\Users\kaurs
Training Accuracy: 0.9995398596571955

Confusion Matrix:
[[4312    3]
 [   1 4377]]

Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00      4315
        True       1.00      1.00      1.00      4378

    accuracy                           1.00      8693
   macro avg       1.00      1.00      1.00      8693
weighted avg       1.00      1.00      1.00      8693


Submission file created successfully!

Processed Training Data:
   CryoSleep   Age  VIP  RoomService  FoodCourt  ShoppingMall     Spa  VRDeck  \
0          0  39.0    0          0.0        0.0           0.0     0.0     0.0   
1          0  24.0    0        109.0        9.0          25.0   549.0    44.0   
2          0  58.0    1         43.0     3576.0           0.0  6715.0    49.0   
3          0  33.0    0          0.0     1283.0         371.0  3329.0   193.0   
4          0  16.0    0        303.0       70.0      