### Model Selection and Submission

In [4]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Load the labeled training set and the unlabeled test set.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Drop the columns that are not used as model inputs. The raw `test_data` is left
# untouched (its PassengerId column is read again when the submission is built);
# the model-ready copy is `test_data_1`.
train_data = train_data.drop(columns=['Name', 'PassengerId', 'Cabin'])
test_data_1 = test_data.drop(columns=['Name', 'PassengerId', 'Cabin'])

# Separate features and target
features = train_data.drop(columns=['Transported'])
labels = train_data['Transported']

# Split the feature columns into two groups that together cover every column.
# 'number' matches any numeric dtype (not just int64/float64), and everything
# else (object, bool, category, ...) is treated as categorical. A column listed in
# neither group would be dropped by a ColumnTransformer's default remainder='drop',
# so the two selections are written as exact complements.
numerical_cols = features.select_dtypes(include='number').columns
categorical_cols = features.select_dtypes(exclude='number').columns

# Numerical branch: fill missing values with the column median, then apply
# StandardScaler.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on a
# category that was not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Send each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# End-to-end estimator: preprocessing followed by a seeded random forest.
# The step names 'preprocessor' and 'classifier' are what parameter names such as
# 'classifier__n_estimators' refer to.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=40)),
])

# Hyperparameter grid for the random forest. The 'classifier__' prefix routes each
# parameter to the pipeline step named 'classifier'.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30]
}

# Exhaustive grid search with 10-fold cross-validation, scored by accuracy.
# n_jobs=-1 spreads the 120 independent fits (12 candidates x 10 folds) over all
# available CPU cores instead of running them one after another.
grid_search = GridSearchCV(pipeline, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Split the data into training and validation sets (80% train, 20% validation).
# The validation rows are never seen by the grid search, so they give an
# independent check of the selected model.
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.2, random_state=42)

# Run grid search (with the default refit=True the best candidate is refitted on
# all of X_train once the search finishes).
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate the model with the best parameters on the validation set
y_pred = grid_search.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print(f'Validation Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')

# For the test-set predictions, refit the selected configuration on every
# labeled row. grid_search itself only ever saw the 80% training split, so
# predicting with it directly would leave the validation rows unused. clone()
# returns an unfitted copy with the same parameters, which keeps
# grid_search.best_estimator_ (and the metrics printed above) unchanged.
final_model = clone(grid_search.best_estimator_)
final_model.fit(features, labels)

# Preprocess and predict on the test set using test_data_1
test_predictions = final_model.predict(test_data_1)

Best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
Best cross-validation score: 0.80
Validation Accuracy: 0.7872340425531915
Classification Report:
              precision    recall  f1-score   support

       False       0.80      0.75      0.78       861
        True       0.77      0.82      0.80       878

    accuracy                           0.79      1739
   macro avg       0.79      0.79      0.79      1739
weighted avg       0.79      0.79      0.79      1739



In [2]:
# Display the predicted labels for the test set as a quick visual check.
test_predictions

array([ True, False,  True, ...,  True,  True, False])

In [5]:
# Destination of the submission file.
submission_file_path = 'submission4.csv'

# One row per test passenger: the PassengerId from the raw test set next to its
# predicted label. The label column is cast to bool explicitly (a no-op when the
# predictions are already boolean).
submission = (
    test_data[['PassengerId']]
    .assign(Transported=test_predictions)
    .astype({'Transported': bool})
)

# Write without the index so the file contains only the two columns above.
submission.to_csv(submission_file_path, index=False)

print(f'Submission file saved to {submission_file_path}')

Submission file saved to submission4.csv
