In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the dataset (the path is relative to the current working directory)
data = pd.read_csv('titanic.csv')

# Select relevant variables
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

# Split the data into training (70%), development (15%), and test (15%) sets
train_data, temp_data = train_test_split(data, test_size=0.3, random_state=42)
dev_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

# The split frames are derived from `data`. Take explicit copies so the
# preprocessing below modifies each split itself; this avoids pandas'
# SettingWithCopy warnings and keeps working under Copy-on-Write.
train_data = train_data.copy()
dev_data = dev_data.copy()
test_data = test_data.copy()
splits = (train_data, dev_data, test_data)

# Handle missing values in the 'Age' column by filling with the mean age.
# The mean comes from the training split only, so nothing leaks from the
# dev/test splits. The result is assigned back instead of using a chained
# `inplace=True` call, which may operate on a temporary and change nothing.
mean_age = train_data['Age'].mean()
for split in splits:
    split['Age'] = split['Age'].fillna(mean_age)

# Encode categorical features
encoder = LabelEncoder()
for feature in features:
    # Anything that is not numeric is treated as categorical; this covers
    # both `object` columns and pandas' dedicated string dtype.
    if pd.api.types.is_numeric_dtype(train_data[feature]):
        continue

    # Fill missing categories with the most frequent training value so the
    # encoder never meets a label (NaN) in dev/test that it did not see
    # while fitting on the training split.
    most_common = train_data[feature].mode(dropna=True).iloc[0]
    for split in splits:
        split[feature] = split[feature].fillna(most_common)

    # The encoder is re-fitted for each feature on the training split and
    # applied to dev/test right away, before the next feature refits it.
    train_data[feature] = encoder.fit_transform(train_data[feature])
    dev_data[feature] = encoder.transform(dev_data[feature])
    test_data[feature] = encoder.transform(test_data[feature])

# Define X_train, y_train, X_dev, and y_dev
X_train = train_data[features]
y_train = train_data[target]
X_dev = dev_data[features]
y_dev = dev_data[target]

# Candidate ensembles as (display name, estimator, params) tuples.
# `params` is an empty dict for every entry and is not used below; it is kept
# so that `best_model` retains its (name, estimator, params) shape.
models = [
    ("Bagged", BaggingClassifier(n_estimators=100, random_state=42), {}),
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42), {}),
    ("Boosted", GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42), {})
]

best_model = None
# Start below any achievable accuracy so that a model is always selected,
# even if every candidate scores 0.0 on the development set.
best_accuracy = float("-inf")

# (name, accuracy) pairs in training order, reused for the report below so
# that each model is only asked to predict on the development set once.
dev_results = []

# Train models and evaluate on the development set
for model_name, model, params in models:
    model.fit(X_train, y_train)
    y_dev_pred = model.predict(X_dev)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)
    dev_results.append((model_name, dev_accuracy))

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if dev_accuracy > best_accuracy:
        best_accuracy = dev_accuracy
        best_model = (model_name, model, params)

# Report model accuracies
print("Model Accuracies:")
for model_name, dev_accuracy in dev_results:
    print(f"{model_name}: {dev_accuracy:.2f}")

# Fit a dedicated Random Forest on the training split (fit() returns the
# fitted estimator, so construction and training can be chained).
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
).fit(X_train, y_train)

# One importance score per entry of `features`, in the same order
feature_importances = rf_model.feature_importances_

# Pair each feature name with its score and rank from highest to lowest
feature_importance_dict = {
    name: score for name, score in zip(features, feature_importances)
}
sorted_feature_importances = sorted(
    feature_importance_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# The first entry of the ranking is the strongest feature, the last the weakest
most_important_feature, _top_score = sorted_feature_importances[0]
least_important_feature, _bottom_score = sorted_feature_importances[-1]

print(f"\nThe most important feature for predicting survival is '{most_important_feature}'.")
print(f"The least important feature for predicting survival is '{least_important_feature}'.")

# Score the model that won on the development set against the held-out test
# split and report its accuracy.
X_test, y_test = test_data[features], test_data[target]

# `best_model` is a (name, estimator, params) tuple; params is not needed here
best_name, best_estimator, _unused_params = best_model

y_test_pred = best_estimator.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test Accuracy of Best Model ({best_name}): {test_accuracy:.2f}")


FileNotFoundError: [Errno 2] No such file or directory: 'titanic.csv'