In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load the Titanic dataset
data = pd.read_csv('titanic.csv')

# Select relevant features (variables) for prediction.
# The explicit .copy() makes `data` an independent frame instead of a
# selection of the loaded one, so the column assignments below modify it
# directly and do not trigger SettingWithCopy / chained-assignment warnings.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
target = 'Survived'
data = data[features + [target]].copy()

# Handle missing data: median for the numeric 'Age' column, most frequent
# value for the categorical 'Embarked' column.
# Assign the filled column back rather than calling fillna(inplace=True) on
# data['Age']: the in-place form operates on a temporary Series and does not
# update `data` when pandas Copy-on-Write is active.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Convert categorical variables to numerical indicator columns.
# drop_first=True drops one level per variable to avoid a redundant column.
data = pd.get_dummies(data, columns=['Sex', 'Embarked'], drop_first=True)

# Define the train-dev-test split ratios (60% - 20% - 20%)
train_ratio = 0.6
dev_ratio = 0.2
test_ratio = 0.2

# Split the data into train-dev-test sets
X = data.drop(target, axis=1)
y = data[target]

# First split: keep `train_ratio` of the rows for training and hold out the
# remainder (dev + test) in X_temp / y_temp.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=1 - train_ratio, random_state=42
)

# Second split: `test_size` is a fraction of the held-out rows, not of the
# full dataset, so the test share must be rescaled by the held-out share.
# With 20% dev and 20% test this is 0.5, i.e. the hold-out is cut in half.
# (The previous expression reduced to dev_ratio = 0.2, which produced a
# roughly 32% dev / 8% test split instead of 20% / 20%.)
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=test_ratio / (dev_ratio + test_ratio), random_state=42
)

# Candidate tree depths (2 through 10) and the accuracy recorded for each one,
# kept in the same order as max_depth_values.
max_depth_values = range(2, 11)
train_accuracy, dev_accuracy = [], []

# Fit one Decision Tree per candidate depth on the training set and score it
# on both the training and the development set.
for max_depth in max_depth_values:
    clf = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
    clf.fit(X_train, y_train)

    y_train_pred, y_dev_pred = clf.predict(X_train), clf.predict(X_dev)

    train_accuracy += [accuracy_score(y_train, y_train_pred)]
    dev_accuracy += [accuracy_score(y_dev, y_dev_pred)]

# Pick the depth with the highest development accuracy. max() returns the
# first maximal pair, so ties resolve to the smallest depth.
best_max_depth, best_dev_accuracy = max(
    zip(max_depth_values, dev_accuracy), key=lambda pair: pair[1]
)

# Refit a tree at the selected depth on the training set.
final_clf = DecisionTreeClassifier(random_state=42, max_depth=best_max_depth)
final_clf.fit(X_train, y_train)

# Score the final model once on the held-out test set.
y_test_pred = final_clf.predict(X_test)
test_accuracy_final = accuracy_score(y_test, y_test_pred)

# Report the test accuracy, the chosen depth and its development accuracy.
print(f'Final Model Test Accuracy: {test_accuracy_final:.2f}')
print(f'Best max_depth: {best_max_depth}')
print(f'Accuracy on the development set: {best_dev_accuracy:.2f}')

# Draw train and development accuracy as two curves over the candidate depths.
plt.figure(figsize=(8, 6))
for accuracies, label in (
    (train_accuracy, 'Train Accuracy'),
    (dev_accuracy, 'Development Accuracy'),
):
    plt.plot(max_depth_values, accuracies, label=label)

# Title, axis labels, legend and grid, then render the figure.
plt.title('Decision Tree Accuracy vs. max_depth')
plt.xlabel('max_depth')
plt.ylabel('Accuracy')
plt.grid(True)
plt.legend()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'titanic.csv'