# Baseline Model

## Table of Contents
1. [Model Choice](#model-choice)
2. [Feature Selection](#feature-selection)
3. [Implementation](#implementation)
4. [Evaluation](#evaluation)


In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.tree import export_text

## Model Choice

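We chose a decision tree model because of its simplicity and interpretability. Since we are dealing with a classification problem, a tree-based approach also performs implicit feature selection. Moreover, decision trees are applicable to both categorical and numerical data, both of which are present in our dataset, with little need for preprocessing (for example, no feature scaling), and they can also capture nonlinear relationships. Note that scikit-learn's implementation expects numeric input, so non-numeric columns are dropped before training.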



## Feature Selection

We train our model on all available variables; afterwards we will apply a SHAP method in order to identify the most and least important features for the target variable.

In [3]:
# Load the cleaned dataset. The path is relative to the notebook's working directory.
data_path = "../share_w6_retirees_w9_clean - share_w6_retirees_w9_clean.csv"
try:
    df = pd.read_csv(data_path)
except FileNotFoundError as exc:
    # Re-raise with an actionable hint instead of a bare traceback.
    raise FileNotFoundError(
        f"Dataset not found at {data_path!r}. "
        "Place the CSV at that location or update `data_path`."
    ) from exc
df.head()

# Flag the columns that contain at least one missing value.
missing_per_column = df.isna().sum()
missing_per_column > 0

df.shape  # (1394, 127) in the recorded run

# Keep only non-object columns: every object-dtype column is dropped here
# before the frame is handed to the classifier.
object_cols = df.select_dtypes(include='object')
df_new = df.drop(columns=object_cols.columns)
df_new.head()

df_new.info()



FileNotFoundError: [Errno 2] No such file or directory: '../share_w6_retirees_w9_clean - share_w6_retirees_w9_clean.csv'

## Implementation



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline

# 1. Replace missing values with 0. Rebinding (instead of inplace=True) keeps
#    this cell idempotent while `df_new` still ends up fully imputed.
df_new = df_new.fillna(0)

# 2. Separate the target column from the features and hold out 20% as the
#    test set, stratified on the target.
y = df_new['ep036__7']
X = df_new.drop(columns=['ep036__7'])
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Further split train+val into training and validation sets (75% train, 25% val)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

# 4. Model pipeline and the hyper-parameter grid to search.
#    The 'clf__' prefix routes each parameter to the pipeline step named 'clf'.
pipeline = Pipeline([
    ('clf', DecisionTreeClassifier(random_state=42))
])

param_grid = {
    'clf__criterion': ['gini', 'entropy', 'log_loss'],
    'clf__max_depth': [None, 3, 5, 10],
    'clf__min_samples_split': [2, 5, 10],
    'clf__min_samples_leaf': [1, 2, 4]
}

# 5. Set up cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 6. Use GridSearchCV for tuning
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# 7. Fit model (on the training split only; validation and test stay untouched)
grid_search.fit(X_train, y_train)

# Pull the fitted tree out of the best pipeline so it can be plotted/exported.
best_model = grid_search.best_estimator_.named_steps['clf']
best_model

X_train.columns

# 8. Plot the fitted tree. `class_names` must list the target classes (in the
#    order of `classes_`), not the feature names.
fig, ax = plt.subplots(figsize=(20, 14))
plot_tree(
    best_model,
    feature_names=list(X_train.columns),
    class_names=[str(label) for label in best_model.classes_],
    filled=True,
    ax=ax,
)
ax.set_title("Decision Tree for train X-y Dataset")
# Save before showing: once the figure has been shown, a later savefig can
# write an empty image.
fig.savefig('decision_tree.png')
plt.show()

# 9. Export the tree as text rules, print them, and save them to a text file
tree_rules = export_text(best_model, feature_names=list(X_train.columns))
print(tree_rules)

with open("decision_tree_rules.txt", "w") as f:
    f.write(tree_rules)

# 10. Print best parameters and CV score
print("Best Params:", grid_search.best_params_)
print("Best CV Accuracy:", grid_search.best_score_)

## Evaluation





In [None]:
def _print_scores(accuracy_label, report_label, y_true, y_hat):
    """Print the accuracy and the classification report for one set of predictions."""
    print(accuracy_label, accuracy_score(y_true, y_hat))
    print(report_label, classification_report(y_true, y_hat))


# Score the best estimator found by the grid search on the held-out test set.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
_print_scores("\nTest Accuracy:", "Test Classification Report:\n", y_test, y_pred)

# Tree with default hyper-parameters: fit on the training split and
# score on the validation split.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
val_preds = clf.predict(X_val)
_print_scores("Validation Accuracy:", "Validation Report:\n", y_val, val_preds)

# Same default tree refit on train+validation, then scored on the test set.
clf_final = DecisionTreeClassifier(random_state=42).fit(X_train_val, y_train_val)
test_preds = clf_final.predict(X_test)
_print_scores("Test Accuracy:", "Test Report:\n", y_test, test_preds)