<a href="https://colab.research.google.com/github/Kaiziferr/machine_learning/blob/main/decision_tree/03_pos_pruning_tree_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [35]:
import warnings

import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.datasets import make_regression, make_friedman1

# **Config**
---

In [36]:
sns.set(style="darkgrid")
pd.set_option('display.float_format', '{:,.5f}'.format)
random_seed = 12354
warnings.filterwarnings('ignore')

# **Data**
---

In [37]:
X, y = make_regression(n_samples=500, n_features=8, n_informative=5, bias=0.0, noise=1.5, random_state=random_seed)
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.33309,0.40515,-0.29568,0.03915,0.62578,0.03084,2.14020,-2.30404
1,0.64701,0.97186,-0.25811,0.18368,-0.05451,0.66951,0.03158,0.35562
2,1.18575,-1.84027,0.01981,-0.23854,-0.56441,-1.03689,-1.20976,-1.75251
3,0.55373,0.92211,0.37396,0.66754,1.27186,-0.16641,0.60589,0.36458
4,-0.55459,-0.23257,2.49568,-0.18946,-0.29539,-1.20166,-1.05297,0.29651
...,...,...,...,...,...,...,...,...
495,-0.38648,0.78051,-0.01369,-1.73480,-1.62471,1.14644,-0.73190,0.20413
496,0.26934,1.02124,0.17559,0.91135,0.35462,-0.04796,-0.18890,-1.32060
497,0.04411,-1.40635,-2.06421,-0.57773,0.48997,-1.27363,-0.09255,0.12158
498,0.93807,0.05416,-0.80141,-2.41603,-0.46104,0.95923,-1.20203,1.72506


In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=random_seed)

In [63]:
def cost_complexity_pruning_scores(
  estimator,
  X_train,
  X_test,
  y_train,
  y_test,
  individual_metric,
  cv = 5,
  cv_metric = 'neg_mean_squared_error',
  typ = 'R'
):
  try:
    cost = estimator.cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas, impurities = cost.ccp_alphas, cost.impurities
    node_counts = []
    depth = []
    train_scores = []
    test_scores = []
    models = []
    cro_cv = []
    score_trainer = []
    score_test = []
    for ccp_alpha in ccp_alphas:
      model = DecisionTreeRegressor(random_state=random_seed, ccp_alpha=ccp_alpha)
      p = 1
      if cv_metric in ['neg_mean_squared_error']:
        p=-1
      cro_cv.append(p*cross_val_score(model, X_train, y_train, cv=cv, scoring=cv_metric).mean())
      model.fit(X_train, y_train)
      score_trainer.append(model.score(X_train, y_train))
      score_test.append(model.score(X_test, y_test))
      node_counts.append(model.tree_.node_count)
      depth.append(model.tree_.max_depth)
      train_scores.append(individual_metric(y_train, model.predict(X_train)))
      test_scores.append(individual_metric(y_test, model.predict(X_test)))
      models.append(model)


    fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15,10))
    ax[0,0].plot(ccp_alphas, score_trainer, marker="o", label="train", drawstyle="steps-post")
    ax[0,0].plot(ccp_alphas, score_test, marker="o", label="train", drawstyle="steps-post")
    ax[0,0].set_xlabel("Alpha")
    ax[0,0].set_ylabel("Score")
    ax[0,0].set_title("Score vs alpha for training and testing sets")
    best_alpha = ccp_alphas[score_test.index(m(score_test))]
    print(best_alpha)

    ax[0,1].plot(ccp_alphas, train_scores, marker="o", label="train", drawstyle="steps-post")
    ax[0,1].plot(ccp_alphas, test_scores, marker="o", label="test", drawstyle="steps-post")
    ax[0,1].set_xlabel("Alpha")
    ax[0,1].set_ylabel("Metric")
    ax[0,1].set_title("Metric vs alpha for training and testing sets")
    best_alpha = ccp_alphas[test_scores.index(min(test_scores))]
    print(best_alpha)

    ax[0,2].plot(ccp_alphas, cro_cv, marker="o", label="test", drawstyle="steps-post")
    ax[0,2].set_xlabel("Alpha")
    ax[0,2].set_ylabel("Mean Score Cross Validation ")
    ax[0,2].set_title("Cross Validation vs alpha")
    best_alpha = ccp_alphas[cro_cv.index(min(cro_cv))]
    print(best_alpha)

    ax[1,0].plot(ccp_alphas[:-1], impurities[: -1], marker="o", drawstyle='steps-post')
    ax[1,0].set_xlabel("Effective alpha")
    ax[1,0].set_ylabel("Total impurity of leaves")
    ax[1,0].set_title("Total Impurity vs effective alpha for training set")

    ax[1,1].plot(ccp_alphas, node_counts, marker="o", drawstyle="steps-post")
    ax[1,1].set_xlabel("Alpha")
    ax[1,1].set_ylabel("Number of nodes")
    ax[1,1].set_title("Number of nodes vs alpha")

    ax[1,2].plot(ccp_alphas, depth, marker="o", drawstyle="steps-post")
    ax[1,2].set_xlabel("alpha")
    ax[1,2].set_ylabel("depth of tree")
    ax[1,2].set_title("Depth vs alpha")
    fig.tight_layout()
    plt.subplots_adjust(top=0.9)
  except:
    print('x')



In [64]:
model_regresion = DecisionTreeRegressor(random_state=random_seed)

In [65]:
cost_complexity_pruning_scores(model_regresion, X_train, X_test, y_train, y_test, individual_metric=mean_absolute_error, cv=10)

x
