<a href="https://colab.research.google.com/github/Kaiziferr/machine_learning/blob/main/decision_tree/01_show_ways_modify_importance_features_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
from unicodedata import normalize

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import (DecisionTreeRegressor, plot_tree)
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# **Info**
---
@By: **Steven Bernal**

@Nickname: **Kaiziferr**

@Git: https://github.com/Kaiziferr

# **Funciones**
---

In [None]:
def normalize_word(word):
  """Normalize a feature name to lowercase ASCII snake_case.

  Spaces are treated as underscores, empty fragments (produced by leading,
  trailing or repeated separators) are dropped, every fragment is lower-cased
  and accents are removed by NFKD-decomposing the text and discarding the
  non-ASCII bytes.

  Args:
    word: Raw column name, e.g. 'Valor Contraprestacion '.

  Returns:
    The normalized name, e.g. 'valor_contraprestacion'.
  """
  # Always split on '_'. The former `if word.find('_')` check was falsy only
  # when the separator sat at index 0, which made the function iterate over
  # single characters (' Año' -> '__a_n_o').
  fragments = [w for w in word.replace(' ', '_').split('_') if w != '']
  fragments = [
      normalize('NFKD', w.lower()).encode('ASCII', 'ignore').decode()
      for w in fragments
  ]
  return "_".join(fragments)


def normalize_name_columns(columns):
  """Return a list with `normalize_word` applied to every column name."""
  return [normalize_word(name) for name in columns]


def tree_plot(model, X_columns, y_columns):
  """Print the depth and leaf count of a fitted tree and draw it.

  Args:
    model: Fitted tree estimator exposing `get_depth()` and `get_n_leaves()`.
    X_columns: Feature names, forwarded to `plot_tree` as `feature_names`.
    y_columns: Names forwarded to `plot_tree` as `class_names`.
  """
  figure, axis = plt.subplots(figsize=(12, 5))

  depth = model.get_depth()
  print(f"Tree Deep: {depth}")
  n_leaves = model.get_n_leaves()
  print(f"Number of terminal nodes: {n_leaves}")

  # Rendering options collected in one place; the tree is drawn on `axis`.
  drawing_options = dict(
      decision_tree=model,
      feature_names=X_columns,
      class_names=y_columns,
      filled=True,
      impurity=False,
      fontsize=5,
      precision=2,
      ax=axis,
  )
  plot_tree(**drawing_options)

def importances_feature(model, X_columns):
  """Build a table of predictors ranked by importance.

  Args:
    model: Fitted estimator exposing `feature_importances_`.
    X_columns: Predictor names, in the same order as the importances.

  Returns:
    DataFrame with columns 'predictor' and 'importance' (importances
    multiplied by 100), sorted from most to least important.
  """
  importance_pct = model.feature_importances_ * 100
  ranking = pd.DataFrame({'predictor': X_columns, 'importance': importance_pct})
  ranking = ranking.sort_values(by='importance', ascending=False)
  return ranking

# **Config**
---

In [None]:
# Seed reused by the train/test split and by every tree in the notebook.
random_seed = 12354

# Global display options: silence warnings, seaborn theme, 3-decimal floats.
warnings.filterwarnings('ignore')
sns.set(style='darkgrid')
pd.set_option('display.float_format', '{:,.3f}'.format)
np.set_printoptions(precision=3, suppress=True)

# Presentation constants (not referenced by the cells visible here).
title_data = 'Mineral Resources'
paleta = sns.color_palette('Set2').as_hex()

# **Data**
---

In [None]:
# Google Drive share link of the CSV. The file id is the path segment right
# before '/view'; it is re-used to build the 'uc?id=' URL given to read_csv.
url_materiales_mineros = 'https://drive.google.com/file/d/10_YGM_dOKzzLsLNkfB_bN5YeLgDHcrUZ/view?usp=sharing'
drive_file_id = url_materiales_mineros.split('/')[-2]
url_materiales_mineros = 'https://drive.google.com/uc?id=' + drive_file_id

# Every column is read as text; the numeric ones are cast in a later cell.
data_materiales_mineros = pd.read_csv(url_materiales_mineros, dtype='str')

In [None]:
data_materiales_mineros.columns

Index(['Municipio Productor', 'Departamento', 'Recurso Natural',
       'Nombre Del Proyecto', 'Año Produccion', 'Trimestre', 'Unidad Medida',
       'Tipo Contraprestacion', 'Valor Contraprestacion ',
       'Cantidad Producción'],
      dtype='object')

In [None]:
data_materiales_mineros.head()

Unnamed: 0,Municipio Productor,Departamento,Recurso Natural,Nombre Del Proyecto,Año Produccion,Trimestre,Unidad Medida,Tipo Contraprestacion,Valor Contraprestacion,Cantidad Producción
0,Medellin,Antioquia,ARCILLAS,PRODUCTORES,2018,Trimestre 4,TONELADAS,REGALIAS,614258,0
1,Medellin,Antioquia,ARCILLAS CERAMICAS,PRODUCTORES,2014,Trimestre 4,TONELADAS,REGALIAS,2036781,14381
2,Medellin,Antioquia,ARCILLAS FERRUGINOSAS,PRODUCTORES,2012,Trimestre 1,TONELADAS,REGALIAS,836927,16308
3,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2019,Trimestre 1,TONELADAS,REGALIAS,5763839,33348
4,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2022,Trimestre 1,TONELADAS,REGALIAS,7187998,28540


In [None]:
# Replace the Spanish headers with English names. The assignment is positional,
# so the order below must match the column order of the loaded CSV.
data_materiales_mineros.columns = [
    'producing_municipality',  # Municipio Productor
    'department',              # Departamento
    'natural_resource',        # Recurso Natural
    'project_name',            # Nombre Del Proyecto
    'production_year',         # Año Produccion
    'quarter',                 # Trimestre
    'unit_measure',            # Unidad Medida
    'compensation',            # Tipo Contraprestacion
    'amount',                  # Valor Contraprestacion
    'production quantity'      # Cantidad Producción (space is removed by the normalization cell)
]

In [None]:
# Normalize the feature names (lowercase ASCII snake_case)
normalized_columns = normalize_name_columns(data_materiales_mineros.columns)
data_materiales_mineros.columns = normalized_columns

In [None]:
data_materiales_mineros.columns

Index(['producing_municipality', 'department', 'natural_resource',
       'project_name', 'production_year', 'quarter', 'unit_measure',
       'compensation', 'amount', 'production_quantity'],
      dtype='object')

In [None]:
# Cast the numeric features, which were read as text, to float
numeric_features = ['amount', 'production_year']
data_materiales_mineros[numeric_features] = (
    data_materiales_mineros[numeric_features].astype('float64')
)

In [None]:
# Keep a five-year window (2018 to 2022, both inclusive) for model training.
# `.copy()` makes the result an independent frame, so the column assignments
# done in later cells modify it directly instead of a slice of the original
# frame (the SettingWithCopyWarning case).
data_materiales_mineros = data_materiales_mineros[
    (data_materiales_mineros["production_year"] >= 2018)
    &
    (data_materiales_mineros["production_year"] <= 2022)
].copy()

In [None]:
# Keep only the quarter number: 'Trimestre 4' -> '4'.
# `assign` builds a new frame instead of writing into the filtered slice, and
# taking the last token (rather than the token at index 1) leaves already
# converted values untouched, so the cell can be re-run safely.
data_materiales_mineros = data_materiales_mineros.assign(
    quarter=data_materiales_mineros['quarter'].str.split(' ').str[-1]
)

data_materiales_mineros.head(3)

Unnamed: 0,producing_municipality,department,natural_resource,project_name,production_year,quarter,unit_measure,compensation,amount,production_quantity
0,Medellin,Antioquia,ARCILLAS,PRODUCTORES,2018.0,4,TONELADAS,REGALIAS,614258.0,0
3,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2019.0,1,TONELADAS,REGALIAS,5763839.0,33348
4,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2022.0,1,TONELADAS,REGALIAS,7187998.0,28540


In [None]:
# Build a composite key '<municipality>-<department>' from the two location features
municipality = data_materiales_mineros['producing_municipality']
department = data_materiales_mineros['department']
data_materiales_mineros['key'] = municipality.str.cat(department, sep='-')

In [None]:
data_materiales_mineros.head()

Unnamed: 0,producing_municipality,department,natural_resource,project_name,production_year,quarter,unit_measure,compensation,amount,production_quantity,key
0,Medellin,Antioquia,ARCILLAS,PRODUCTORES,2018.0,4,TONELADAS,REGALIAS,614258.0,0,Medellin-Antioquia
3,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2019.0,1,TONELADAS,REGALIAS,5763839.0,33348,Medellin-Antioquia
4,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2022.0,1,TONELADAS,REGALIAS,7187998.0,28540,Medellin-Antioquia
6,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2019.0,2,TONELADAS,REGALIAS,5802546.0,32485,Medellin-Antioquia
7,Medellin,Antioquia,ARCILLAS MISCELANEAS,PRODUCTORES,2022.0,2,TONELADAS,REGALIAS,4051294.0,16309,Medellin-Antioquia


In [None]:
# Aggregate the transactions per location key, year, quarter and compensation
# type: total amount and number of records in each group.
filter_feature = [
    'key',
    'production_year',
    'quarter',
    'compensation']

# Named aggregation ties every output column to its source column and
# function, so the result no longer depends on renaming columns by position.
data_group = (
    data_materiales_mineros
    .groupby(filter_feature, as_index=False)
    .agg(
        sum_compensation=('amount', 'sum'),
        count_compensation=('amount', 'count'),
    )
)

data_group.head()

Unnamed: 0,key,production_year,quarter,compensation,sum_compensation,count_compensation
0,Abejorral-Antioquia,2018.0,1,REGALIAS,2783489.0,1
1,Abejorral-Antioquia,2018.0,2,REGALIAS,3659452.0,1
2,Abejorral-Antioquia,2018.0,3,REGALIAS,5732763.0,1
3,Abejorral-Antioquia,2018.0,4,REGALIAS,11706438.0,2
4,Abejorral-Antioquia,2019.0,1,REGALIAS,5793131.0,2


In [None]:
# Collapse the quarterly rows into one row per location key and compensation
# type: total amount, number of records, number of quarters with activity and
# number of distinct years with activity.
filter_feature = [
    'key',
    'compensation']

# Named aggregation binds each output name to its source column and function
# explicitly instead of relying on the order of a positional rename.
data_group2 = (
    data_group
    .groupby(filter_feature, as_index=False)
    .agg(
        total_compensation=('sum_compensation', 'sum'),
        count_transactions=('count_compensation', 'sum'),
        number_active_quarters=('quarter', 'count'),
        year_production_active=('production_year', 'nunique'),
    )
)

data_group2.head()

Unnamed: 0,key,compensation,total_compensation,count_transactions,number_active_quarters,year_production_active
0,Abejorral-Antioquia,REGALIAS,52580674.0,17,11,4
1,Abrego-Norte de Santander,REGALIAS,22530021.0,21,16,5
2,Abriaqui-Antioquia,REGALIAS,355266722.0,35,15,5
3,Acacias-Meta,REGALIAS,607849192.0,58,20,5
4,Achi-Bolivar,REGALIAS,557468046.0,6,3,1


# **Split**
---

In [None]:
# Column names split by dtype: numeric (float64 / int64) and text (object)
numeric_frame = data_group2.select_dtypes(include=['float64', 'int64'])
data_numerica = numeric_frame.columns.tolist()
text_frame = data_group2.select_dtypes(include=['object'])
data_categorica = text_frame.columns.tolist()

In [None]:
# The goal is to predict the amounts corresponding to each type of compensation.
target_name = data_numerica[0]
feature_names = data_numerica[1:] + data_categorica[1:]
print(target_name)
print(feature_names)

total_compensation
['count_transactions', 'number_active_quarters', 'year_production_active', 'compensation']


In [None]:
# Predictors: every numeric column after the first, plus every text column
# after the first; target: the first numeric column.
predictor_columns = data_numerica[1:] + data_categorica[1:]
X = data_group2.loc[:, predictor_columns]
y = data_group2.loc[:, data_numerica[0]]

In [None]:
# Hold out 25% of the rows for testing; the seed makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=random_seed)
X_train, X_test, y_train, y_test = split_parts

# **Preprocessing and Model**
---

I will first use the "OneHotEncoder" method without eliminating a variable.

In [None]:
# One-hot encode the categorical predictor(s); the other columns pass through
one_hot_step = ('OneHot', OneHotEncoder(), data_categorica[1:])
compose_column = ColumnTransformer([one_hot_step], remainder='passthrough')

# Fit on the training rows only, then apply the same encoding to the test rows
X_train_compose = compose_column.fit_transform(X_train)
X_test_compose = compose_column.transform(X_test)

# Column labels for the transformed arrays; `col` assumes the encoded columns
# come first in the output, followed by the passthrough numeric ones.
fitted_encoder = compose_column.named_transformers_['OneHot']
encoded_cat = fitted_encoder.get_feature_names_out(data_categorica[1:]).tolist()
col = encoded_cat + data_numerica[1:]
col

['compensation_COMPENSACIÓN',
 'compensation_REGALIAS',
 'count_transactions',
 'number_active_quarters',
 'year_production_active']

In [None]:
# Wrap the encoded arrays in DataFrames. Reusing the original row labels keeps
# the features aligned with y_train / y_test for any index-based operation
# (a fresh RangeIndex would silently break that alignment).
X_train_method = pd.DataFrame(X_train_compose, columns=col, index=X_train.index)
X_test_method = pd.DataFrame(X_test_compose, columns=col, index=X_test.index)

- A model with a depth of 3 will be used as a baseline.

In [None]:
# Baseline: a depth-3 regression tree with a fixed seed
model_base = DecisionTreeRegressor(random_state=random_seed, max_depth=3)

# Snapshot of the baseline hyperparameters; the later experiments start from a copy
params_base_master = model_base.get_params()

model_base.fit(X_train_method, y_train)
importances_feature(model_base, X_train_method.columns)

Unnamed: 0,predictor,importance
2,count_transactions,93.62
1,compensation_REGALIAS,6.38
0,compensation_COMPENSACIÓN,0.0
3,number_active_quarters,0.0
4,year_production_active,0.0


- After this preprocessing a single feature (`count_transactions`) dominates the importance ranking: only `compensation_REGALIAS` also contributes, and the remaining features receive zero importance.

- One possible cause could be the limited variation of the other variables compared to the most important one (number of transactions).

In [None]:
# Variance of every predictor in the encoded training frame
var_transactions = X_train_method['count_transactions'].var()
var_quarters = X_train_method['number_active_quarters'].var()
var_years = X_train_method['year_production_active'].var()
var_compensacion = X_train_method['compensation_COMPENSACIÓN'].var()
var_regalias = X_train_method['compensation_REGALIAS'].var()

print(f"""
- numero_transacciones: {var_transactions}
- numero_trimestres_activos: {var_quarters}
- ano_produccion_activos: {var_years}
- tipo_contraprestacion_COMPENSACIÓN: {var_compensacion}
- tipo_contraprestacion_REGALIAS: {var_regalias}
""")


- numero_transacciones: 824.9490913552819
- numero_trimestres_activos: 45.289707192953
- ano_produccion_activos: 1.9150833731578476
- tipo_contraprestacion_COMPENSACIÓN: 0.05265231262028081
- tipo_contraprestacion_REGALIAS: 0.05265231262028081



The modified parameter is `max_features`, set to 'sqrt'.

In [None]:
# Baseline hyperparameters with max_features overridden to "sqrt"
params_t = {**params_base_master, 'max_features': "sqrt"}
model_base.set_params(**params_t)
model_base.fit(X_train_method, y_train)
importances_feature(model_base, X_train_method.columns)

Unnamed: 0,predictor,importance
2,count_transactions,74.776
3,number_active_quarters,12.949
1,compensation_REGALIAS,12.275
0,compensation_COMPENSACIÓN,0.0
4,year_production_active,0.0


The importance of `count_transactions` decreased, while the importance of `number_active_quarters` and `compensation_REGALIAS` increased. The latter is a one-hot column that flags the compensation type REGALIAS.

The modified parameter is `max_features`, set to 'log2'.

In [None]:
# Baseline hyperparameters with max_features overridden to "log2"
params_t = {**params_base_master, 'max_features': "log2"}
model_base.set_params(**params_t)
model_base.fit(X_train_method, y_train)
importances_feature(model_base, X_train_method.columns)

Unnamed: 0,predictor,importance
2,count_transactions,74.776
3,number_active_quarters,12.949
1,compensation_REGALIAS,12.275
0,compensation_COMPENSACIÓN,0.0
4,year_production_active,0.0


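These results are identical to the previous ones: with 5 features, both 'sqrt' and 'log2' resolve to 2 candidate features per split, and the random seed is the same.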

Now `max_features` is set to a float (0.7), i.e. the fraction of the features considered at each split.

In [None]:
# Baseline hyperparameters with max_features overridden to the fraction 0.7
params_t = {**params_base_master, 'max_features': 0.7}
model_base.set_params(**params_t)
model_base.fit(X_train_method, y_train)
importances_feature(model_base, X_train_method.columns)

Unnamed: 0,predictor,importance
2,count_transactions,93.097
0,compensation_COMPENSACIÓN,6.468
3,number_active_quarters,0.434
1,compensation_REGALIAS,0.0
4,year_production_active,0.0


With this configuration, the model assigns importance to the other one-hot column (`compensation_COMPENSACIÓN`) instead of `compensation_REGALIAS`.

Now we will validate how the model's importance behaves by applying certain preprocessing to the categorical variable.
---

First, the rows of each compensation type are counted and divided by the total number of training examples, which gives the relative frequency of each type.

In [None]:
# Frequency encoding: replace each compensation type by its share of the
# training rows. Only the feature itself is needed for that, so the target is
# no longer concatenated just to count rows.
X_train_one = X_train.copy()
count_compensation = X_train['compensation'].value_counts(normalize=True)
X_train_one['frequency compensation'] = X_train_one['compensation'].map(count_compensation)
X_train_one = X_train_one.drop(['compensation'], axis=1)
X_train_one

Unnamed: 0,count_transactions,number_active_quarters,year_production_active,frequency compensation
257,7,5,4,0.944
403,2,1,1,0.056
447,65,20,5,0.944
536,1,1,1,0.944
350,119,20,5,0.944
...,...,...,...,...
635,77,20,5,0.944
298,22,10,5,0.944
22,51,20,5,0.944
100,6,5,3,0.944


In [None]:
# Fresh depth-3 baseline tree, now trained on the frequency-encoded frame
model_base = DecisionTreeRegressor(random_state=random_seed, max_depth=3)
params_base_master = model_base.get_params()

model_base.fit(X_train_one, y_train)
importances_feature(model_base, X_train_one.columns)

Unnamed: 0,predictor,importance
0,count_transactions,93.62
3,frequency compensation,6.38
1,number_active_quarters,0.0
2,year_production_active,0.0


In this experiment, the importance previously given to `compensation_REGALIAS` (6.38%) is assigned to the frequency-encoded variable.

In this iteration, the probability that the event does not occur (1 - proba_true) is added as a feature.

In [None]:
# The relative frequency is read as the probability of observing the category
X_train_one = X_train_one.rename({'frequency compensation': 'proba_true'}, axis='columns')

In [None]:
# Complementary probability: 1 - proba_true
X_train_one['proba_false'] = X_train_one['proba_true'].rsub(1)

In [None]:
X_train_one.head()

Unnamed: 0,count_transactions,number_active_quarters,year_production_active,proba_true,proba_false
257,7,5,4,0.944,0.056
403,2,1,1,0.056,0.944
447,65,20,5,0.944,0.056
536,1,1,1,0.944,0.056
350,119,20,5,0.944,0.056


In [None]:
# Refit the same tree on the frame that now also holds proba_false
fitted_model = model_base.fit(X_train_one, y_train)
importances_feature(fitted_model, X_train_one.columns)

Unnamed: 0,predictor,importance
0,count_transactions,93.62
3,proba_true,6.38
1,number_active_quarters,0.0
2,year_production_active,0.0
4,proba_false,0.0


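But the results are the same as in the previous configuration: `proba_false` is just 1 - `proba_true`, so it carries no additional information.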

This shows that the feature importances of a decision tree can be modified indirectly, to some extent. Although a more robust model could be chosen (for example, tree ensembles such as XGBoost), it is interesting to see that the importances can be altered in this model, bearing in mind that categorical variables are needed to explain different behaviors of the problem context.

# **Info**
---
@By: **Steven Bernal**

@Nickname: **Kaiziferr**

@Git: https://github.com/Kaiziferr

In [None]:
# Feature Weighting Technique