<h1>Aprendizaje de arboles de decision</h1>
<ul>
    <li> Objetivo: aprender un arbol de decision consistente con los ejemplos
    <li> Para posteriormente clasificar ejemplos nuevos
    <li> Ejemplos de conjuntos de entrenamiento:
</ul>

![Alt text](image.png)

<ul>
    <li><b>Features:</b> Cielo, Temperatura, Humedad, Viento
    <li><b>Label:</b> Jugar_tenis (La salida que queremos predecir)
    <li><b>Class:</b> +, -
</ul>

In [156]:
import pandas as pd 
import numpy as np 

In [157]:

df = pd.read_csv("PlayTennis.csv")

df

Unnamed: 0,Cielo,Temperatura,Humedad,Viento,Jugar_tenis
0,Sol,Alta,Alta,Debil,-
1,Sol,Alta,Alta,Fuerte,-
2,Nubes,Alta,Alta,Debil,+
3,Lluvia,Suave,Alta,Debil,+
4,Lluvia,Baja,Normal,Debil,+
5,Lluvia,Baja,Normal,Fuerte,-
6,Nubes,Baja,Normal,Fuerte,+
7,Sol,Suave,Alta,Debil,-
8,Sol,Baja,Normal,Debil,+
9,Lluvia,Suave,Normal,Debil,+


<h3>FIND ENTROPY</h3>

In [158]:
print("target-->", df.keys()[-1])
print("values-->", df['Jugar_tenis'].unique()) 
print("ej fraction-->",df['Jugar_tenis'].value_counts()['-']/len(df['Jugar_tenis'])) # num(-)/total('-'+'+')

target--> Jugar_tenis
values--> ['-' '+']
ej fraction--> 0.35714285714285715


In [159]:
def find_entropy(df):
    #target column
    target = df.keys()[-1] #la ultima columna es la que queremos predecir
    entropy = 0
    values = df[target].unique()
    #calc entropy
    for value in values:
        fraction = df[target].value_counts()[value]/len(df[target])
        entropy += -fraction*np.log2(fraction)
    return entropy

<h3>AVERAGE INFORMATION</h3>

In [160]:
eps = 1e-5  # Define a small positive value

#df: A DataFrame containing the dataset.
#attribute: A list of columns from the DataFrame df.
def average_information(df,attribute): #attribute = [columns]
  target = df.keys()[-1]   #target column
  target_variables = df[target].unique()  #This gives all 'Yes' and 'No'
  variables = df[attribute].unique()    #This gives different features in that attribute (like 'Hot','Cold' in attribute=Temperature)
  entropy2 = 0
  
  #The outer loop iterates over the unique values in the specified columns (variable represents a specific feature value within those columns).
  for variable in variables: #iterates in ['Hot', 'Cold'] for example
      entropy = 0
      #The inner loop iterates over the unique target variables (e.g., '+', '-').
      for target_variable in target_variables: 
          #num: The count of rows where the specified attribute (e.g., 'Hot') is equal to variable and the target variable matches target_variable (e.g., 'Hot' and '+').
          num = len(df[attribute][df[attribute]==variable][df[target] ==target_variable])
          #den: The count of rows where the specified attribute is equal to variable.
          den = len(df[attribute][df[attribute]==variable])
          fraction = num/(den+eps)
          entropy += -fraction*np.log(fraction+eps)
      fraction2 = den/len(df)
      entropy2 += -fraction2*entropy
  return abs(entropy2)

In [161]:
target = df.keys()[-1]
print("target_variables", df[target].unique())

target_variables ['-' '+']


In [162]:
def average_information(df, attribute):
    target = df.keys()[-1]   # Target column
    target_variables = df[target].unique()  # Unique target values ['-', '+']
    variables = df[attribute].unique()    # This gives different features in that attribute (like 'Hot','Cold' in attribute=Temperature)
    entropy2 = 0
    
    #The outer loop iterates over the unique values in the specified columns (variable represents a specific feature value within those columns).
    for variable in variables: #iterates in ['Hot', 'Cold'] for example
        entropy = 0
        
        #The inner loop iterates over the unique target variables (e.g., '+', '-').
        for target_variable in target_variables: 
            
            # Count the number of rows where the attribute matches 'variable' and the target matches 'target_variable'
            matching_rows = df[(df[attribute] == variable) & (df[target] == target_variable)]
            
            #print("\ntarget_variable:",target_variable, "\nattribute=", attribute, "\nvariable=", variable, '\nmatching_rows:\n',matching_rows, '\n \n \n')
            
            
            # Calculate 'num' as the count of matching rows
            num = len(matching_rows)
            
            # Count the total number of rows where the attribute matches 'variable'
            total_matching_rows = len(df[df[attribute] == variable])
            
            # Calculate 'den' as the count of total matching rows
            den = total_matching_rows
            
            # Calculate 'fraction' with 'eps' to avoid division by zero
            eps = 1e-5  # Small positive value to avoid division by zero
            fraction = num / (den + eps)
            
            # Calculate entropy contribution for this combination
            entropy += -fraction * np.log2(fraction + eps)

            print("target_variable:",target_variable, " / attribute=", attribute, " / variable=", variable, " / num=", num, " / den=", den,
                  "fraction=", fraction, "entropy", entropy, '\nmatching_rows:\n',matching_rows, '\n')
            
        fraction2 = den / len(df)
        
        # Accumulate the entropy contributions for different 'variable' values
        entropy2 += -fraction2 * entropy
    
    # Return the absolute value of 'entropy2' as the result
    return abs(entropy2)


In [171]:
df.keys()[:-1]

Index(['Cielo', 'Temperatura', 'Humedad', 'Viento'], dtype='object')

In [163]:
def find_winner(df):
    IG = []
    for key in df.keys()[:-1]: # ['Cielo', 'Temperatura', 'Humedad', 'Viento']
        IG.append(find_entropy(df)-average_information(df,key)) # prueba average_information para cada columna 
    return df.keys()[:-1][np.argmax(IG)]

In [164]:
def get_subtable(df, node,value):
  return df[df[node] == value].reset_index(drop=True)

In [165]:
def buildTree(df,tree=None): 
    target = df.keys()[-1]   #target column
    
    #Here we build our decision tree

    #Get attribute with maximum information gain
    node = find_winner(df)
    
    #Get distinct value of that attribute e.g Salary is node and Low,Med and High are values
    attValue = np.unique(df[node])
    
    #Create an empty dictionary to create tree    
    if tree is None:                    
        tree={}
        tree[node] = {}
    
    #We make loop to construct a tree by calling this function recursively. 
    #In this we check if the subset is pure and stops if it is pure. 

    for value in attValue:
        
        subtable = get_subtable(df,node,value)
        clValue,counts = np.unique(subtable[target],return_counts=True)                        
        
        if len(counts)==1:#Checking purity of subset
            tree[node][value] = clValue[0]                                                    
        else:        
            tree[node][value] = buildTree(subtable) #Calling the function recursively 
                   
    return tree

In [166]:
#build Tree
tree = buildTree(df)

target_variable: -  / attribute= Cielo  / variable= Sol  / num= 3  / den= 5 fraction= 0.5999988000024 entropy 0.44216577654144945 
matching_rows:
   Cielo Temperatura Humedad  Viento Jugar_tenis
0   Sol        Alta    Alta   Debil           -
1   Sol        Alta    Alta  Fuerte           -
7   Sol       Suave    Alta   Debil           - 

target_variable: +  / attribute= Cielo  / variable= Sol  / num= 2  / den= 5 fraction= 0.39999920000160005 entropy 0.970922684338529 
matching_rows:
    Cielo Temperatura Humedad  Viento Jugar_tenis
8    Sol        Baja  Normal   Debil           +
10   Sol       Suave  Normal  Fuerte           + 

target_variable: -  / attribute= Cielo  / variable= Nubes  / num= 0  / den= 4 fraction= 0.0 entropy 0.0 
matching_rows:
 Empty DataFrame
Columns: [Cielo, Temperatura, Humedad, Viento, Jugar_tenis]
Index: [] 

target_variable: +  / attribute= Cielo  / variable= Nubes  / num= 4  / den= 4 fraction= 0.9999975000062501 entropy -1.0820154197534596e-05 
matching_row

In [167]:
import pprint
pprint.pprint(tree)

{'Cielo': {'Lluvia': {'Viento': {'Debil': '+', 'Fuerte': '-'}},
           'Nubes': '+',
           'Sol': {'Humedad': {'Alta': '-', 'Normal': '+'}}}}


=================================================

In [168]:

df_1 = pd.read_csv("Figuras.csv")

df_1

Unnamed: 0,Color,Forma,Tamanio,Clase
0,Rojo,Cuadrado,Grande,+
1,Azul,Cuadrado,Grande,+
2,Rojo,Redondo,Pequenio,-
3,Verde,Cuadrado,Pequenio,-
4,Rojo,Redondo,Grande,+
5,Verde,Cuadrado,Grande,-


In [169]:
#build Tree
arbol = buildTree(df_1)

target_variable: +  / attribute= Color  / variable= Rojo  / num= 2  / den= 3 fraction= 0.6666644444518518 entropy 0.389962479699178 
matching_rows:
   Color     Forma Tamanio Clase
0  Rojo  Cuadrado  Grande     +
4  Rojo   Redondo  Grande     + 

target_variable: -  / attribute= Color  / variable= Rojo  / num= 1  / den= 3 fraction= 0.3333322222259259 entropy 0.9182687284617883 
matching_rows:
   Color    Forma   Tamanio Clase
2  Rojo  Redondo  Pequenio     - 

target_variable: +  / attribute= Color  / variable= Azul  / num= 1  / den= 1 fraction= 0.9999900000999989 entropy -1.44266471640627e-10 
matching_rows:
   Color     Forma Tamanio Clase
1  Azul  Cuadrado  Grande     + 

target_variable: -  / attribute= Color  / variable= Azul  / num= 0  / den= 1 fraction= 0.0 entropy -1.44266471640627e-10 
matching_rows:
 Empty DataFrame
Columns: [Color, Forma, Tamanio, Clase]
Index: [] 

target_variable: +  / attribute= Color  / variable= Verde  / num= 0  / den= 2 fraction= 0.0 entropy 0.0 
match

In [170]:
import pprint
pprint.pprint(arbol)

{'Color': {'Azul': '+',
           'Rojo': {'Tamanio': {'Grande': '+', 'Pequenio': '-'}},
           'Verde': '-'}}
