# Tarea 8: Random Forest

### Equipo 11  
<ul>
  <li>Guillermo Arredondo</li>
  <li>Iñaki Fernandez</li>
  <li>Mauricio Vazquez</li>
</ul>

In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split


def unique_vals(df, col):
    return df[col].unique()

def mean(rows):
    return np.mean(rows.iloc[:, -1])

def mse(rows):
    mean_value = mean(rows)
    return np.mean((rows.iloc[:, -1] - mean_value) ** 2)

def partition(df, question):
    mask = question.match(df)
    return df[mask], df[~mask]

class Question:
    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, df):
        val = df[self.column]
        if is_numeric(self.value):
            return val >= self.value
        else:
            return val == self.value

    def __repr__(self):
        condition = "=="
        if is_numeric(self.value):
            condition = ">="
        return "Is %s %s %s?" % (
            self.column, condition, str(self.value))

def is_numeric(value):
    return isinstance(value, (int, float))

def info_gain(left, right, current_mse):
    p = float(len(left)) / (len(left) + len(right))
    return current_mse - p * mse(left) - (1 - p) * mse(right)

def find_best_split(df):
    best_gain = 0
    best_question = None
    current_mse = mse(df)
    n_features = df.shape[1] - 1

    for col in df.columns[:-1]:
        values = unique_vals(df, col)
        for val in values:
            question = Question(col, val)
            true_df, false_df = partition(df, question)

            if len(true_df) == 0 or len(false_df) == 0:
                continue

            gain = info_gain(true_df, false_df, current_mse)

            if gain >= best_gain:
                best_gain, best_question = gain, question

    return best_gain, best_question

class Leaf:
    def __init__(self, df):
        self.prediction = mean(df)

class Decision_Node:
    def __init__(self, question, true_branch, false_branch):
        self.question = question
        self.true_branch = true_branch
        self.false_branch = false_branch

def build_tree(df):
    gain, question = find_best_split(df)

    if gain == 0:
        return Leaf(df)

    true_df, false_df = partition(df, question)
    true_branch = build_tree(true_df)
    false_branch = build_tree(false_df)

    return Decision_Node(question, true_branch, false_branch)

def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + "Predict", node.prediction)
        return

    print(spacing + str(node.question))

    print(spacing + '--> True:')
    print_tree(node.true_branch, spacing + "  ")

    print(spacing + '--> False:')
    print_tree(node.false_branch, spacing + "  ")

def classify(row, node):
    if isinstance(node, Leaf):
        return node.prediction

    if node.question.match(row):
        return classify(row, node.true_branch)
    else:
        return classify(row, node.false_branch)


In [5]:
hitters_df=pd.read_csv("dataset-87300.csv")

hitters_df.dropna(inplace=True)
hitters_df = pd.get_dummies(hitters_df, drop_first=True)

train_df, test_df = train_test_split(hitters_df, test_size=0.2, random_state=42)

In [6]:
hitters_df

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,Salary,League_N,Division_W,NewLeague_N
-Alan Ashby,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,475.0,1,1,1
-Alvin Davis,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,480.0,0,1,0
-Andre Dawson,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,500.0,1,0,1
-Andres Galarraga,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,91.5,1,0,1
-Alfredo Griffin,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,750.0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
-Willie McGee,497,127,7,65,48,37,5,2703,806,32,379,311,138,325,9,3,700.0,1,0,1
-Willie Randolph,492,136,5,76,50,94,12,5511,1511,39,897,451,875,313,381,20,875.0,0,0,0
-Wayne Tolleson,475,126,3,61,43,52,6,1700,433,7,217,93,146,37,113,7,385.0,0,1,0
-Willie Upshaw,573,144,9,85,60,78,8,3198,857,97,470,420,332,1314,131,12,960.0,0,0,0


In [7]:
tree = build_tree(train_df)


In [None]:
print_tree(tree)

Is League_N == 0?
--> True:
  Is AtBat == 490?
  --> True:
    Predict 1.0
  --> False:
    Is CHmRun == 2?
    --> True:
      Is Division_W == 0?
      --> True:
        Predict 0.0
      --> False:
        Predict 1.0
    --> False:
      Is Assists == 48?
      --> True:
        Predict 1.0
      --> False:
        Is Assists == 286?
        --> True:
          Predict 1.0
        --> False:
          Is Assists == 22?
          --> True:
            Predict 1.0
          --> False:
            Is PutOuts == 686?
            --> True:
              Predict 1.0
            --> False:
              Predict 0.0
--> False:
  Is HmRun == 11?
  --> True:
    Is Division_W == 1?
    --> True:
      Predict 1.0
    --> False:
      Predict 0.0
  --> False:
    Is Errors == 14?
    --> True:
      Predict 0.0
    --> False:
      Is CWalks == 117?
      --> True:
        Predict 0.0
      --> False:
        Is Assists == 70?
        --> True:
          Predict 0.0
        --> False:
       

In [None]:
def test_tree(tree, test_df):
    predictions = []
    actual_values = test_df.iloc[:, -1].values
    
    for _, row in test_df.iterrows():
        prediction = classify(row, tree)
        predictions.append(prediction)
    
    mse = np.mean((actual_values - predictions) ** 2)
    return mse


In [None]:
test_tree(tree,test_df)