Mount Google Drive (optional)

In [None]:
# Mount Google Drive under /content/drive (this import only exists on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

import os
# Uncomment and fill in the folder that holds the assignment files to work from there.
# os.chdir("/content/drive/MyDrive/....")  # file path
# Show the current working directory (relative CSV paths below are resolved against it).
print(os.getcwd())

# **HW2 : Decision Tree and Random Forest**
In *assignment 2*, you need to finish :

1. Basic Part : Implement a **Decision Tree** model and predict whether the patients in the validation set have diabetes
> * Step 1 : Load the input data
> * Step 2 : Calculate the Entropy and Information Gain
> * Step 3 : Find the Best Split
> * Step 4 : Split into 2 branches
> * Step 5 : Build decision tree
> * Step 6 : Save the answers from step2 to step5
> * Step 7 : Split data into training set and validation set
> * Step 8 : Train a decision tree model with training set
> * Step 9 : Predict the cases in the *validation set* by using the model trained in *Step8*
> * Step 10 : Calculate the f1-score of your predictions in *Step9*
> * Step 11 : Write the Output File

2. Advanced Part : Build a **Random Forest** model to make predictions
> * Step 1 : Load the input data
> * Step 2 : Load the test data
> * Step 3 : Build a random forest
> * Step 4 : Predict the cases in the test data by using the model trained in *Step3*
> * Step 5 : Save the predictions(from *Step 4*) in a csv file



# **Basic Part** (60%)
In this part, you need to implement a Decision Tree model by completing the following given functions.

Also, you need to run these functions with the given input variables and save the output in a csv file **hw2_basic.csv**

## Import Packages


> Note : You **cannot** import any other packages in both basic part and advanced part






In [118]:
import numpy as np
import pandas as pd
import math
import random
from numpy import sqrt
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

## Step1: Load the input data
First, load the input file **hw2_input_basic.csv**

In [None]:
# Read the basic-part data set into a DataFrame (path is relative to the working directory).
input_data = pd.read_csv('hw2_input_basic.csv')
# Bare expression: the notebook displays the DataFrame as the cell output.
input_data

## Global attributes
Define the global attributes
> Note : You **cannot** modify the values of the attributes we have given in the basic part

In [120]:
# Settings handed to build_tree() in the basic part.
max_depth = 2  # depth limit passed to build_tree()
depth = 0  # starting depth passed to build_tree()
min_samples_split = 2  # passed to build_tree() as min_samples_split
# Number of columns minus one (presumably every column except the label -- confirm against the CSV).
n_features = input_data.shape[1] - 1

> You can add your own global attributes here

## Step2 : Calculate the Entropy and Information Gain 
Calculate the entropy and information gain values before separating the data into the left subtree and the right subtree

In [121]:
def entropy(data):
  """
  This function measures the amount of uncertainty in a probability distribution
  (binary entropy, in bits, of the 'diabetes_mellitus' label column)
  args: 
  * data(type: DataFrame): the data you're calculating for the entropy;
    must contain a 0/1 column named 'diabetes_mellitus'
  return:
  * entropy_value(type: float): the data's entropy; 0.0 when the data is empty
    or when every row carries the same label
  """
  # Work on the raw values so the result does not depend on the DataFrame's
  # index labels (slices such as data.iloc[10:] keep their original labels).
  labels = data.loc[:, 'diabetes_mellitus'].to_numpy()
  if labels.size == 0:
    # No samples means no uncertainty; this also avoids dividing by zero.
    return 0.0

  # Fraction of positive (label 1) samples.
  prob = labels.sum() / labels.size
  if prob == 0 or prob == 1:
    # A pure node: log2(0) is undefined, and the entropy is 0 by convention.
    return 0.0

  entropy_value = -1 * (np.log2(prob)*prob) - (np.log2(1-prob)*(1-prob))
  return entropy_value

# [Note] You have to save the value of "ans_entropy" into the output file
# Entropy of the 'diabetes_mellitus' labels over the whole basic input.
ans_entropy = entropy(input_data)
print("ans_entropy = ", ans_entropy)

ans_entropy =  0.9871377743721863


In [122]:
def _binary_entropy(prob):
  """
  Entropy (in bits) of a two-class distribution whose positive-class
  probability is prob; 0 for a pure distribution (same formula as entropy()).
  """
  if prob == 0 or prob == 1:
    return 0
  return -1 * (np.log2(prob)*prob) - (np.log2(1-prob)*(1-prob))


def information_gain(data, mask):
  """
  This function will calculate the information gain
  args:
  * data(type: DataFrame): the data you're calculating for the information gain;
    must contain a 0/1 column named 'diabetes_mellitus'
  * mask(type: Series, list or array): partition information(left/right) of current
    input data, one entry per row of data (matched by position, not by index label)
    - boolean 1(True) represents split to left subtree
    - boolean 0(False) represents split to right subtree
  return:
  * ig(type: float): the information gain you can obtain by classify data with this given mask;
    0 when the mask sends every row to the same side
  raises:
  * ValueError: when mask and data do not have the same number of rows
  """
  # Positional arrays: the result must not depend on index labels.
  labels = data.loc[:, 'diabetes_mellitus'].to_numpy()
  goes_left = np.asarray(mask) == 1
  if goes_left.shape[0] != labels.shape[0]:
    raise ValueError("mask must contain exactly one entry per row of data")

  total = goes_left.shape[0]
  total_left = int(goes_left.sum())
  total_right = total - total_left
  if total_left == 0 or total_right == 0:
    # Nothing was separated, so nothing was gained.
    return 0

  # Fraction of positive labels on each side of the split.
  p_left = labels[goes_left].sum() / total_left
  p_right = labels[~goes_left].sum() / total_right

  before_e = _binary_entropy(labels.sum() / total)
  # Entropy after the split: child entropies weighted by the share of rows they receive.
  after_left = (total_left/total) * _binary_entropy(p_left)
  after_right = (total_right/total) * _binary_entropy(p_right)
  after_e = after_left + after_right
  ig = before_e - after_e

  return ig

# [Note] You have to save the value of "ans_informationGain" into your output file
# Build a fixed test mask: the first quarter of the rows is False (right subtree),
# all remaining rows are True (left subtree).
temp1 = np.zeros((int(input_data.shape[0]/4), 1), dtype=bool)
temp2 = np.ones(((input_data.shape[0]-int(input_data.shape[0]/4), 1)), dtype=bool)
temp_mask = np.concatenate((temp1, temp2))
# Wrap the (n, 1) array in a DataFrame so it can be passed as a Series.
df_mask = pd.DataFrame(temp_mask, columns=['mask'])
ans_informationGain = information_gain(input_data, df_mask['mask'])
print("ans_informationGain = ", ans_informationGain)

ans_informationGain =  0.0834598868480716


## Step3 : Find the Best Split
Find the best split combination, **feature** and **threshold**, by calculating the information gain


In [198]:
def find_best_split(data):
  """
  This function will find the best split combination of data
  Every column except 'diabetes_mellitus' is tried as a feature; for each feature the
  candidate thresholds are the midpoints between neighbouring sorted values.
  args:
  * data(type: DataFrame): the input data (numeric features plus the 'diabetes_mellitus' label)
  return
  * best_ig(type: float): the best information gain you obtain
  * best_threshold(type: float): the value that splits data into 2 branches
  * best_feature(type: string): the feature that splits data into 2 branches
  When no split has a positive information gain, (0, 0, 'age') is returned;
  callers must check best_ig > 0 before using the threshold/feature.
  """
  # Renumber the rows 0..n-1 so the positional masks built below line up with data.
  data = data.reset_index(drop=True)
  best_ig = 0
  best_threshold = 0
  best_feature = ""

  for position, feature_name in enumerate(data.columns):
    if feature_name == 'diabetes_mellitus':
      # The label itself is never a candidate feature.
      continue
    values = data.iloc[:, position].to_numpy()
    ordered = np.sort(values)
    # Midpoints of neighbouring sorted values, in ascending order. Repeated
    # midpoints produce the same mask, so each one is evaluated only once;
    # because only a strictly larger gain replaces the current best, this
    # selects the same split as trying every repeat.
    candidates = np.unique((ordered[:-1] + ordered[1:]) / 2)
    for current_threshold in candidates:
      # True -> left subtree (value <= threshold), False -> right subtree.
      mask = values <= current_threshold
      current_ig = information_gain(data, mask)
      if current_ig > best_ig:
        best_ig = current_ig
        best_threshold = current_threshold
        best_feature = feature_name

  if best_ig == 0:
    # Fallback kept from the original implementation for backward compatibility.
    return 0, 0, 'age'
  return best_ig, best_threshold, best_feature


# [Note] You have to save the value of "ans_ig", "ans_value", and "ans_name" into the output file
# find_best_split() returns (information gain, threshold, feature name), in that order.
ans_ig, ans_value, ans_name = find_best_split(input_data)
print("ans_ig = ", ans_ig)
print("ans_value = ", ans_value)
print("ans_name = ", ans_name)

ans_ig =  0.3522950515812332
ans_value =  235.5
ans_name =  glucose_apache


## Step4 : Split into 2 branches
Using the best split combination you find in function *find_best_split()* to split data into Left Subtree and Right Subtree 

In [92]:
def make_partition(data, feature, threshold):
  """
  This function will split the data into 2 branches
  args:
  * data(type: DataFrame): the input data (left unmodified)
  * feature(type: string): the attribute(column name)
  * threshold(type: float): the threshold for splitting the data
  return:
  * left(type: DataFrame): the divided data that matches(less than or equal to) the assigned feature's threshold
  * right(type: DataFrame): the divided data that doesn't match the assigned feature's threshold
  Both parts are sorted by the feature and re-indexed 0..n-1.
  """
  # One vectorised comparison instead of one DataFrame.drop() per row. The mask is
  # positional, so data may carry any index labels.
  goes_right = (data.loc[:, feature] > threshold).to_numpy()
  # Rows that are not strictly greater than the threshold (missing values
  # included, since NaN > threshold is False) stay on the left.
  left = data[~goes_right].sort_values(feature, ignore_index = True)
  right = data[goes_right].sort_values(feature, ignore_index = True)
  return left, right


# [Note] You have to save the value of "ans_left" into the output file
# Split the basic input on age <= 61.0 and record how many rows land in the left part.
left, right = make_partition(input_data, 'age', 61.0)
ans_left = left.shape[0]
print("ans_left = ", ans_left)

ans_left =  10


## Step5 : Build Decision Tree
Use the above functions to implement the decision tree

Instructions: 
1.  If current depth < max_depth and the remaining number of samples > min_samples_split: continue to classify those samples
2.  Use function *find_best_split()* to find the best split combination
3.  If the obtained information gain is **greater than 0**: can build a deeper decision tree (add depth)
4. Use function *make_partition()* to split the data into two parts
5. Save the features and corresponding thresholds (starting from the root) used by the decision tree into *ans_features[]* and *ans_thresholds[]* respectively




In [93]:
def _majority_label(data):
  """
  Return the majority 'diabetes_mellitus' label of data (1 on a tie, 0 for empty data).
  """
  # Raw values: independent of the DataFrame's index labels.
  labels = data.loc[:, 'diabetes_mellitus'].to_numpy()
  if labels.size == 0:
    return 0
  return 1 if labels.mean() >= 0.5 else 0


def build_tree(data, max_depth, min_samples_split, depth):
  """
  This function will build the decision tree
  args:
  * data(type: DataFrame): the data you want to apply to the decision tree
  * max_depth: the maximum depth of a decision tree
  * min_samples_split: the minimum number of instances required to do partition
  * depth: the height of the current decision tree
  return:
  * subtree: the decision tree structure including root, branch, and leaf (with the attributes and thresholds).
    An inner node is {"<feature> <= <threshold>": [left_subtree, right_subtree]};
    a leaf is the plain label 0 or 1 (so the whole tree may be a single label).
  """
  # Stop at the depth limit, or when too few samples remain to split.
  # NOTE(review): "at least min_samples_split rows" follows the docstring above and
  # leaves min_samples_split=2 results unchanged; switch to ">" if the stricter
  # wording of the assignment text is intended.
  if depth >= max_depth or len(data) < min_samples_split:
    return _majority_label(data)

  # call find_best_split() to find the best combination
  ig, threshold, feature = find_best_split(data)
  # Without a positive information gain, splitting cannot help: make a leaf.
  if not ig > 0:
    return _majority_label(data)

  # call make_partition() to split the data into two parts
  left, right = make_partition(data, feature, threshold)
  # If there is no data split to the left tree OR no data split to the right tree
  if left.shape[0] == 0 or right.shape[0] == 0:
    # return the label of the majority
    return _majority_label(data)

  question = "{} {} {}".format(feature, "<=", threshold)
  # call function build_tree() to recursively build the left subtree and right subtree
  left_subtree = build_tree(left, max_depth, min_samples_split, depth + 1)
  right_subtree = build_tree(right, max_depth, min_samples_split, depth + 1)
  if left_subtree == right_subtree:
    # Both branches give the same answer, so the question is redundant.
    return left_subtree
  return {question: [left_subtree, right_subtree]}

An example of the output from *build_tree()* 
```
{'bmi <= 33.5': [1, {'age <= 68.5': [0, 1]}]}
```
Therefore, 
```
ans_features = ['bmi', 'age']
ans_thresholds = [33.5, 68.5]
```



In [94]:
# Containers for the features / thresholds used by the tree (filled in the next cells).
ans_features = []
ans_thresholds = []

# Build the basic-part tree on the whole input with the global settings defined above.
decisionTree = build_tree(input_data, max_depth, min_samples_split, depth)
# Bare expression: the notebook displays the tree as the cell output.
decisionTree


{'glucose_apache <= 235.5': [{'heart_rate_apache <= 143.5': [0, 1]}, 1]}

In [95]:
# [Note] You have to save the features in the "decisionTree" structure (from root to branch and leaf) into the output file
# Read the features from the tree itself instead of retyping them from the printed output.
# Pre-order walk: a node is visited before its left ([0]) and then its right ([1]) branch;
# leaves are plain labels (not dicts) and are skipped.
pending = [decisionTree]
while pending:
  node = pending.pop()
  if isinstance(node, dict):
    question = list(node.keys())[0]
    # A question looks like "<feature> <= <threshold>".
    ans_features.append(question.split()[0])
    # Push right first so that the left branch is popped (visited) first.
    pending.extend(reversed(node[question]))

In [96]:
# [Note] You have to save the corresponding thresholds for the features in the "ans_features" list into the output file
# Read the thresholds from the tree itself, in the same pre-order (node, left, right)
# in which the features are listed; leaves are plain labels (not dicts) and are skipped.
pending = [decisionTree]
while pending:
  node = pending.pop()
  if isinstance(node, dict):
    question = list(node.keys())[0]
    # A question looks like "<feature> <= <threshold>".
    ans_thresholds.append(float(question.split()[2]))
    # Push right first so that the left branch is popped (visited) first.
    pending.extend(reversed(node[question]))

## Step6 : Save answers

In [97]:
# Answers of step 2 to step 5, in the order expected in the output file.
basic = [
  ans_entropy,
  ans_informationGain,
  ans_ig,
  ans_value,
  ans_name,
  ans_left,
]
# All tree features first, then all of their thresholds.
basic.extend(ans_features)
basic.extend(ans_thresholds)

## Step7 : Split data
Split data into training set and validation set
> Note: We have split the data into training set and validation. You **cannot** change the distribution of the data.

In [98]:
# Fixed split given by the assignment: the first 20 rows train, the last 10 rows validate.
num_train = 20
num_validation = 10

training_data = input_data.iloc[:num_train]
validation_data = input_data.iloc[-num_validation:]

# Separate the label column from the feature columns.
y_train = training_data[["diabetes_mellitus"]]
x_train = training_data.drop(['diabetes_mellitus'], axis=1)
y_validation = validation_data[["diabetes_mellitus"]]
x_validation = validation_data.drop(['diabetes_mellitus'], axis=1)
# Turn the one-column DataFrame into a flat 1-D array of labels.
y_validation = y_validation.values.flatten()

print(input_data.shape)
print(training_data.shape)
print(validation_data.shape)

(30, 10)
(20, 10)
(10, 10)


## Step8 to Step10 : Make predictions with a decision tree

Define the attributes of the decision tree
> You **cannot** modify the values of these attributes in this part

In [99]:
# Settings handed to build_tree() when training on the training set.
max_depth = 2  # depth limit passed to build_tree()
depth = 0  # starting depth passed to build_tree()
min_samples_split = 2  # passed to build_tree() as min_samples_split
n_features = x_train.shape[1]  # number of columns left after dropping the label

We have finished the function '*classify_data()*' below; however, you can modify this function if you prefer to complete it in your own way.

In [100]:
def classify_data(instance, tree):
  """
  Classify one instance by walking down the decision tree.
  args:
  * instance: the case to be predicted; it is indexed by feature name
  * tree: the decision tree, a dict of the form {"<feature> <op> <value>": [branch_0, branch_1]}
  return:
  * answer: the prediction result (the label stored in the leaf that is reached)
  """
  equation = list(tree)[0]
  parts = equation.split()

  if parts[1] == '<=':
    # Numeric question: branch 1 only when the value is strictly above the threshold.
    take_first = not (float(instance[parts[0]]) > float(parts[2]))
  else:
    # Any other operator: branch 0 when the value is contained in the right-hand text.
    take_first = instance[parts[0]] in (parts[2])

  answer = tree[equation][0 if take_first else 1]

  # A dict is another question (keep descending); anything else is the leaf label.
  if isinstance(answer, dict):
    return classify_data(instance, answer)
  return answer


def make_prediction(tree, data):
  """
  This function will use your pre-trained decision tree to predict the labels of all instances in data
  args:
  * tree: the decision tree; either a nested dict of questions or, when the tree
    consists of a single leaf, the bare label itself
  * data: the data to predict (DataFrame, one instance per row)
  return:
  * y_prediction: the predictions (list with one label per row of data)
  """
  # build_tree() returns a plain label when it could not (or need not) split at
  # the root; every instance then receives that label.
  if not isinstance(tree, dict):
    return [tree] * data.shape[0]

  # [Note] You can call the function classify_data() to predict the label of each instance
  # Each row is passed as a Series so that instance[feature] is a single value
  # (float() on a one-row DataFrame column relies on deprecated pandas behaviour).
  y_prediction = [classify_data(row, tree) for _, row in data.iterrows()]
  return y_prediction


def calculate_score(y_true, y_pred):
  """
  This function will calculate the f1-score of the predictions
  (label 1 is the positive class)
  args:
  * y_true: the ground truth (sequence of 0/1 labels)
  * y_pred: the predictions (sequence of 0/1 labels, compared position by position)
  return:
  * score: the f1-score; 0.0 when there is no true positive, i.e. when
    precision or recall would be zero or undefined
  """
  tp = 0
  fp = 0
  fn = 0
  for truth, guess in zip(y_true, y_pred):
    if truth == 1 and guess == 1:
      tp += 1
    elif truth == 1 and guess == 0:
      fn += 1
    elif truth == 0 and guess == 1:
      fp += 1
    # everything else is a true negative, which the f1-score does not use

  if tp == 0:
    # Without a true positive the formulas below divide by zero (or give 0/0).
    return 0.0

  precision = tp/(tp+fp)
  recall = tp/(tp+fn)
  score = 2*(precision*recall)/(precision+recall)
  return score


In [101]:
# Train on the training set, predict the validation set, then score the predictions.
decision_tree = build_tree(training_data, max_depth, min_samples_split, depth)
y_pred = make_prediction(decision_tree, x_validation)
# [Note] You have to save the value of "ans_f1score" into your output file
ans_f1score = calculate_score(y_validation, y_pred)
print("ans_f1score = ", ans_f1score)

ans_f1score =  0.6666666666666666


## Step11 : Write the Output File
Save all of your answers in a csv file, named as **hw2_basic.csv**

In [102]:
ans_path = 'hw2_basic.csv'

# [Note] You have to save the value of "ans_f1score" into the output file
basic.append(ans_f1score)
print(basic)

# One answer per line, written without a header row and without an index column.
pd.DataFrame(basic).to_csv(ans_path, header = None, index = None)

[0.9871377743721863, 0.0834598868480716, 0.3522950515812332, 235.5, 'glucose_apache', 10, 'glucose_apache', 'heart_rate_apache', 235.5, 143.5, 0.6666666666666666]


# **Advanced Part** (35%)

## Step1: Load the input data
First, load the input file **hw2_input_advanced.csv**

In [204]:
# Read the advanced-part data set (path is relative to the working directory).
advanced_data = pd.read_csv('hw2_input_advanced.csv')

You can split *advanced_data* into a training set and a validation set

In [228]:
# Hold-out split: the first 70% of the rows train the forest, the rest validate it.
total_data = len(advanced_data)
num_train = int(0.7 * total_data)
num_validation = total_data - num_train
print(num_train, num_validation, sep="\n")
training_data = advanced_data.iloc[:num_train]
validation_data = advanced_data.iloc[-num_validation:]

# Features and labels of the training part.
x_train = training_data.drop(columns=['diabetes_mellitus'])
y_train = training_data[["diabetes_mellitus"]]
# Features and labels of the validation part (labels as a flat 1-D array).
x_validation = validation_data.drop(columns=['diabetes_mellitus'])
y_validation = validation_data[["diabetes_mellitus"]].values.flatten()
print(training_data.shape)


5865
2514
(5865, 25)


## Step2 : Load the test data
Load the input file **hw2_input_test.csv** to make predictions with the pre-trained random forest model

In [229]:
# Read the test cases that the random forest has to label.
x_test = pd.read_csv('hw2_input_test.csv')
# Bare expression: the notebook displays the DataFrame as the cell output.
x_test

Unnamed: 0,age,bmi,gender,height,weight,arf_apache,bun_apache,creatinine_apache,gcs_eyes_apache,gcs_motor_apache,...,hematocrit_apache,intubated_apache,map_apache,resprate_apache,sodium_apache,temp_apache,ventilated_apache,wbc_apache,apache_4a_hospital_death_prob,apache_4a_icu_death_prob
0,62,32.866392,1,177.80,103.9,1,31.0,10.30,4,6,...,36.4,0,157,26,134,36.1,0,4.56,0.06,0.03
1,82,23.582766,0,157.50,58.5,0,26.0,0.54,3,4,...,32.8,0,42,25,142,36.1,0,6.00,0.14,0.06
2,61,31.684520,1,172.70,94.5,0,16.0,1.11,4,6,...,35.3,0,129,6,131,36.8,0,8.59,0.05,0.03
3,58,45.156250,0,160.00,115.6,0,19.0,0.70,1,4,...,30.1,1,131,23,138,34.9,1,16.03,0.33,0.22
4,74,25.817016,1,172.70,77.0,0,25.0,0.93,4,6,...,34.5,0,55,12,135,36.3,0,45.80,0.12,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
835,73,17.943584,0,157.48,44.5,0,12.0,0.30,4,6,...,33.8,0,129,9,144,36.9,0,7.70,0.02,0.01
836,79,29.049732,1,167.60,81.6,0,48.0,2.19,4,6,...,42.7,0,163,9,139,36.4,0,10.77,0.06,0.03
837,85,24.627827,0,152.40,57.2,0,11.0,0.48,3,5,...,29.5,0,67,9,139,36.6,0,7.35,0.16,0.05
838,68,32.510940,1,193.00,121.1,0,14.0,0.64,4,6,...,37.5,0,61,10,140,36.9,1,16.02,0.00,0.00


## Step3 : Build a Random Forest

Define the attributes of the random forest
> * You **can** modify the values of these attributes in advanced part
> * Each tree can have different attribute values
> * There must be **at least** 3 decision trees in the random forest model
> * Must use function *build_tree()* to build a random forest model
> * These are the parameters you can adjust : 


    ```
    max_depth = 
    depth = 0
    min_samples_split = 
    
    # total number of trees in a random forest
    n_trees = 

    # number of features to train a decision tree
    n_features = 

    # the ratio to select the number of instances
    sample_size = 
    n_samples = int(training_data.shape[0] * sample_size)
    ```




In [268]:
# Define the attributes
max_depth=5
depth=0
min_samples_split=2

n_trees=12
n_features=12
sample_size=0.08
n_samples = int(training_data.shape[0] * sample_size)
print(n_samples)

469


In [269]:
def build_forest(data, n_trees, n_features, n_samples):
  """
  This function will build a random forest.
  Every tree is trained on its own window of consecutive rows and its own subset of
  feature columns; both are chosen deterministically from the tree's position.
  The per-tree settings max_depth, min_samples_split and depth are read from the
  notebook-level globals of the same names.
  args:
  * data: all data that can be used to train a random forest
    (feature columns followed by the 'diabetes_mellitus' label as the last column)
  * n_trees: total number of tree
  * n_features: number of features
  * n_samples: number of instances
  return:
  * forest: a random forest with 'n_trees' of decision tree
  """
  n_rows = data.shape[0]
  # Every column except the trailing label column is a feature.
  total_features = data.shape[1] - 1
  # Number of feature columns each tree has to leave out.
  n_dropped = total_features - n_features

  forest = []
  for i in range(n_trees):
    # Slide the row window evenly across the data, one position per tree.
    start = int((n_rows - n_samples) / n_trees * i)
    temp_data = data.iloc[start:start + n_samples]
    temp_data = temp_data.sort_values('diabetes_mellitus', ignore_index=True)

    # NOTE(review): the switch point 6 is hard-coded; it appears to be tuned for
    # the notebook's 12 trees / 24 features / n_features=12 configuration.
    if i < 6:
      # First six trees: leave out a block of n_dropped neighbouring features
      # that starts at column 2*i.
      first = i * 2
      temp_data = temp_data.drop(columns=temp_data.columns[first:first + n_dropped])
    else:
      # Remaining trees: keep only the block of n_features neighbouring features
      # that starts at column 2*(i-6).
      first = (i - 6) * 2
      temp_data = temp_data.drop(columns=temp_data.columns[first + n_features:total_features])
      temp_data = temp_data.drop(columns=temp_data.columns[0:first])

    # must reuse function build_tree()
    tree = build_tree(temp_data, max_depth, min_samples_split, depth)
    forest.append(tree)

  return forest

In [271]:
# Train the forest on the training part of the advanced data.
forest = build_forest(training_data, n_trees, n_features, n_samples)
#print(forest)

In [177]:
# Keep a second name for the current forest list (a reference, not a copy).
save1_forest=forest

In [None]:
# Show the forest kept under save1_forest.
print(save1_forest)

In [272]:
# Keep a second name for the current forest list (a reference, not a copy) and show it.
save2_forest=forest
print(save2_forest)

[{'glucose_apache <= 212.0': [{'resprate_apache <= 15.5': [{'hematocrit_apache <= 41.7': [{'glucose_apache <= 174.0': [{'glucose_apache <= 104.5': [1, 0]}, {'hematocrit_apache <= 38.5': [1, 0]}]}, {'apache_4a_hospital_death_prob <= 0.16999999999999998': [0, 1]}]}, {'hematocrit_apache <= 36.8': [{'glucose_apache <= 151.5': [0, {'apache_4a_hospital_death_prob <= 0.02': [0, 1]}]}, {'heart_rate_apache <= 149.0': [0, 1]}]}]}, {'glucose_apache <= 262.5': [{'heart_rate_apache <= 131.0': [{'sodium_apache <= 135.0': [{'glucose_apache <= 232.5': [1, 0]}, {'heart_rate_apache <= 122.5': [1, 0]}]}, 1]}, {'temp_apache <= 32.95': [0, {'map_apache <= 126.0': [{'sodium_apache <= 126.5': [0, 1]}, {'sodium_apache <= 140.0': [1, 0]}]}]}]}]}, {'bmi <= 22.691355090000002': [{'map_apache <= 135.5': [{'apache_4a_hospital_death_prob <= 0.31': [{'resprate_apache <= 5.0': [1, 0]}, 0]}, 0]}, {'age <= 64.0': [{'bmi <= 33.280723885': [0, {'hematocrit_apache <= 37.349999999999994': [1, {'bmi <= 33.52383039': [1, 0]}

## Step4 : Make predictions with the random forest
> Note: Please print the f1-score of the predictions of each decision tree

In [273]:
# f1-score of every individual tree, filled by make_prediction_forest().
scores=[]
def make_prediction_forest(forest, data, y_true=None):
  """
  This function will use the pre-trained random forest to make the predictions
  (majority vote over the trees; a tie counts as label 1)
  args:
  * forest: the random forest (non-empty list of decision trees)
  * data: the data used to predict
  * y_true: optional ground-truth labels for data; when omitted, the notebook-level
    validation labels y_validation are used. The f1-score of each tree is appended
    to the global list scores only when the labels have one entry per row of data,
    so predicting unlabelled data (e.g. the test set) records no scores.
  return:
  * y_prediction: the predicted results (list of 0/1, one per row of data)
  raises:
  * ValueError: when the forest contains no tree
  """
  if len(forest) == 0:
    raise ValueError("forest must contain at least one decision tree")
  if y_true is None:
    y_true = y_validation
  can_score = len(y_true) == data.shape[0]

  tree_predictions = []
  for tree in forest:
    tree_prediction = make_prediction(tree, data)
    tree_predictions.append(tree_prediction)
    if can_score:
      scores.append(calculate_score(y_true, tree_prediction))

  # Number of trees that voted 1 for each instance (column sums of a trees x rows table).
  positive_votes = np.asarray(tree_predictions).sum(axis=0)
  n_voters = len(tree_predictions)
  # At least half of the trees voting 1 makes the forest answer 1.
  y_prediction = [1 if 2 * votes >= n_voters else 0 for votes in positive_votes]
  return y_prediction
# Predict the validation set; this also records the f1-score of every tree in scores.
y_pred_test = make_prediction_forest(forest, x_validation)
for tree_score in scores:
  print("F1score : " + str(tree_score))

F1score : 0.6562770562770563
F1score : 0.5240133143128862
F1score : 0.5892243026366069
F1score : 0.6025727826675694
F1score : 0.5637982195845698
F1score : 0.5194927195866603
F1score : 0.5626859328516786
F1score : 0.6563916591115141
F1score : 0.6461679669573199
F1score : 0.6781362007168459
F1score : 0.6604255319148935
F1score : 0.6889238020424194


In [274]:
y_pred_test = make_prediction_forest(forest, x_test)
# Sanity check: how many test cases the forest labels as positive (1).
# list.count() works for any number of test rows and does not rebind the
# built-in name sum (which would break later sum(...) calls in this session).
print(y_pred_test.count(1))

513


## Step5 : Write the Output File
Save your predictions from the **random forest** in a csv file, named as **hw2_advanced.csv**

In [275]:
# Copy the forest's test-set predictions into the list that is written to the csv file.
advanced = list(y_pred_test)

In [277]:
advanced_path = 'hw2_advanced.csv'
# One prediction per line, written without a header row and without an index column.
pd.DataFrame(advanced).to_csv(advanced_path, header = None, index = None)