# Importing Libraries

In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import math

pd.set_option('display.float_format', lambda x: '%.2f' % x)


# Dummy Dataset

In [2]:
from sklearn.datasets import make_classification

x,y = make_classification(n_samples = 20,
                          n_features = 4,
                          random_state=42)
x = pd.DataFrame(x,columns = ["f1","f2","f3","f4"])
x['output'] = y

In [3]:
df = x.copy(deep = True)

# Algo

## Step 1

### We will assign an equal sample weight to each observation. 

In [4]:
#Initialize the equal weights

df['weights'] = [1/df.shape[0]] * df.shape[0]

## Step 2

### We will create M decision stumps, for M number of features.

In [6]:
# Create stump

In [117]:
# f1 feature
stump_f1 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f1.fit(df['f1'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f1 = stump_f1.predict(df['f1'].values.reshape(-1,1))
df['f1_pred'] = pred_f1


# f2 feature
stump_f2 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f2.fit(df['f2'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f2 = stump_f1.predict(df['f2'].values.reshape(-1,1))
df['f2_pred'] = pred_f2


# f3 feature
stump_f3 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f3.fit(df['f3'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f3 = stump_f3.predict(df['f3'].values.reshape(-1,1))
df['f3_pred'] = pred_f3


# f4 feature
stump_f4 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f4.fit(df['f4'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f4 = stump_f4.predict(df['f4'].values.reshape(-1,1))
df['f4_pred'] = pred_f4

In [6]:
# Error

In [118]:
def incorrect_classification(column):
    return [f"Index {i}" for i in range(len(df[column].values)) \
                                                             if df['output'].values[i] != df[column].values[i]]

print(f"F1 Error : {incorrect_classification('f1_pred')}")
print(f"F2 Error : {incorrect_classification('f2_pred')}")
print(f"F3 Error : {incorrect_classification('f3_pred')}")
print(f"F4 Error : {incorrect_classification('f4_pred')}")

F1 Error : ['Index 2', 'Index 4']
F2 Error : ['Index 0', 'Index 2', 'Index 3', 'Index 4', 'Index 6', 'Index 7', 'Index 10', 'Index 18', 'Index 19']
F3 Error : ['Index 3', 'Index 6', 'Index 10', 'Index 14']
F4 Error : ['Index 2', 'Index 3', 'Index 4', 'Index 6', 'Index 10']


## Step 3

### Out of all M decision stumps, I first have to select one best decision tree model. For selecting it, we will either calculate the Entropy or Gini coefficient. The model with lesser entropy will be selected (means model that is less disordered).

In [9]:
# Find Entrophy

In [119]:
def entropy(column):
        f1_positive = df[column].to_list().count(1)
        f1_negative = df[column].to_list().count(0)

        total  = df['output'].shape[0]

        feature_probab_0 = f1_negative/total
        feature_probab_1 = f1_positive/total

        entrophy = - (feature_probab_0 * math.log2(feature_probab_0)) + (feature_probab_1 * math.log2(feature_probab_1))
        return entrophy

In [9]:
entropy("f1_pred")

0.08659188145522112

In [120]:
print(f"Entropy for feature 1 is {entropy('f1_pred')}")
print(f"Entropy for feature 2 is {entropy('f2_pred')}")
print(f"Entropy for feature 3 is {entropy('f3_pred')}")
print(f"Entropy for feature 4 is {entropy('f4_pred')}")

Entropy for feature 1 is 0.08659188145522112
Entropy for feature 2 is 0.12613316560533977
Entropy for feature 3 is -0.16088845726903106
Entropy for feature 4 is -0.044028330112736824


## Step 4

### Algorithm would now check how many observations the model has misclassified.
### and find the total error based on that

In [None]:
# We picked feature 1 for finding total error 

In [121]:
print(f"F1 feature Error : {incorrect_classification('f3_pred')}")


F1 feature Error : ['Index 3', 'Index 6', 'Index 10', 'Index 14']


#### Total Error - The total error is just the sum of all misclassified data points' sample weights.


 - Suppose out of N observations, The f1 feature decision stump has misclassified T number of observations.

- For this, we will calculate the total error (TE), which is equal to T/N.

In [14]:
# In this case since we have 2 errors, out Total Error is 4/20
# TE = 4/20

## Step 5

### Now we will calculate the performance/ "Amount of Say"  of the first decision stump.

#### Formula of Performance/ Amount of Say
![image.png](attachment:61ccc552-24f0-4e69-b6d8-c1e04a13bfc7.png)

In [122]:
def amount_of_say(total_error):
    eq1 = (1 - total_error)/total_error
    performance = 0.5 * math.log2(eq1)
    return performance



In [123]:
# Performance / Amount of say of feature 1 (Given total_error is 2/20 i.e 0.1)
amount_of_say((4/20))

1.0

## Step 6

### Update the Weight using Peformance

![image.png](attachment:06fb7ebd-ffb4-41a7-995b-9d73a70e4664.png)

- When the sample is successfully identified, the amount of, say, (alpha) will be negative.
- When the sample is misclassified, the amount of (alpha) will be positive.

In [20]:
new_sample_weight = 0.05 * math.exp(1.584962500721156)
new_sample_weight

0.24395542060176725

In [127]:
incorrect_classification('f3_pred')

['Index 3', 'Index 6', 'Index 10', 'Index 14']

In [128]:
df1 = df.drop(columns = [ 'f2_pred',
                           'f3_pred', 'f4_pred'])
df1 = df1[['f1','f2','f3','f4','output','f1_pred','weights']].rename(columns = {"f1_pred" : "pred_output"})
df1['updated_weight'] = df1['weights']

#Weights updation on incorrect classification
new_sample_weight = 0.05 * math.exp(amount_of_say((4/20)))

df1.loc[3,'updated_weight'] = new_sample_weight
df1.loc[6,'updated_weight'] = new_sample_weight
df1.loc[10,'updated_weight'] = new_sample_weight
df1.loc[14,'updated_weight'] = new_sample_weight

#Weights updation on incorrect classification

new_incorrect_sample_weight = 0.05 * math.exp(-(amount_of_say((4/20))))
df1['updated_weight'] = np.where(df1['updated_weight'] != new_sample_weight, new_incorrect_sample_weight,new_sample_weight)

# Since updated weights are adding up to 1
total_sum = df1['updated_weight'].sum()

# Divide each weight to total weight
df1['adj_updated_weight'] = df1['updated_weight']/total_sum

df1

Unnamed: 0,f1,f2,f3,f4,output,pred_output,weights,updated_weight,adj_updated_weight
0,-0.37,1.26,0.4,-0.59,0,0,0.05,0.02,0.02
1,1.33,-0.21,1.17,-0.98,1,1,0.05,0.02,0.02
2,0.56,0.95,1.12,-1.16,0,1,0.05,0.02,0.02
3,0.89,-0.71,0.44,-0.25,1,1,0.05,0.14,0.16
4,0.47,1.16,1.16,-1.24,0,1,0.05,0.02,0.02
5,0.91,1.45,1.77,-1.82,1,1,0.05,0.02,0.02
6,-0.11,-0.87,-0.64,0.72,1,1,0.05,0.14,0.16
7,-0.86,1.47,0.05,-0.32,0,0,0.05,0.02,0.02
8,0.66,2.08,1.9,-2.05,1,1,0.05,0.02,0.02
9,1.4,0.04,1.39,-1.22,1,1,0.05,0.02,0.02


## Step 7

### Create Buckets

### After this, we have to make our second decision stump. For this, we will make a class intervals (Buckets) for the normalized weights

In [129]:
# Adding buckets

cumsum = df1['adj_updated_weight'].cumsum()

buckets = [f"0 - {cumsum[0]}"] + [f"{cumsum[i]} - {cumsum[i + 1]}" for i in range(0,len(cumsum)-1)]
df2 = df1.drop(columns = ['updated_weight'])
df2['Buckets'] = buckets
df2

Unnamed: 0,f1,f2,f3,f4,output,pred_output,weights,adj_updated_weight,Buckets
0,-0.37,1.26,0.4,-0.59,0,0,0.05,0.02,0 - 0.021950897232253797
1,1.33,-0.21,1.17,-0.98,1,1,0.05,0.02,0.021950897232253797 - 0.04390179446450759
2,0.56,0.95,1.12,-1.16,0,1,0.05,0.02,0.04390179446450759 - 0.06585269169676139
3,0.89,-0.71,0.44,-0.25,1,1,0.05,0.16,0.06585269169676139 - 0.22804910276774626
4,0.47,1.16,1.16,-1.24,0,1,0.05,0.02,0.22804910276774626 - 0.25000000000000006
5,0.91,1.45,1.77,-1.82,1,1,0.05,0.02,0.25000000000000006 - 0.27195089723225385
6,-0.11,-0.87,-0.64,0.72,1,1,0.05,0.16,0.27195089723225385 - 0.4341473083032387
7,-0.86,1.47,0.05,-0.32,0,0,0.05,0.02,0.4341473083032387 - 0.4560982055354925
8,0.66,2.08,1.9,-2.05,1,1,0.05,0.02,0.4560982055354925 - 0.4780491027677463
9,1.4,0.04,1.39,-1.22,1,1,0.05,0.02,0.4780491027677463 - 0.5000000000000001


## Step 8

### Creating new dataset for subsequent model based on the updated weight

- After that, we want to make a second weak model. But to do that, we need a sample dataset on which the second weak model can be run. For making it, we will run N number of iterations. On each iteration, it will calculate a random number ranging between 0-1 and this random will be compared with class intervals(Buckets) we created and on which class interval it lies, that row will be selected for sample data set. So new sample data set would also be of N observation. 

In [130]:
# Generate 20 random values from 0 to 1 using numpy.random
new_samples_bucket_value = np.random.random_sample(size = 20)
new_samples_bucket_value

array([0.4495945 , 0.29709205, 0.31414417, 0.19873258, 0.32456567,
       0.6209672 , 0.7501351 , 0.54184266, 0.25156389, 0.89328403,
       0.01595991, 0.52859411, 0.88172485, 0.23562916, 0.81380631,
       0.44755265, 0.46070909, 0.01155073, 0.33946869, 0.11294766])

In [80]:
df2['Buckets'][0].split("-")

['0 ', ' 0.015241254319229241']

In [131]:
df3 = df2.copy()
df3['b1'] = df2['Buckets'].apply(lambda x : float(x.split("-")[0]))
df3['b2'] = df2['Buckets'].apply(lambda x : float(x.split("-")[1]))
new_sample_df = pd.DataFrame(columns = df3.columns)

for random_value in new_samples_bucket_value:
    new_sample_df = pd.concat([new_sample_df,df3[(df3['b1'] < random_value) & (df3['b2'] > random_value)]])

In [132]:
new_sample_df

Unnamed: 0,f1,f2,f3,f4,output,pred_output,weights,adj_updated_weight,Buckets,b1,b2
7,-0.86,1.47,0.05,-0.32,0,0,0.05,0.02,0.4341473083032387 - 0.4560982055354925,0.43,0.46
6,-0.11,-0.87,-0.64,0.72,1,1,0.05,0.16,0.27195089723225385 - 0.4341473083032387,0.27,0.43
6,-0.11,-0.87,-0.64,0.72,1,1,0.05,0.16,0.27195089723225385 - 0.4341473083032387,0.27,0.43
3,0.89,-0.71,0.44,-0.25,1,1,0.05,0.16,0.06585269169676139 - 0.22804910276774626,0.07,0.23
6,-0.11,-0.87,-0.64,0.72,1,1,0.05,0.16,0.27195089723225385 - 0.4341473083032387,0.27,0.43
10,-0.05,-1.46,-0.93,1.09,1,1,0.05,0.16,0.5000000000000001 - 0.662196411070985,0.5,0.66
14,1.09,-0.02,1.05,-0.92,1,1,0.05,0.16,0.7280491027677463 - 0.8902455138387311,0.73,0.89
10,-0.05,-1.46,-0.93,1.09,1,1,0.05,0.16,0.5000000000000001 - 0.662196411070985,0.5,0.66
5,0.91,1.45,1.77,-1.82,1,1,0.05,0.02,0.25000000000000006 - 0.27195089723225385,0.25,0.27
15,-0.83,-0.96,-1.39,1.4,0,0,0.05,0.02,0.8902455138387311 - 0.9121964110709848,0.89,0.91


## Step 9

### Repeat the above steps till convergence

In [116]:
new_sample_df = new_sample_df[['f1','f2','f3','f4','output']]
new_sample_df['weights'] = 1/20
new_sample_df

Unnamed: 0,f1,f2,f3,f4,output,weights
4,0.47,1.16,1.16,-1.24,0,0.05
3,0.89,-0.71,0.44,-0.25,1,0.05
9,1.4,0.04,1.39,-1.22,1,0.05
4,0.47,1.16,1.16,-1.24,0,0.05
4,0.47,1.16,1.16,-1.24,0,0.05
4,0.47,1.16,1.16,-1.24,0,0.05
19,-0.66,1.61,0.33,-0.6,0,0.05
2,0.56,0.95,1.12,-1.16,0,0.05
16,-0.84,-0.66,-1.21,1.18,0,0.05
2,0.56,0.95,1.12,-1.16,0,0.05


In [None]:
# f1 feature
stump_f1 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f1.fit(new_sample_df['f1'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f1 = stump_f1.predict(new_sample_df['f1'].values.reshape(-1,1))
new_sample_df['f1_pred'] = pred_f1


# f2 feature
stump_f2 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f2.fit(new_sample_df['f2'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f2 = stump_f1.predict(new_sample_df['f2'].values.reshape(-1,1))
new_sample_df['f2_pred'] = pred_f2


# f3 feature
stump_f3 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f3.fit(new_sample_df['f3'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f3 = stump_f3.predict(new_sample_df['f3'].values.reshape(-1,1))
new_sample_df['f3_pred'] = pred_f3


# f4 feature
stump_f4 = DecisionTreeClassifier(max_depth= 1,random_state = 1)

stump_f4.fit(df['f4'].values.reshape(-1,1),df['output'].values.reshape(-1,1))

pred_f4 = stump_f4.predict(df['f4'].values.reshape(-1,1))
df['f4_pred'] = pred_f4