# Daily Dose of Data Science

This notebook accompanies the exercise for the Bagging article.

Read the full blog here: [Why Bagging is So Ridiculously Effective At Variance Reduction?](https://www.dailydoseofds.com/why-bagging-is-so-ridiculously-effective-at-variance-reduction)

Author: Avi Chawla

## You are supposed to write your solution in between "YOUR CODE STARTS HERE" and "YOUR CODE ENDS HERE" in the sampling_function.

## Imports

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

sns.set()
warnings.filterwarnings("ignore")

In [None]:
# Font preference for every figure in this notebook: the first family in the
# list is requested, the second one is the fallback.
plt.rcParams.update({'font.family': ['Comic Sans MS', 'sans-serif']})

# Per-class colours for the scatter plot; the integer class label is used as
# the index into this list.
colors = ['#fe7c73', '#2471A3']

# Bagging Ideas

## Feature subsetting for each tree:
- In a typical random forest, every tree gets to see all the features in the training set.
- Here, we shall create a tree only on a subset of features.

### Define bootstrapping function

In [None]:
def sampling_function(df, row_sample_ratio, feature_sample_ratio, target_column="y"):

    """
    Build one bootstrapped copy of 'df' by sub-sampling and then duplicating
    both its feature columns and its rows.

    This function accepts the following arguments:

    1. dataframe df of size (n,d), where:
    -- n: number of training samples in the entire training set
    -- d: number of columns in the entire training set (the features plus,
          when present, the target column)

    2. row_sample_ratio: The fraction of rows, in (0, 1], to sample before
       duplicating during sample bootstrapping.

    3. feature_sample_ratio: The fraction of feature columns, in (0, 1], to
       sample before duplicating during column bootstrapping.

    4. target_column: Name of the label column. It is never sampled or
       duplicated; its values are carried over unchanged. When 'df' has no
       such column, every column is treated as a feature.

    This function returns another dataframe of size (n,d), with the same
    column names and column order as 'df', which has:
    - some duplicated columns: every feature column that was NOT sampled holds
      a copy of the values of a randomly chosen sampled feature column. The
      original names are kept because callers select columns by name.
    - some duplicated rows: the sampled rows (drawn without replacement) are
      followed by rows re-drawn with replacement from that same sample. The
      original index labels are kept, so duplicated rows share a label.

    At least one feature column and one row are always sampled, even when the
    ratio would round down to zero.

    Raises ValueError if a ratio is outside (0, 1], or if 'df' has no rows or
    no feature columns.

    All randomness comes from NumPy's global random state, so calling
    np.random.seed(...) beforehand makes the result reproducible.
    """

    for ratio_name, ratio in (
        ("row_sample_ratio", row_sample_ratio),
        ("feature_sample_ratio", feature_sample_ratio),
    ):
        if not 0 < ratio <= 1:
            raise ValueError(f"{ratio_name} must be in the interval (0, 1], got {ratio}")

    # define total samples and the columns that may be sampled
    total_samples = df.shape[0]
    feature_columns = [column for column in df.columns if column != target_column]

    if total_samples == 0 or not feature_columns:
        raise ValueError("df must contain at least one row and one feature column")

    # COLUMN SAMPLING:

    # the target column is excluded from the count, otherwise the result
    # could not keep both the (n,d) shape and an intact target
    initial_column_size = max(1, int(feature_sample_ratio*len(feature_columns)))

    ### YOUR CODE STARTS HERE ###

    # randomly sample 'initial_column_size' number of columns from 'df' EXCEPT FOR THE Y COLUMN
    initial_column_sample_df = df[feature_columns].sample(n=initial_column_size, axis=1)
    sampled_names = list(initial_column_sample_df.columns)

    # for every original column, decide which column its values are read from:
    # sampled features and the target read from themselves, every other
    # feature reads from a randomly chosen sampled feature (a duplicate)
    value_sources = []
    for column in df.columns:
        if column == target_column or column in sampled_names:
            value_sources.append(column)
        else:
            value_sources.append(sampled_names[np.random.randint(len(sampled_names))])

    ### YOUR CODE ENDS HERE ###

    # selecting with repeated names duplicates the values; restoring the
    # original names keeps the frame usable by code that indexes by name
    column_sampled_df = df[value_sources].copy()
    column_sampled_df.columns = df.columns

    # ROW SAMPLING

    # number of rows to sample
    initial_sample_size = max(1, int(row_sample_ratio*total_samples))

    ### YOUR CODE STARTS HERE ###

    # randomly sample 'initial_sample_size' number of rows from 'column_sampled_df'
    initial_row_sample_df = column_sampled_df.sample(n=initial_sample_size)

    # top the sample back up to 'total_samples' rows with duplicates drawn
    # (with replacement) from the rows already sampled; one vectorised draw
    # replaces growing a dataframe row by row
    remaining_rows = total_samples - initial_sample_size
    new_rows = initial_row_sample_df.sample(n=remaining_rows, replace=True)

    ### YOUR CODE ENDS HERE ###

    if new_rows.empty:
        return initial_row_sample_df

    bootstrapped_dataset = pd.concat((initial_row_sample_df, new_rows))
    return bootstrapped_dataset

### Define prediction function

In [None]:
from collections import Counter

def predict(models, test_data):
    """
    Combine the predictions of several fitted models by majority vote.

    models: iterable of objects exposing predict(test_data), each returning
        one label per row of 'test_data'.
    test_data: the rows to classify; it is passed unchanged to every model
        and its length decides how many ensemble predictions are produced.

    Returns a NumPy array with one label per row: the label predicted by the
    largest number of models. On a tie, the tied label that appears first in
    model order wins.
    """
    # one prediction vector per model, in model order
    per_model_votes = [estimator.predict(test_data) for estimator in models]

    # tally the votes row by row and keep the most frequent label
    majority_labels = []
    for row in range(len(test_data)):
        ballot = Counter(votes[row] for votes in per_model_votes)
        winner = ballot.most_common(1)[0][0]
        majority_labels.append(winner)

    return np.array(majority_labels)

In [None]:
# Generate some synthetic data to train the decision tree on
# Synthetic two-feature classification dataset that every model below is
# trained and evaluated on
X, y = make_classification(
    n_samples=1200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    class_sep=0.5,
    random_state=21,
)

# Scatter of the whole dataset, one colour per class label
fig, ax = plt.subplots()
point_colors = [colors[label] for label in y]
ax.scatter(X[:, 0], X[:, 1], c=point_colors)
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.show()

# First 1000 samples are used for training, the remaining ones for testing
X_train, X_test = X[:1000], X[1000:]
y_train, y_test = y[:1000], y[1000:]

### Decision Tree model

In [None]:

# Single decision tree, used as the baseline
dtree = DecisionTreeClassifier(random_state=0)
dtree.fit(X_train, y_train)

# 100 x 100 grid spanning the training data, rounded outwards to whole numbers
x1_grid = np.linspace(np.floor(X_train[:, 0].min()), np.ceil(X_train[:, 0].max()), 100)
x2_grid = np.linspace(np.floor(X_train[:, 1].min()), np.ceil(X_train[:, 1].max()), 100)
xx, yy = np.meshgrid(x1_grid, x2_grid)

# Predicted class at every grid point, reshaped back onto the grid
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = dtree.predict(grid_points).reshape(xx.shape)

# Filled contours of the predicted class show the decision regions
fig, ax = plt.subplots()
ax.contourf(xx, yy, Z, alpha=0.5, cmap='coolwarm_r')
ax.set_title("Decision Tree", fontsize = 20, fontweight = "bold", pad=15)
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.show()

# Test accuracy (displayed as the cell output)
dtree.score(X_test, y_test)

### Random Forest Model

In [None]:
# Reference random forest: 21 trees, each fitted on a bootstrap sample of
# half the training rows. random_state is fixed so that the decision regions
# and the reported accuracy are the same on every run, matching the seeded
# decision tree it is compared against.
rfmodel = RandomForestClassifier(
    max_features="sqrt",
    max_samples=0.5,
    n_estimators=21,
    random_state=0,
)
rfmodel.fit(X_train, y_train)

# 100 x 100 grid spanning the training data, rounded outwards to whole numbers
xx, yy = np.meshgrid(np.linspace(np.floor(X_train[:, 0].min()), np.ceil(X_train[:, 0].max()), 100), np.linspace(np.floor(X_train[:, 1].min()), np.ceil(X_train[:, 1].max()), 100))

# Predicted class at every grid point, reshaped back onto the grid
Z = rfmodel.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled contours of the predicted class show the decision regions
fig, ax = plt.subplots()
plt.contourf(xx, yy, Z, alpha=0.5, cmap='coolwarm_r')
ax.set_title("Random Forest", fontsize = 20, fontweight = "bold", pad=15)
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.show()

# Test accuracy (displayed as the cell output)
rfmodel.score(X_test, y_test)

### Custom Model

In [None]:
# Wrap the train / test arrays in dataframes: the two feature columns are
# named "X1" and "X2", and the labels go in column "y"
df_train = pd.DataFrame(X_train, columns=["X1", "X2"]).assign(y=y_train)
df_test = pd.DataFrame(X_test, columns=["X1", "X2"]).assign(y=y_test)

#### Train Models

In [None]:
total_models = 21
row_sample_ratio, feature_sample_ratio = 0.5, 0.5

# Make the custom forest reproducible across runs.
# NOTE(review): this only fixes the bootstrap draws if sampling_function uses
# NumPy's global random state (np.random.* or DataFrame.sample with its
# default random_state) - confirm against its implementation.
np.random.seed(0)

all_trees = []

for i in range(total_models):
    # a fixed but distinct seed per tree keeps the trees different from each
    # other while making every run identical
    model = DecisionTreeClassifier(max_features="sqrt", random_state=i)

    # every tree is fitted on its own bootstrapped copy of the training set
    data = sampling_function(df_train, row_sample_ratio, feature_sample_ratio)

    model.fit(data[["X1", "X2"]], data['y'])

    all_trees.append(model)

#### Aggregate predictions and create plot

In [None]:
# 100 x 100 grid spanning the training data, rounded outwards to whole numbers
x1_grid = np.linspace(np.floor(X_train[:, 0].min()), np.ceil(X_train[:, 0].max()), 100)
x2_grid = np.linspace(np.floor(X_train[:, 1].min()), np.ceil(X_train[:, 1].max()), 100)
xx, yy = np.meshgrid(x1_grid, x2_grid)

# Majority vote of all trees at every grid point, reshaped back onto the grid
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = predict(all_trees, grid_points).reshape(xx.shape)

# Filled contours of the predicted class show the decision regions
fig, ax = plt.subplots()
ax.contourf(xx, yy, Z, alpha=0.5, cmap='coolwarm_r')
ax.set_title("Custom Random Forest", fontsize = 20, fontweight = "bold", pad = 15)
ax.set_xticklabels([])
ax.set_yticklabels([])
plt.show()

# Test accuracy: share of test rows whose majority vote equals the true label
# (displayed as the cell output)
sum(predict(all_trees, df_test[["X1", "X2"]]) == df_test.y)/df_test.shape[0]

In [None]:
# Test accuracy of the three models, side by side
dtree_accuracy = dtree.score(X_test, y_test)
rf_accuracy = rfmodel.score(X_test, y_test)

# the custom forest has no score(); compare its majority vote with the labels
custom_predictions = predict(all_trees, df_test[["X1", "X2"]])
custom_rf_accuracy = sum(custom_predictions == df_test.y)/df_test.shape[0]

print(f"Decision Tree test accuracy: {dtree_accuracy}")
print(f"Random Forest test accuracy: {rf_accuracy}")
print(f"Custom Random Forest test accuracy: {custom_rf_accuracy}")