In [3]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
from collections import Counter
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import datasets, model_selection, metrics

%matplotlib inline

In [5]:
data = pd.read_csv('./data/categorical_managed_data.csv')
data.reset_index(drop=True, inplace=True)
data.to_csv('categorical_managed_data.csv', index=False)

In [6]:
data.head()

Unnamed: 0,Quantity,UnitPrice,Year,Total_sales,Country_Australia,Country_Austria,Country_Bahrain,Country_Belgium,Country_Brazil,Country_Canada,...,Month_May,Month_November,Month_October,Month_September,Day_Friday,Day_Monday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
0,6,2.55,2010,15.3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,6,3.39,2010,20.34,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,8,2.75,2010,22.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,6,3.39,2010,20.34,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,6,3.39,2010,20.34,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
X = data.drop(["Total_sales"],axis=1)
X = X.to_numpy()
y = data["Total_sales"].to_numpy()

In [8]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [9]:
pca = PCA(n_components=30)

pca.fit(X)

X = pca.transform(X)

explained_variance = pca.explained_variance_ratio_

print("Explained Variance Ratio:", explained_variance)

Explained Variance Ratio: [0.0336774  0.03149908 0.02196433 0.02162715 0.02112736 0.02069865
 0.01996719 0.01913911 0.01880022 0.0186358  0.01851428 0.01827404
 0.01811503 0.01802695 0.01788322 0.01778356 0.01764818 0.01740972
 0.01721389 0.0170482  0.01699502 0.01697602 0.01697021 0.01695264
 0.01695014 0.01693595 0.01692959 0.01690734 0.01687157 0.01685958]


# Regressor from Scratch

In [10]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_labels = len(np.unique(y))


        if (self.max_depth is not None and depth >= self.max_depth) or n_labels == 1:

            return {'value': np.mean(y)}


        best_feature, best_threshold = self._find_best_split(X, y)

        left_idxs = X[:, best_feature] < best_threshold
        X_left, y_left = X[left_idxs], y[left_idxs]
        X_right, y_right = X[~left_idxs], y[~left_idxs]

        return {'feature_idx': best_feature,
                'threshold': best_threshold,
                'left': self._grow_tree(X_left, y_left, depth + 1),
                'right': self._grow_tree(X_right, y_right, depth + 1)}

    def _find_best_split(self, X, y):
        best_feature, best_threshold, best_variance = None, None, float('inf')
        for feature_idx in range(self.n_features):
            min_value = np.min(X[:, feature_idx])
            max_value = np.max(X[:, feature_idx])
            thresholds = np.random.uniform(low=min_value, high=max_value, size=100)

            for threshold in thresholds:
                left_idxs = X[:, feature_idx] < threshold
                var_left = np.var(y[left_idxs])
                var_right = np.var(y[~left_idxs])
                variance = var_left + var_right
                if variance < best_variance:
                    best_feature, best_threshold, best_variance = feature_idx, threshold, variance
        return best_feature, best_threshold

    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])

    def _predict_tree(self, x, tree):
        if 'value' in tree:
            return tree['value']
        feature_val = x[tree['feature_idx']]
        branch = tree['left'] if feature_val < tree['threshold'] else tree['right']
        return self._predict_tree(x, branch)

In [11]:
class RandomForestRegressor:
    def __init__(self, n_estimators=10, max_depth=None, bootstrap=True):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.bootstrap = bootstrap

    def fit(self, X, y):
        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTree(max_depth=self.max_depth)
            if self.bootstrap:
                idxs = np.random.choice(len(X), size=len(X), replace=True)
                X_bootstrapped, y_bootstrapped = X[idxs], y[idxs]
            else:
                X_bootstrapped, y_bootstrapped = X, y
            tree.fit(X_bootstrapped, y_bootstrapped)
            self.trees.append(tree)

    def predict(self, X):
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(predictions, axis=0)

# Training

In [12]:
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=0)
print("Train Shape:", X_train.shape)
print("Test Shape:", X_test.shape)

Train Shape: (393657, 30)
Test Shape: (131220, 30)


In [13]:
%%time
regressor = RandomForestRegressor(max_depth=4,n_estimators=12)
regressor.fit(X_train, Y_train)

CPU times: user 30min 35s, sys: 5.83 s, total: 30min 40s
Wall time: 31min 3s


In [15]:
y_pred = regressor.predict(X_test)
mse = metrics.mean_squared_error(Y_test, y_pred)
print("Mean squared error,",mse)

Mean squared error, 117171.81122132819
