In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pprint 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

In [None]:
# Load the cleaned dataset into a DataFrame.
# NOTE(review): the path is absolute and points at the filesystem root —
# confirm this is where the file lives in the target environment.
cleaned_df = pd.read_csv('/cleaned_shifted_data.csv')

In [4]:

# Sample the dataset. The sample size is capped at the number of available
# rows so that `sample` cannot raise on a file with fewer than 10000 rows.
sample_size = min(10000, len(cleaned_df))
sampled_df = cleaned_df.sample(n=sample_size, random_state=11)

# Filter relevant columns (AQI constituents)
relevant_columns = ['PM2.5 (µg/m³)', 'PM10 (µg/m³)', 'NO (µg/m³)', 'NO2 (µg/m³)',
                    'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)', 'CO (mg/m³)',
                    'Ozone (µg/m³)']

# Build features/target from the *sampled* frame; previously the sample was
# drawn but the full frame was used, leaving `sampled_df` as dead code.
X = sampled_df[relevant_columns].values
y = sampled_df['AQI_calculated_shifted'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (the scaler is fitted on the training split only, then
# applied unchanged to the test split)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
def RSS_reduction(child_L, child_R, parent):
    """Return the drop in residual sum of squares achieved by a split.

    Computes ``RSS(parent) - (RSS(child_L) + RSS(child_R))`` where the RSS of
    a group is the sum of squared deviations from that group's own mean.

    Parameters
    ----------
    child_L, child_R : array-like of numbers
        Target values sent to the left / right child. Either may be empty.
    parent : array-like of numbers
        Target values of the node being split.

    Returns
    -------
    float
        The RSS reduction. An empty group contributes an RSS of 0.
    """
    def _rss(values):
        values = np.asarray(values, dtype=float)
        # An empty group has no deviations; returning early avoids NumPy's
        # "Mean of empty slice" RuntimeWarning.
        if values.size == 0:
            return 0.0
        # Vectorised sum instead of Python's builtin sum over array elements.
        return float(np.sum((values - values.mean()) ** 2))

    return _rss(parent) - (_rss(child_L) + _rss(child_R))

def sort_x_by_y(x, y):
    """Return the distinct values of ``x`` ordered by their mean ``y``.

    For every distinct value in ``x`` the mean of the matching entries of
    ``y`` is computed; the distinct values are then returned in ascending
    order of that mean.
    """
    levels = np.unique(x)

    # Mean response for each distinct level, in the same order as `levels`.
    level_means = []
    for level in levels:
        level_means.append(y[x == level].mean())

    ranking = np.argsort(np.array(level_means))
    return levels[ranking]

def all_rows_equal(X):
    """Return whether every row of ``X`` is identical to its first row."""
    # Compare against the first row, then reduce over all entries.
    matches_first_row = X == X[0]
    return matches_first_row.all()



In [13]:
class Node:
    """A single node of the tree, holding the observations that reach it."""

    def __init__(self, Xsub, ysub, ID, depth = 0, parent_ID = None, leaf = True):
        # Identity and position in the tree.
        self.ID, self.parent_ID = ID, parent_ID
        self.depth = depth
        self.leaf = leaf
        # Observations (features and targets) belonging to this node.
        self.Xsub, self.ysub = Xsub, ysub
        # Number of observations, taken from the target subset.
        self.size = len(ysub)
        
class Splitter:
    """Record of the best split found so far for one node."""

    def __init__(self):
        # Until a split is recorded, the best reduction is 0 and no split exists.
        self.no_split = True
        self.rss_reduction = 0

    def _replace_split(self, rss_reduction, d, dtype = 'quant', t = None, L_values = None):
        """Overwrite the stored split and mark that a split is available.

        ``d`` is the predictor index, ``dtype`` its kind, ``t`` the threshold
        and ``L_values`` the set of values sent left; the latter two default
        to ``None``.
        """
        self.rss_reduction, self.d, self.dtype = rss_reduction, d, dtype
        self.t, self.L_values = t, L_values
        self.no_split = False

                  Timestamp  Unnamed: 0        Station  PM2.5 (µg/m³)  \
232903  2023-12-06 09:00:00      383076  LGBI Airport           144.0   
232904  2023-12-06 09:15:00      383077  LGBI Airport           144.0   
232905  2023-12-06 09:30:00      383078  LGBI Airport           144.0   
232906  2023-12-06 09:45:00      383079  LGBI Airport           144.0   
232907  2023-12-06 10:00:00      383080  LGBI Airport           144.0   
232908  2023-12-06 10:15:00      383081  LGBI Airport           144.0   
232909  2023-12-06 10:30:00      383082  LGBI Airport           144.0   
232910  2023-12-06 11:45:00      383087  LGBI Airport           170.0   
232911  2023-12-06 12:00:00      383088  LGBI Airport           170.0   
232912  2023-12-06 12:15:00      383089  LGBI Airport           170.0   
232913  2023-12-06 12:30:00      383090  LGBI Airport           170.0   
232914  2023-12-06 13:45:00      383095  LGBI Airport            89.0   
232915  2023-12-06 14:00:00      383096  LGBI Airpo

Index(['Timestamp', 'Unnamed: 0', 'Station', 'PM2.5 (µg/m³)', 'PM10 (µg/m³)',
       'NO (µg/m³)', 'NO2 (µg/m³)', 'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)',
       'CO (mg/m³)', 'Ozone (µg/m³)', 'Checks', 'AQI_calculated',
       'AQI_bucket_calculated', 'AQI_calculated_shifted',
       'AQI_bucket_calculated_shifted'],
      dtype='object')

In [None]:
class DecisionTreeRegressor:
    """Regression tree grown level by level using RSS reduction.

    Parameters
    ----------
    max_depth : int
        Maximum number of growth passes over the tree.
    min_size : int
        Minimum number of observations a leaf needs to be considered for
        splitting.
    C : int or None
        When ``None`` every predictor and every candidate split is examined.
        When an integer, it (a) limits each split search to ``C`` randomly
        chosen predictors, (b) makes quantitative predictors use the interior
        points of ``np.linspace(min, max, C + 1)`` as thresholds, and
        (c) makes categorical predictors try at most ``C`` randomly chosen
        split positions.
    """

    def __init__(self, max_depth=100, min_size=2, C=None):
        self.max_depth = max_depth
        self.min_size = min_size
        self.C = C

    @staticmethod
    def _column_kind(column):
        """Classify one feature column as 'quant' (numeric) or 'cat'."""
        # Rebuilding the array from a list lets NumPy re-infer the dtype, so
        # numeric values stored in an object array are still seen as numeric.
        inferred = np.array(list(column)).dtype
        # issubdtype accepts every numeric width (float32, int32, ...), not
        # only the platform's default float/int.
        return 'quant' if np.issubdtype(inferred, np.number) else 'cat'

    def fit(self, X, y, max_depth=None, min_size=None, C=None):
        """Grow the tree on ``X`` (2-D, rows are observations) and ``y``.

        ``max_depth``, ``min_size`` and ``C`` optionally override the values
        given to the constructor; ``None`` keeps the constructor's value.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``X`` and ``y`` differ in length.
        """
        if max_depth is not None:
            self.max_depth = max_depth
        if min_size is not None:
            self.min_size = min_size
        if C is not None:
            self.C = C

        self.X = np.asarray(X)
        self.y = np.asarray(y)
        if self.X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(self.y) != self.X.shape[0]:
            raise ValueError("X and y must contain the same number of observations")

        self.N, self.D = self.X.shape
        self.dtypes = [self._column_kind(self.X[:, d]) for d in range(self.D)]
        self.nodes_dict = {}
        self.current_ID = 0
        initial_node = Node(Xsub=self.X, ysub=self.y, ID=self.current_ID, parent_ID=None)
        self.nodes_dict[self.current_ID] = initial_node
        self.current_ID += 1
        self._build()

    def _is_splittable(self, node):
        """Return True if ``node`` is a leaf that may still be split."""
        # `and` short-circuits, so the row comparison is skipped for nodes
        # already ruled out by the cheaper checks.
        return bool(
            node.leaf
            and node.size >= self.min_size
            and not all_rows_equal(node.Xsub)
            and len(np.unique(node.ysub)) > 1
        )

    # Build the decision tree
    def _build(self):
        """Repeatedly split eligible leaves until none remain or max_depth passes ran."""
        current_depth = 0
        while current_depth < self.max_depth:
            current_depth += 1
            # Snapshot the eligible leaves first: splitting adds nodes to
            # nodes_dict, which must not change while it is being iterated.
            eligible_buds = [node for node in list(self.nodes_dict.values())
                             if self._is_splittable(node)]
            if len(eligible_buds) == 0:
                break

            any_split = False
            for bud in eligible_buds:
                self._find_split(bud)
                if not self.splitter.no_split:
                    self._make_split()
                    any_split = True

            # With C=None the search is deterministic, so a pass that split
            # nothing would be repeated identically on every later pass.
            if not any_split and self.C is None:
                break

    def _candidate_thresholds(self, unique_values):
        """Return thresholds to try for a quantitative predictor.

        ``unique_values`` is the sorted array of distinct values in the node.
        """
        if self.C is None:
            # Exhaustive search: every distinct value except the largest
            # (splitting at the largest would leave the right child empty).
            return unique_values[:-1]
        return np.linspace(unique_values[0], unique_values[-1], num=self.C + 1)[1:-1]

    # Find the best split for a node
    def _find_split(self, bud):
        """Search for the split of ``bud`` with the largest RSS reduction.

        The result is stored in ``self.splitter``; its ``no_split`` flag stays
        True when no candidate achieved a strictly positive reduction.
        """
        splitter = Splitter()
        splitter.bud_ID = bud.ID
        if self.C is None:
            eligible_predictors = np.arange(self.D)
        else:
            eligible_predictors = np.random.permutation(np.arange(self.D))[:self.C]

        for predictor in eligible_predictors:
            X_sub = bud.Xsub[:, predictor]
            dtype = self.dtypes[predictor]

            unique_values = np.unique(X_sub)
            if len(unique_values) == 1:
                continue

            if dtype == 'quant':
                for threshold in self._candidate_thresholds(unique_values):
                    # Same left/right rule as _make_split uses, so the
                    # evaluated split is exactly the one that gets applied.
                    go_left = X_sub <= threshold
                    rss_reduction = RSS_reduction(bud.ysub[go_left], bud.ysub[~go_left], bud.ysub)
                    if rss_reduction > splitter.rss_reduction:
                        splitter._replace_split(rss_reduction, predictor, dtype='quant', t=threshold)

            else:
                ordered_values = sort_x_by_y(X_sub, bud.ysub)
                split_indices = np.arange(1, len(ordered_values))
                if self.C is not None:
                    num_splits = min(self.C, len(split_indices))
                    split_indices = np.random.choice(split_indices, size=num_splits, replace=False)

                for index in split_indices:
                    L_values = ordered_values[:index]
                    go_left = np.isin(X_sub, L_values)
                    rss_reduction = RSS_reduction(bud.ysub[go_left], bud.ysub[~go_left], bud.ysub)
                    if rss_reduction > splitter.rss_reduction:
                        splitter._replace_split(rss_reduction, predictor, dtype='cat', L_values=L_values)

        self.splitter = splitter

    # Make split
    def _make_split(self):
        """
        Make a split based on the best split found.

        Turns the node referenced by ``self.splitter`` into an internal node
        and registers its two children under the next two free IDs.
        """
        parent_node = self.nodes_dict[self.splitter.bud_ID]
        parent_node.leaf = False
        parent_node.child_L = self.current_ID
        parent_node.child_R = self.current_ID + 1
        parent_node.d = self.splitter.d
        parent_node.dtype = self.splitter.dtype
        parent_node.t = self.splitter.t
        parent_node.L_values = self.splitter.L_values

        X_sub = parent_node.Xsub[:, parent_node.d]

        if parent_node.dtype == 'quant':
            L_condition = X_sub <= parent_node.t
        else:
            L_condition = np.isin(X_sub, parent_node.L_values)

        Xchild_L = parent_node.Xsub[L_condition]
        ychild_L = parent_node.ysub[L_condition]
        Xchild_R = parent_node.Xsub[~L_condition]
        ychild_R = parent_node.ysub[~L_condition]

        child_node_L = Node(Xchild_L, ychild_L, depth=parent_node.depth + 1,
                            ID=self.current_ID, parent_ID=parent_node.ID)
        child_node_R = Node(Xchild_R, ychild_R, depth=parent_node.depth + 1,
                            ID=self.current_ID + 1, parent_ID=parent_node.ID)

        self.nodes_dict[self.current_ID] = child_node_L
        self.nodes_dict[self.current_ID + 1] = child_node_R
        self.current_ID += 2

    # Get leaf node means
    def _get_leaf_means(self):
        """Store the mean target of every leaf in ``self.leaf_means`` keyed by node ID."""
        self.leaf_means = {}
        for node_ID, node in self.nodes_dict.items():
            if node.leaf:
                self.leaf_means[node_ID] = node.ysub.mean()

    # Predict using the trained decision tree
    def predict(self, X_test):
        """Predict a target for each row of ``X_test``.

        Each row is routed from the root (node ID 0) to a leaf, and the mean
        target of that leaf is returned.

        Raises
        ------
        RuntimeError
            If called before the tree has been built.
        """
        if not hasattr(self, 'nodes_dict'):
            raise RuntimeError("predict() called before fit()")

        self._get_leaf_means()
        yhat = []
        for x in np.asarray(X_test):
            node = self.nodes_dict[0]
            while not node.leaf:
                if node.dtype == 'quant':
                    goes_left = x[node.d] <= node.t
                else:
                    goes_left = x[node.d] in node.L_values
                node = self.nodes_dict[node.child_L if goes_left else node.child_R]
            yhat.append(self.leaf_means[node.ID])
        return np.array(yhat)

Training SVR with linear kernel


In [None]:
## Build model
# Hyperparameters belong to the constructor; DecisionTreeRegressor.fit is
# declared as fit(X, y) and would raise TypeError if given them.
tree = DecisionTreeRegressor(max_depth=7, min_size=5)
tree.fit(X_train, y_train)
y_pred = tree.predict(X_test)

# Evaluate on the held-out test split
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

In [None]:
# Imported under an alias so it does not rebind the name of the custom
# DecisionTreeRegressor class, which the custom RandomForestRegressor
# instantiates with arguments (min_size, C) that scikit-learn's class rejects.
from sklearn.tree import DecisionTreeRegressor as SklearnDecisionTreeRegressor

# Create a scikit-learn decision tree regressor model object.
decision_tree_regressor = SklearnDecisionTreeRegressor()

# Train the decision tree regressor model using the training data.
decision_tree_regressor.fit(X_train, y_train)

# Use the trained model to make predictions on the test data.
y_pred = decision_tree_regressor.predict(X_test)

# Evaluate the performance of the model
mse = (mean_squared_error(y_test, y_pred))
print(f"Mean Squared Error: {mse}")

In [None]:
class RandomForestRegressor:
    """Bagged ensemble of DecisionTreeRegressor trees.

    Parameters
    ----------
    n_estimators : int
        Number of trees to grow.
    max_depth, min_size, C
        Passed unchanged to every DecisionTreeRegressor.
    """

    def __init__(self, n_estimators=10, max_depth=100, min_size=2, C=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_size = min_size
        self.C = C
        self.trees = []

    def fit(self, X, y):
        """Grow ``n_estimators`` trees, each on a bootstrap resample of (X, y).

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length.
        """
        # Convert up front so that integer-array indexing below selects rows
        # even when a DataFrame/Series or list is passed.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of observations")

        # Start from an empty forest so that refitting replaces the previous
        # trees instead of accumulating them.
        self.trees = []
        for _ in range(self.n_estimators):
            tree = DecisionTreeRegressor(max_depth=self.max_depth, min_size=self.min_size, C=self.C)
            indices = np.random.choice(len(X), len(X), replace=True)  # Bootstrap sampling
            X_bootstrap, y_bootstrap = X[indices], y[indices]
            tree.fit(X_bootstrap, y_bootstrap)
            self.trees.append(tree)

    def predict(self, X_test):
        """Return the per-row average of the predictions of all trees.

        Raises
        ------
        RuntimeError
            If the forest contains no trees.
        """
        if len(self.trees) == 0:
            raise RuntimeError("predict() called on a forest with no trees; call fit() first")

        predictions = np.zeros((len(X_test), len(self.trees)))
        for i, tree in enumerate(self.trees):
            predictions[:, i] = tree.predict(X_test)
        return np.mean(predictions, axis=1)


In [None]:
# Seed NumPy's global random generator: the custom forest draws its bootstrap
# samples with np.random.choice, so without a seed the score changes per run.
np.random.seed(42)

# Initialize the random forest regressor
random_forest = RandomForestRegressor(n_estimators=10, max_depth=100, min_size=2, C=None)

# Fit the random forest model to the training data
random_forest.fit(X_train, y_train)

# Make predictions on the test data
y_pred = random_forest.predict(X_test)

# Evaluate the performance of the model (e.g., using mean squared error)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

In [None]:
# Imported under an alias so it does not rebind the name of the custom
# RandomForestRegressor class defined in this notebook.
from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor

# Initialize the random forest regressor (random_state fixed for reproducibility)
sklearn_random_forest = SklearnRandomForestRegressor(n_estimators=10, max_depth=100,
                                                     min_samples_split=2, random_state=42)

# Fit the random forest model to the training data
sklearn_random_forest.fit(X_train, y_train)

# Make predictions on the test data
y_pred_sklearn = sklearn_random_forest.predict(X_test)

# Evaluate the performance of the scikit-learn random forest model (e.g., using mean squared error)
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
print("Mean Squared Error (scikit-learn RandomForestRegressor):", mse_sklearn)