In [16]:
import numpy as np
import pandas as pd

In [38]:
class SVR:
    def __init__(self, C=1.0, epsilon=0.1, kernel='linear', gamma='scale'):
        self.C = C
        self.epsilon = epsilon
        self.kernel = kernel
        self.gamma = gamma

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

        if self.kernel == 'linear':
            self.kernel_func = self.linear_kernel
        elif self.kernel == 'rbf':
            self.kernel_func = self.rbf_kernel

        n_samples, n_features = X.shape

        # Construct the Kernel matrix
        K = np.zeros((n_samples, n_samples))
        for i in range(n_samples):
            for j in range(n_samples):
                K[i,j] = self.kernel_func(X[i], X[j])

        # Solve the quadratic optimization problem directly
        # with SMO algorithm
        alphas = np.zeros(n_samples)
        b = 0
        tolerance = 1e-3

        for epoch in range(1000):
            alpha_changed = 0
            for i in range(n_samples):
                # Calculate Ei = f(x_i) - y_i
                f_xi = np.dot(alphas * y, K[i]) + b
                Ei = f_xi - y[i]

                if (y[i] * Ei < -self.epsilon and alphas[i] < self.C) or (y[i] * Ei > self.epsilon and alphas[i] > 0):
                    j = np.random.choice(np.delete(np.arange(n_samples), i))
                    # Calculate Ej = f(x_j) - y_j
                    f_xj = np.dot(alphas * y, K[j]) + b
                    Ej = f_xj - y[j]

                    alpha_i_old, alpha_j_old = alphas[i], alphas[j]

                    # Calculate L and H
                    if y[i] != y[j]:
                        L = max(0, alphas[j] - alphas[i])
                        H = min(self.C, self.C + alphas[j] - alphas[i])
                    else:
                        L = max(0, alphas[i] + alphas[j] - self.C)
                        H = min(self.C, alphas[i] + alphas[j])

                    if L == H:
                        continue

                    eta = 2 * K[i,j] - K[i,i] - K[j,j]
                    if eta >= 0:
                        continue

                    alphas[j] -= y[j] * (Ei - Ej) / eta
                    alphas[j] = min(H, max(L, alphas[j]))

                    if abs(alphas[j] - alpha_j_old) < tolerance:
                        continue

                    alphas[i] += y[i] * y[j] * (alpha_j_old - alphas[j])

                    b1 = b - Ei - y[i] * (alphas[i] - alpha_i_old) * K[i,i] - y[j] * (alphas[j] - alpha_j_old) * K[i,j]
                    b2 = b - Ej - y[i] * (alphas[i] - alpha_i_old) * K[i,j] - y[j] * (alphas[j] - alpha_j_old) * K[j,j]

                    if 0 < alphas[i] < self.C:
                        b = b1
                    elif 0 < alphas[j] < self.C:
                        b = b2
                    else:
                        b = (b1 + b2) / 2

                    alpha_changed += 1

            if alpha_changed == 0:
                break

        # Save support vectors
        self.support_vectors = X[alphas > 1e-5]
        self.support_vector_labels = y[alphas > 1e-5]
        self.alphas = alphas[alphas > 1e-5]
        self.b = b

    def predict(self, X):
        if self.kernel == 'linear':
            return np.dot(X, self.w) + self.b
        elif self.kernel == 'rbf':
            pred = 0
            for i in range(len(self.support_vectors)):
                pred += self.alphas[i] * self.support_vector_labels[i] * self.rbf_kernel(X, self.support_vectors[i])
            return pred + self.b

    def linear_kernel(self, x1, x2):
        return np.dot(x1, x2)

    def rbf_kernel(self, x1, x2):
        gamma = 1.0 / self.X_train.shape[1] if self.gamma == 'scale' else self.gamma
        return np.exp(-gamma * np.linalg.norm(x1 - x2) ** 2)

In [39]:
df = pd.read_csv("cleaned_shifted_data.csv")

In [40]:
df.head()

Unnamed: 0.1,Timestamp,Unnamed: 0,Station,PM2.5 (µg/m³),PM10 (µg/m³),NO (µg/m³),NO2 (µg/m³),NOx (ppb),NH3 (µg/m³),SO2 (µg/m³),CO (mg/m³),Ozone (µg/m³),Checks,AQI_calculated,AQI_bucket_calculated,AQI_calculated_shifted,AQI_bucket_calculated_shifted
0,2019-02-17 15:15:00,4573,Railway Colony,46.0,80.0,1.29,9.16,12.02,27.19,13.56,0.4,15.8,7,67.0,Satisfactory,296.0,Poor
1,2019-02-17 15:30:00,4574,Railway Colony,46.0,80.0,1.74,8.93,12.48,30.29,13.71,0.41,15.52,7,68.0,Satisfactory,297.0,Poor
2,2019-02-17 15:45:00,4575,Railway Colony,45.62,79.92,1.87,8.56,12.17,28.2,13.88,0.41,15.33,7,68.0,Satisfactory,298.0,Poor
3,2019-02-17 16:00:00,4576,Railway Colony,41.0,72.92,1.83,8.72,12.37,26.69,13.77,0.4,15.3,7,68.0,Satisfactory,298.0,Poor
4,2019-02-17 16:15:00,4577,Railway Colony,41.0,79.0,1.69,7.91,11.3,26.83,13.87,0.41,15.49,7,68.0,Satisfactory,299.0,Poor


In [41]:
df.columns

Index(['Timestamp', 'Unnamed: 0', 'Station', 'PM2.5 (µg/m³)', 'PM10 (µg/m³)',
       'NO (µg/m³)', 'NO2 (µg/m³)', 'NOx (ppb)', 'NH3 (µg/m³)', 'SO2 (µg/m³)',
       'CO (mg/m³)', 'Ozone (µg/m³)', 'Checks', 'AQI_calculated',
       'AQI_bucket_calculated', 'AQI_calculated_shifted',
       'AQI_bucket_calculated_shifted'],
      dtype='object')

In [42]:
drop_cols = [0,1,2,12,14,16]
drop_cols = df.columns[drop_cols]
drop_cols

Index(['Timestamp', 'Unnamed: 0', 'Station', 'Checks', 'AQI_bucket_calculated',
       'AQI_bucket_calculated_shifted'],
      dtype='object')

In [43]:
df.drop(drop_cols,axis=1,inplace=True)

In [44]:
X = df.drop('AQI_calculated_shifted',axis = 1)
y = df['AQI_calculated_shifted']

In [45]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.99, random_state=42)

In [46]:
X_train.values

array([[3.4000e+01, 1.2500e+02, 3.8000e+00, ..., 3.3000e-01, 2.0760e+01,
        4.6000e+01],
       [7.5000e+01, 3.2600e+02, 6.0000e-02, ..., 1.1200e+00, 1.0720e+01,
        3.8200e+02],
       [8.1220e+01, 1.2667e+02, 4.7300e+01, ..., 9.5000e-01, 1.1740e+01,
        1.4200e+02],
       ...,
       [7.6000e+01, 1.1314e+02, 5.4900e+00, ..., 5.7000e-01, 7.9410e+01,
        1.7600e+02],
       [1.8000e+01, 2.7000e+01, 6.6300e+00, ..., 5.9000e-01, 1.7250e+01,
        6.8000e+01],
       [3.6000e+01, 5.7000e+01, 2.0100e+00, ..., 6.4000e-01, 1.2290e+01,
        5.3000e+01]])

In [49]:
# Fit SVR
svr = SVR(C=1.0, epsilon=0.1, kernel='rbf')
svr.fit(X_train.values, y_train.values)

In [56]:
svr.predict(X_test.loc[0])

67049167.5

In [54]:
predictions

67049167.5

In [52]:
from sklearn.metrics import mean_squared_error

In [53]:
mse = mean_squared_error(y_test, predictions)
print("Mean Squared Error:", mse)

InvalidParameterError: The 'y_pred' parameter of mean_squared_error must be an array-like. Got 67049167.5 instead.