# In [None]:
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
from IPython import get_ipython

# %%
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import csv

get_ipython().run_line_magic('matplotlib', 'inline')


# %%
import itertools
import functools

class PolynomialFeature(object):
    """
    polynomial features

    transforms input array with polynomial features

    Example
    =======
    x =
    [[a, b],
    [c, d]]

    y = PolynomialFeature(degree=2).transform(x)
    y =
    [[1, a, b, a^2, a * b, b^2],
    [1, c, d, c^2, c * d, d^2]]
    """

    def __init__(self, degree=1):
        """
        construct polynomial features

        Parameters
        ----------
        degree : int
            degree of polynomial
        """
        assert isinstance(degree, int)
        self.degree = degree

    def transform(self, x):
        """
        transforms input array with polynomial features

        Parameters
        ----------
        x : (sample_size, n) or (sample_size,) array_like
            input array; a 1-d input is treated as a single feature column

        Returns
        -------
        output : (sample_size, sum_{k=0..d} C(n + k - 1, k)) ndarray
            polynomial features: a column of ones followed by every product
            of k input columns (chosen with replacement) for k = 1..degree
        """
        # Accept lists/tuples as well as ndarrays; ndarrays pass through as-is.
        x = np.asarray(x)
        if x.ndim == 1:
            x = x[:, None]
        # Iterating over the transpose yields one input column at a time.
        x_t = x.transpose()
        features = [np.ones(len(x))]
        for degree in range(1, self.degree + 1):
            for items in itertools.combinations_with_replacement(x_t, degree):
                features.append(functools.reduce(lambda a, b: a * b, items))
        # features holds one row per output column, so transpose back.
        return np.asarray(features).transpose()


# %%
class LinearRegression():
    """
    Least-squares linear regressor
    y = X @ w
    t ~ N(t|X @ w, var)
    """

    def fit(self, X:np.ndarray, t:np.ndarray):
        """
        estimate the weights and the noise variance by least squares

        Parameters
        ----------
        X : (N, D) np.ndarray
            design matrix of the training inputs
        t : (N,) np.ndarray
            training targets
        """
        # Pseudo-inverse solution of the least-squares problem.
        self.w = np.linalg.pinv(X) @ t
        residual = X @ self.w - t
        # Mean squared residual on the training data.
        self.var = np.mean(residual ** 2)

    def predict(self, X:np.ndarray, return_std:bool=False):
        """
        evaluate the fitted model on new inputs

        Parameters
        ----------
        X : (N, D) np.ndarray
            inputs to predict for
        return_std : bool, optional
            also return the (constant) predictive standard deviation if True

        Returns
        -------
        y : (N,) np.ndarray
            predicted value for each input
        y_std : (N,) np.ndarray
            standard deviation of each prediction (only if return_std)
        """
        mean = X @ self.w
        if not return_std:
            return mean
        # Same standard deviation for every sample, broadcast to mean's shape.
        spread = np.zeros_like(mean) + np.sqrt(self.var)
        return mean, spread


# %%
class BayesianRegression():
    """
    Bayesian linear regression with a Gaussian prior on the weights

    w ~ N(w|0, alpha^(-1)I)
    y = X @ w
    t ~ N(t|X @ w, beta^(-1))
    """

    def __init__(self, alpha:float=1., beta:float=1.):
        self.alpha, self.beta = alpha, beta
        # No posterior yet; the first call to fit() falls back to the
        # zero-mean isotropic prior built from alpha.
        self.w_mean = self.w_precision = None

    def _is_prior_defined(self) -> bool:
        return not (self.w_mean is None or self.w_precision is None)

    def _get_prior(self, ndim:int) -> tuple:
        if not self._is_prior_defined():
            return np.zeros(ndim), self.alpha * np.eye(ndim)
        # A previous fit exists: its posterior acts as the new prior.
        return self.w_mean, self.w_precision

    def fit(self, X:np.ndarray, t:np.ndarray):
        """
        update the weight posterior with a batch of training data

        Calling fit repeatedly accumulates evidence: the posterior of the
        previous call is used as the prior of the next one.

        Parameters
        ----------
        X : (N, n_features) np.ndarray
            training inputs
        t : (N,) np.ndarray
            training targets
        """
        n_features = np.size(X, 1)
        prior_mean, prior_precision = self._get_prior(n_features)

        posterior_precision = prior_precision + self.beta * X.T @ X
        rhs = prior_precision @ prior_mean + self.beta * X.T @ t

        self.w_mean = np.linalg.solve(posterior_precision, rhs)
        self.w_precision = posterior_precision
        self.w_cov = np.linalg.inv(posterior_precision)

    def predict(self, X:np.ndarray, return_std:bool=False, sample_size:int=None):
        """
        evaluate the predictive distribution at the given inputs

        Parameters
        ----------
        X : (N, n_features) np.ndarray
            inputs to predict for
        return_std : bool, optional
            also return the predictive standard deviation
            (the default is False); ignored when sample_size is given
        sample_size : int, optional
            number of weight vectors to draw from the posterior; when given,
            only the resulting predictions are returned
            (the default is None, no sampling)

        Returns
        -------
        y : (N,) np.ndarray
            predictive mean
        y_std : (N,) np.ndarray
            predictive standard deviation (only if return_std)
        y_sample : (N, sample_size) np.ndarray
            predictions of the sampled weight vectors (only if sample_size)
        """
        if sample_size is not None:
            drawn_w = np.random.multivariate_normal(
                self.w_mean, self.w_cov, size=sample_size
            )
            return X @ drawn_w.T

        mean = X @ self.w_mean
        if not return_std:
            return mean
        # Noise variance plus the per-row quadratic form x^T w_cov x.
        variance = 1 / self.beta + np.sum(X @ self.w_cov * X, axis=1)
        return mean, np.sqrt(variance)

    def _log_likelihood(self, X, t, w):
        # Gaussian log-likelihood of t given w, without the additive constant.
        residual = t - X @ w
        return -0.5 * self.beta * np.square(residual).sum()


# %%

# Load the avocado data set from 'avocado.csv' in the working directory.
avocado_df = pd.read_csv('avocado.csv')
# Remove the "Unnamed: 0" column in place
# (presumably a row index that was saved with the CSV -- TODO confirm).
del avocado_df["Unnamed: 0"]


# %%
# Print a summary of the frame (columns, dtypes, non-null counts).
avocado_df.info()


# %%
# Descriptive statistics; shown as the cell's output.
avocado_df.describe()


# %%
# First ten rows; shown as the cell's output.
avocado_df.head(10)


# %%
# Scatter-plot matrix of the data, followed by the pairwise correlations
from pandas.plotting import scatter_matrix

scatter_matrix(avocado_df, figsize=(13, 13))
print('The correlation matrix:')
# Restrict to numeric/bool columns explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# if the frame contains any (e.g. strings or dates, if present here).
corr_mtx = avocado_df.select_dtypes(include=['number', 'bool']).corr()
corr_mtx.round(2)


# %%
# Positional index -> column name, as "index:name" strings and as a dict
col_mapping = [f"{idx}:{name}" for idx, name in enumerate(avocado_df.columns)]
col_mapping_dict = dict(enumerate(avocado_df.columns))


# %%
print(col_mapping)


# %%
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Take columns 1..9 of the frame: the first of them is the regression
# target, the remaining eight are the input features.
avocado_array = avocado_df.iloc[:, 1:10].to_numpy()
y = avocado_array[:, 0]
X = avocado_array[:, 1:]

# Random split with test_size=0.2 (unseeded, so it differs between runs).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(*(arr.shape for arr in (X, y, X_train, y_train, X_test, y_test)))


# %%
from sklearn.preprocessing import StandardScaler
# Standardization: the scaler is fitted on the training split only and the
# same fitted transform is then applied to the test split.
std_trans = StandardScaler()
X_train = std_trans.fit_transform(X_train)
X_test = std_trans.transform(X_test)


# %%
# Degree-1 polynomial features: a leading column of ones plus the inputs.
feature = PolynomialFeature(degree=1)
X_train1, X_test1 = feature.transform(X_train), feature.transform(X_test)

# Baseline: one least-squares fit on the whole training split.
Linear = LinearRegression()
Linear.fit(X_train1, y_train)
# sklearn's signature is mean_squared_error(y_true, y_pred).
errors_linear = mean_squared_error(y_test, Linear.predict(X_test1))

# Sequential Bayesian updating: feed the training rows one at a time (each
# fit() reuses the previous posterior as its prior) and record the test MSE
# after every update.
Bayes = BayesianRegression(alpha=1., beta=100.)
errors_bayes = []

n_train = X_train1.shape[0]
for idx in range(n_train):
    Bayes.fit(X_train1[idx:idx+1], y_train[idx:idx+1])
    errors_bayes.append(mean_squared_error(y_test, Bayes.predict(X_test1)))


plt.plot(errors_bayes, 'r', label='Bayesian Regression', linewidth=2.)
# Constant reference line at the least-squares error.
plt.plot([errors_linear]*n_train, 'b:', label='Linear Regression')

plt.yscale('log')
plt.xlabel('Iterations')
plt.ylabel('Validation MSE')
plt.legend()


# %%
#test comments 


# %%