In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from typing import List, Dict

from load_datasets import get_diamonds, get_wines
from models.linear_regression import CustomLinearRegression

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


In [147]:
SEED = 42
COST_FUNCTIONS = ['mse', 'mae', 'huber']
LEARNING_FUNCTIONS = ['gradient_descent', 'stochastic_gradient_descent', 'least_squares']

FEATURES_DIAMONDS = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']
TARGET_DIAMONDS = 'price'

FEATURES_WINES = ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 
                  'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
TARGET_WINES = 'quality'

METRICS = ['mse', 'mae', 'r2']

In [148]:
models = dict()

In [149]:
class Dataset:
    def __init__(self, 
                 dataframe: pd.DataFrame, 
                 features: List[str], 
                 target: List[str], 
                 test_size: int = 0.2, 
                 random_state: int = SEED):
        self.dataframe = dataframe
        self.features = features
        self.target = target
        self.random_state = random_state
        self.test_size = test_size
        
    def split(self):
        self.x_train, self.x_test, self.y_train, self.y_test = \
            train_test_split(self.dataframe[self.features], 
                             self.dataframe[self.target], 
                             test_size=self.test_size, 
                             random_state=self.random_state)

In [150]:
def train(model, dataset: Dataset):
    model.fit(dataset.x_train, dataset.y_train)

def calculate_metrics(actual, predictions):
    metrics: Dict[str, float] = dict()
    
    for metric in METRICS:
        if metric == 'mse':
            metrics[metric] = mean_squared_error(actual, predictions)
        elif metric == 'mae':
            metrics[metric] = mean_absolute_error(actual, predictions)
        elif metric == 'r2':
            metrics[metric] = r2_score(actual, predictions)
    return metrics

In [151]:
diamonds_df = get_diamonds()
wines_df = get_wines()

diamonds = Dataset(diamonds_df, FEATURES_DIAMONDS, TARGET_DIAMONDS)
wines = Dataset(wines_df, FEATURES_WINES, TARGET_WINES)

diamonds.split()
wines.split()

In [152]:
for cost_function in COST_FUNCTIONS:
    for learning_function in LEARNING_FUNCTIONS:
        # models[f"LinearRegression_{cost_function}_{learning_function}"] = \
        #     CustomLinearRegression(cost_function=cost_function, learning_function=learning_function)

        # models[f"RandomForestRegressor_{cost_function}_{learning_function}"] = RandomForestRegressor()

        pass

In [153]:
models["LinearRegression_sklearn_diamonds"] = LinearRegression()
models["LinearRegression_sklearn_wines"] = LinearRegression()

### Diamonds:

In [154]:
train(models["LinearRegression_sklearn_diamonds"], diamonds)

In [155]:
predictions = models["LinearRegression_sklearn_diamonds"].predict(diamonds.x_test)
mse = mean_squared_error(diamonds.y_test, predictions)
new_df = pd.DataFrame({'Actual': diamonds.y_test, 'Predicted': predictions})
print(new_df)
print(f'MSE: {mse}')

       Actual     Predicted
id                         
1389      559    363.646688
50053    2201   3355.932308
41646    1238   2139.566064
42378    1304   2371.644610
17245    6901  10551.699188
...       ...           ...
44082    1554   1884.977785
23714     633   1064.398487
31376     761    619.037553
21773    9836   7788.029534
4999     3742   4643.639100

[10788 rows x 2 columns]
MSE: 1825912.9915253473


In [159]:
predictions = models["LinearRegression_sklearn_diamonds"].predict(diamonds.x_test)

# new_df = pd.DataFrame({'Actual': diamonds.y_train, 'Predicted': predictions})
# print(new_df)
metrics = calculate_metrics(diamonds.y_test, predictions)
for metric in metrics:
    print(f'{metric}: {metrics[metric]}')

mse: 1830068.2345561874
mae: 863.0738090430269
r2: 0.88504407867572


### Wines:

In [157]:
train(models["LinearRegression_sklearn_wines"], wines)

In [158]:
predictions = models["LinearRegression_sklearn_wines"].predict(wines.x_test)

metrics = calculate_metrics(diamonds.y_test, predictions)
for metric in metrics:
    print(f'{metric}: {metrics[metric]}')

ValueError: Found input variables with inconsistent numbers of samples: [3918, 980]