In [43]:
import pandas as pd
import re
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import OneHotEncoder

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

import torch
import torch.nn as nn

import json

In [4]:
df = pd.read_csv(os.path.join('..', 'data', 'filter-data-cleaned.csv'))
filter_list = ['F5', 'F6', 'F7', 'F8', 'F9', 'G2', 'G3', 'G4', 'H11', 'H13', 'H14', 'H3', 'M5', 'M6']

In [5]:
df_clean = df.drop(columns=['Depth','Pockets','description','item_type', 'dimensions', 'Date'])
df_clean = df_clean.dropna()
df_clean.shape

(3780, 8)

In [6]:
def relative_error(y_true, y_pred):
    return np.abs(y_true - y_pred) / y_true


def relative_error_loss_function(y_true, y_pred):
    return np.mean(relative_error(y_true, y_pred))


def mean_squared_error(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)


def scoring_function(clf, X, y):
    y_pred = clf.predict(X)
    return relative_error_loss_function(y, y_pred)


def fit_grid_search(cv, X_train, y_train):
    cv.fit(X_train, y_train)

    return cv.best_estimator_, cv.best_params_, cv.best_score_


def test_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    relative_err = relative_error(y_test, y_pred)

    mean_relative_err = np.mean(relative_err)
    relative_err_std_dev = np.std(relative_err)
    relative_err_median = np.median(relative_err)
    mse = mean_squared_error(y_test, y_pred)

    return mean_relative_err, relative_err_std_dev, relative_err_median, mse

In [7]:
df_model = df_clean.drop(columns=['company_name', 'quote_id'])
df_model.head(3)

Unnamed: 0,qty,unit_price,filter_efficiency,Length,Height,Gutter
0,36.0,7.5,G4,625.0,500.0,50.0
1,36.0,27.5,F7,625.0,500.0,50.0
2,16.0,28.5,F7,592.0,490.0,48.0


## Train and evaluate the models

In [8]:
X = df_model.drop(columns=['unit_price'])
y = df_model['unit_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

ohe = OneHotEncoder(cols=['filter_efficiency'], use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

In [9]:
too_few_data_points_filters = df_model['filter_efficiency'].value_counts()[df_model['filter_efficiency'].value_counts() < 10].index

df_model = df_model[~df_model['filter_efficiency'].isin(too_few_data_points_filters)]


In [10]:
X = df_model.drop(columns=['unit_price'])
y = df_model['unit_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=X['filter_efficiency'])

ohe = OneHotEncoder(cols=['filter_efficiency'], use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)
X_train.head(3)

Unnamed: 0,qty,filter_efficiency_G4,filter_efficiency_H14,filter_efficiency_F7,filter_efficiency_G2,filter_efficiency_F9,filter_efficiency_G3,filter_efficiency_M5,filter_efficiency_M6,filter_efficiency_H13,filter_efficiency_F8,Length,Height,Gutter
3394,2.0,1,0,0,0,0,0,0,0,0,0,592.0,490.0,48.0
1150,2.0,1,0,0,0,0,0,0,0,0,0,440.0,140.0,20.0
574,3.0,0,1,0,0,0,0,0,0,0,0,1220.0,610.0,70.0


In [36]:

class LossFunctionNN(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        return torch.mean(torch.abs(y_true - y_pred) / y_true)


def train_nn(X_train, y_train):
    input_size = X_train.shape[1]


    model = nn.Sequential(
        nn.Linear(input_size, 128),
        nn.ReLU(),
        nn.Linear(128, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 256),
        nn.ReLU(),
        nn.Linear(256, 1)
    )

    loss_fn = LossFunctionNN()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    X_train_tensor = torch.from_numpy(X_train).float()
    y_train_tensor = torch.from_numpy(y_train).float().reshape(-1, 1)

    for epoch in range(500):
        y_pred = model(X_train_tensor)

        train_loss = loss_fn(y_pred, y_train_tensor)

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

    return model, train_loss.item()


def predict(model, X, y):
    X_tensor = torch.from_numpy(X).float()
    y_tensor = torch.from_numpy(y).float().reshape(-1, 1)

    y_pred = model(X_tensor)

    loss_fn = LossFunctionNN()
    loss = loss_fn(y_pred, y_tensor)

    return y_pred.detach().numpy().reshape(-1), loss.item()


def run_neural_net(model_name, X_train, y_train, X_test, y_test):
    avg_relative_error_val = []

    for idx_train, idx_val in KFold(n_splits=5).split(X_train):
        X_train_temp = X_train[idx_train]
        y_train_temp = y_train[idx_train]

        X_val = X_train[idx_val]
        y_val = y_train[idx_val]

        model, _ = train_nn(X_train_temp, y_train_temp)
        _, mean_relative_error_val = predict(model, X_val, y_val)

        avg_relative_error_val.append(mean_relative_error_val)

    mean_relative_error_val = np.mean(avg_relative_error_val)

    model, _ = train_nn(X_train, y_train)
    y_pred, _ = predict(model, X_test, y_test)

    relative_err = relative_error(y_test, y_pred)

    mean_relative_err = np.mean(relative_err)
    relative_err_std_dev = np.std(relative_err)
    relative_err_median = np.median(relative_err)
    mse = mean_squared_error(y_test, y_pred)
    
    return {
        'model_type': model_name,
        'model': 'NeuralNetwork',
        'best_model': model,
        'best_params': 'None',
        'mean_relative_error_val': mean_relative_error_val,
        'mean_relative_error_test': mean_relative_err,
        'std_dev_relative_error_test': relative_err_std_dev,
        'median_relative_error_test': relative_err_median,
        'mse_test': mse,
    }


In [37]:
filter_list = [
    df_model['filter_efficiency'].unique(),
    ['G4'],
    ['M5'],
    ['F7'],
]

results = []


def run_regression(model, model_name, param_grid, X_train, y_train, X_test, y_test):
    cv = GridSearchCV(model, param_grid=param_grid, cv=5, scoring=scoring_function)
    best_model, best_params, val_loss = fit_grid_search(cv, X_train, y_train)
    mean_relative_error, std_dev_relative_error, median_relative_error, mse = test_model(best_model, X_test, y_test)
    return {
        'model_type': model_name,
        'model': best_model.__class__.__name__,
        'best_model': best_model,
        'best_params': best_params,
        'mean_relative_error_val': val_loss,
        'mean_relative_error_test': mean_relative_error,
        'std_dev_relative_error_test': std_dev_relative_error,
        'median_relative_error_test': median_relative_error,
        'mse_test': mse,
    }


def create_filter_mask(filters, df):
    mask = np.zeros(df.shape[0], dtype=bool)
    for filt in filters:
        mask = mask | df[f'filter_efficiency_{filt}'] == 1
    return mask


def execute_train(X_train_param, y_train_param, X_test_param, y_test_param):
    results = []

    for filters in filter_list:
        train_mask = create_filter_mask(filters, X_train_param)
        test_mask = create_filter_mask(filters, X_test_param)

        model_name = 'general' if len(filters) > 1 else filters[0]

        print(f'Training {model_name} model...')

        X_train, X_test = X_train_param[train_mask], X_test_param[test_mask]
        y_train, y_test = y_train_param[train_mask], y_test_param[test_mask]

        # Convert to numpy arrays (this will avoid an error on the gradio side)
        X_train, X_test, y_train, y_test = X_train.values, X_test.values, y_train.values, y_test.values

        # Train Neural Network
        print('\tNeural Network...', end='')
        results.append(
            run_neural_net(model_name, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train Linear Regression
        print('\tLinear regression...', end='')
        results.append(
            run_regression(LinearRegression(), model_name, {}, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train SVR
        print('\tSVR regression...', end='')
        param_grid = {
            'C': [1, 10, 100, 1000],
            'kernel': ['rbf','poly']
        }
        results.append(
            run_regression(SVR(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train Decision Tree
        print('\tDecision Tree regression...', end='')
        param_grid={
            'max_depth': [10, 100],
            'min_samples_split': [2, 5, 10],
            'max_features': ('sqrt', 'log2')
        }
        results.append(
            run_regression(DecisionTreeRegressor(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train Random Forest
        print('\tRandom Forest regression...', end='')
        param_grid={
            'n_estimators': [10, 100],
            'max_depth': [10, 100],
            'min_samples_split': [2, 5, 10],
            'max_features': ( 'sqrt', 'log2'),
        }
        results.append(
            run_regression(RandomForestRegressor(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train KNN
        print('\tKNeighbours regression...', end='')
        param_grid={
            'n_neighbors': [2, 3, 5, 7, 10, 15, 17, 20],
            'weights': ['uniform', 'distance'],
            'p': [1, 2, 3]
        }
        results.append(
            run_regression(KNeighborsRegressor(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')
    
    return results

results_df = pd.DataFrame(execute_train(X_train, y_train, X_test, y_test))

Training general model...
	Neural Network...done.
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.
Training G4 model...
	Neural Network...done.
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.
Training M5 model...
	Neural Network...done.
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.
Training F7 model...
	Neural Network...done.
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.


In [38]:
results_df.sort_values(by=['model_type', 'mean_relative_error_val'], ascending=True)

Unnamed: 0,model_type,model,best_model,best_params,mean_relative_error_val,mean_relative_error_test,std_dev_relative_error_test,median_relative_error_test,mse_test
18,F7,NeuralNetwork,"[Linear(in_features=14, out_features=128, bias...",,0.138754,0.138357,0.151191,0.085693,143.872744
22,F7,RandomForestRegressor,"(DecisionTreeRegressor(max_depth=100, max_feat...","{'max_depth': 100, 'max_features': 'log2', 'mi...",0.18094,0.169517,0.214529,0.113591,106.093826
23,F7,KNeighborsRegressor,"KNeighborsRegressor(n_neighbors=20, p=1)","{'n_neighbors': 20, 'p': 1, 'weights': 'uniform'}",0.20951,0.239345,0.379295,0.098312,179.643669
21,F7,DecisionTreeRegressor,"DecisionTreeRegressor(max_depth=10, max_featur...","{'max_depth': 10, 'max_features': 'sqrt', 'min...",0.223844,0.230223,0.456399,0.141348,264.641119
19,F7,LinearRegression,LinearRegression(),{},0.311736,0.336877,0.346128,0.246303,211.528598
20,F7,SVR,"SVR(C=1, kernel='poly')","{'C': 1, 'kernel': 'poly'}",0.316443,0.337459,0.457035,0.190053,210.90738
6,G4,NeuralNetwork,"[Linear(in_features=14, out_features=128, bias...",,0.135351,0.142434,0.211463,0.097499,22.840508
8,G4,SVR,SVR(C=1),"{'C': 1, 'kernel': 'rbf'}",0.139132,0.143134,0.13544,0.109317,17.012691
10,G4,RandomForestRegressor,"(DecisionTreeRegressor(max_depth=10, max_featu...","{'max_depth': 10, 'max_features': 'log2', 'min...",0.147427,0.163198,0.165436,0.115654,8.691764
11,G4,KNeighborsRegressor,"KNeighborsRegressor(n_neighbors=2, p=3)","{'n_neighbors': 2, 'p': 3, 'weights': 'uniform'}",0.160756,0.177131,0.332358,0.103137,14.256637


## Get the best model for each filter efficiency

In [39]:
best_models = results_df.sort_values(by='mean_relative_error_val')

# Get results for unique column filters
best_models_unique = best_models.drop_duplicates(subset='model_type')
best_models_unique

Unnamed: 0,model_type,model,best_model,best_params,mean_relative_error_val,mean_relative_error_test,std_dev_relative_error_test,median_relative_error_test,mse_test
6,G4,NeuralNetwork,"[Linear(in_features=14, out_features=128, bias...",,0.135351,0.142434,0.211463,0.097499,22.840508
18,F7,NeuralNetwork,"[Linear(in_features=14, out_features=128, bias...",,0.138754,0.138357,0.151191,0.085693,143.872744
0,general,NeuralNetwork,"[Linear(in_features=14, out_features=128, bias...",,0.264016,0.244641,0.410091,0.139628,450.446001
12,M5,NeuralNetwork,"[Linear(in_features=14, out_features=128, bias...",,0.269064,0.28402,0.259722,0.192896,138.595496


## Print the model_type, model and params

In [40]:
for i, row in best_models_unique.iterrows():
    # Print model_type, model and params
    print(f'Model type: {row["model_type"]}')
    print(f'Model: {row["model"]}')
    print(f'Best params: {row["best_params"]}')
    
    print()


Model type: G4
Model: NeuralNetwork
Best params: None

Model type: F7
Model: NeuralNetwork
Best params: None

Model type: general
Model: NeuralNetwork
Best params: None

Model type: M5
Model: NeuralNetwork
Best params: None



## Save the best models

In [41]:
for model_data in best_models_unique.to_dict(orient='records'):
    model_name = model_data['model_type'].lower()
    model = model_data['model']

    if model == 'NeuralNetwork':
        m = model_data['best_model']
        torch.save(m.state_dict(), os.path.join('..', 'models', f'best_model_{model_name}.pt'))
    
    else:
        with open(os.path.join('..', 'models', f'best_model_{model_name}.pkl'), 'wb') as f:
            pickle.dump(model_data['best_model'], f)

In [42]:
features = X_train.columns
features

Index(['qty', 'filter_efficiency_G4', 'filter_efficiency_H14',
       'filter_efficiency_F7', 'filter_efficiency_G2', 'filter_efficiency_F9',
       'filter_efficiency_G3', 'filter_efficiency_M5', 'filter_efficiency_M6',
       'filter_efficiency_H13', 'filter_efficiency_F8', 'Length', 'Height',
       'Gutter'],
      dtype='object')

## Save manifest so gradio can identify which models to use

In [56]:
best_models_dict = {
    'best_model_f7': best_models_unique[best_models_unique['model_type'] == 'F7']['model'].values[0],
    'best_model_g4': best_models_unique[best_models_unique['model_type'] == 'G4']['model'].values[0],
    'best_model_m5': best_models_unique[best_models_unique['model_type'] == 'M5']['model'].values[0],
    'best_model_general': best_models_unique[best_models_unique['model_type'] == 'general']['model'].values[0],
}
best_models_dict

json.dump(best_models_dict, open(os.path.join('..', 'models', 'gradio_manifest.json'), 'w'))