In [16]:
import pandas as pd
import re
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
from category_encoders import OneHotEncoder

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

In [2]:
df = pd.read_csv(os.path.join('..', 'data', 'filter-data-cleaned.csv'))
filter_list = ['F5', 'F6', 'F7', 'F8', 'F9', 'G2', 'G3', 'G4', 'H11', 'H13', 'H14', 'H3', 'M5', 'M6']

In [3]:
df_clean = df.drop(columns=['Depth','Pockets','description','item_type', 'dimensions', 'Date'])
df_clean = df_clean.dropna()
df_clean.shape

(3780, 8)

In [4]:
company_names = df_clean['company_name'].str.lower()
company_names

0       a preditiva
1       a preditiva
2       a preditiva
3       a preditiva
4       a preditiva
           ...     
3775        valinox
3776        valinox
3777        valinox
3778        valinox
3779        valinox
Name: company_name, Length: 3780, dtype: object

In [5]:
def relative_error(y_true, y_pred):
    return np.abs(y_true - y_pred) / y_true


def loss_function(y_true, y_pred):
    return np.mean(relative_error(y_true, y_pred))


def scoring_function(clf, X, y):
    y_pred = clf.predict(X)
    return loss_function(y, y_pred)


def fit_grid_search(cv, X_train, y_train):
    cv.fit(X_train, y_train)

    return cv.best_estimator_, cv.best_params_, cv.best_score_


def test_model(model, X_test, y_test):
    y_pred = model.predict(X_test)

    error = relative_error(y_test, y_pred)

    mean = np.mean(error)
    std = np.std(error)
    median = np.median(error)

    return mean, std, median

In [6]:
df_model = df_clean.drop(columns=['company_name', 'quote_id'])
df_model.head(3)

Unnamed: 0,qty,unit_price,filter_efficiency,Length,Height,Gutter
0,36.0,7.5,G4,625.0,500.0,50.0
1,36.0,27.5,F7,625.0,500.0,50.0
2,16.0,28.5,F7,592.0,490.0,48.0


## Train and evaluate the models

In [7]:
X = df_model.drop(columns=['unit_price'])
y = df_model['unit_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

ohe = OneHotEncoder(cols=['filter_efficiency'], use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)

In [8]:
too_few_data_points_filters = df_model['filter_efficiency'].value_counts()[df_model['filter_efficiency'].value_counts() < 10].index

df_model = df_model[~df_model['filter_efficiency'].isin(too_few_data_points_filters)]


In [9]:
X = df_model.drop(columns=['unit_price'])
y = df_model['unit_price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=X['filter_efficiency'])

ohe = OneHotEncoder(cols=['filter_efficiency'], use_cat_names=True)
X_train = ohe.fit_transform(X_train)
X_test = ohe.transform(X_test)
X_train.head(3)

Unnamed: 0,qty,filter_efficiency_G4,filter_efficiency_M5,filter_efficiency_G3,filter_efficiency_F7,filter_efficiency_G2,filter_efficiency_F9,filter_efficiency_M6,filter_efficiency_H14,filter_efficiency_F8,filter_efficiency_H13,Length,Height,Gutter
10,5.0,1,0,0,0,0,0,0,0,0,0,800.0,500.0,48.0
647,100.0,0,1,0,0,0,0,0,0,0,0,592.0,592.0,600.0
3388,4.0,1,0,0,0,0,0,0,0,0,0,490.0,490.0,45.0


In [10]:
filter_list = [
    df_model['filter_efficiency'].unique(),
    ['G4'],
    ['M5'],
    ['F7'],
]

results = []


def run_regression(model, model_name, param_grid, X_train, y_train, X_test, y_test):
    cv = GridSearchCV(model, param_grid=param_grid, cv=5, scoring=scoring_function)
    best_model, best_params, val_loss = fit_grid_search(cv, X_train, y_train)
    mean_test_loss, std_test_loss, median_test_loss = test_model(best_model, X_test, y_test)
    return {
        'model_type': model_name,
        'model': best_model.__class__.__name__,
        'best_model': best_model,
        'best_params': best_params,
        'val_loss': val_loss,
        'mean_test_loss': mean_test_loss,
        'std_test_loss': std_test_loss,
        'median_test_loss': median_test_loss
    }


def create_filter_mask(filters, df):
    mask = np.zeros(df.shape[0], dtype=bool)
    for filt in filters:
        mask = mask | df[f'filter_efficiency_{filt}'] == 1
    return mask


def execute_train(X_train_param, y_train_param, X_test_param, y_test_param):
    results = []

    for filters in filter_list:
        train_mask = create_filter_mask(filters, X_train_param)
        test_mask = create_filter_mask(filters, X_test_param)

        model_name = 'general' if len(filters) > 1 else filters[0]

        print(f'Training {model_name} model...')

        X_train, X_test = X_train_param[train_mask], X_test_param[test_mask]
        y_train, y_test = y_train_param[train_mask], y_test_param[test_mask]

        # Convert to numpy arrays (this will avoid an error on the gradio side)
        X_train, X_test, y_train, y_test = X_train.values, X_test.values, y_train.values, y_test.values

        # Train Linear Regression
        print('\tLinear regression...', end='')
        results.append(
            run_regression(LinearRegression(), model_name, {}, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train SVR
        print('\tSVR regression...', end='')
        param_grid = {
            'C': [1, 10, 100, 1000],
            'kernel': ['rbf','poly']
        }
        results.append(
            run_regression(SVR(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train Decision Tree
        print('\tDecision Tree regression...', end='')
        param_grid={
            'max_depth': [10, 100],
            'min_samples_split': [2, 5, 10],
            'max_features': ('sqrt', 'log2')
        }
        results.append(
            run_regression(DecisionTreeRegressor(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train Random Forest
        print('\tRandom Forest regression...', end='')
        param_grid={
            'n_estimators': [10, 100],
            'max_depth': [10, 100],
            'min_samples_split': [2, 5, 10],
            'max_features': ( 'sqrt', 'log2'),
        }
        results.append(
            run_regression(RandomForestRegressor(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')

        # Train KNN
        print('\tKNeighbours regression...', end='')
        param_grid={
            'n_neighbors': [2, 3, 5, 7, 10, 15, 17, 20],
            'weights': ['uniform', 'distance'],
            'p': [1, 2, 3]
        }
        results.append(
            run_regression(KNeighborsRegressor(), model_name, param_grid, X_train, y_train, X_test, y_test)
        )
        print('done.')
    
    return results

results_df = pd.DataFrame(execute_train(X_train, y_train, X_test, y_test))

Training general model...
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.
Training G4 model...
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.
Training M5 model...
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.
Training F7 model...
	Linear regression...done.
	SVR regression...done.
	Decision Tree regression...done.
	Random Forest regression...done.
	KNeighbours regression...done.


In [12]:
results_df.sort_values(by=['model_type', 'val_loss'], ascending=True)

Unnamed: 0,model_type,model,best_model,best_params,val_loss,mean_test_loss,std_test_loss,median_test_loss
18,F7,RandomForestRegressor,"(DecisionTreeRegressor(max_depth=100, max_feat...","{'max_depth': 100, 'max_features': 'log2', 'mi...",0.179751,0.14932,0.179244,0.094756
19,F7,KNeighborsRegressor,"KNeighborsRegressor(n_neighbors=20, p=1)","{'n_neighbors': 20, 'p': 1, 'weights': 'uniform'}",0.213386,0.191041,0.233104,0.117596
17,F7,DecisionTreeRegressor,"DecisionTreeRegressor(max_depth=100, max_featu...","{'max_depth': 100, 'max_features': 'sqrt', 'mi...",0.228513,0.190944,0.28025,0.107504
16,F7,SVR,"SVR(C=1, kernel='poly')","{'C': 1, 'kernel': 'poly'}",0.314164,0.279555,0.341412,0.194506
15,F7,LinearRegression,LinearRegression(),{},0.323258,0.298291,0.279024,0.249574
6,G4,SVR,"SVR(C=1000, kernel='poly')","{'C': 1000, 'kernel': 'poly'}",0.141799,0.142712,0.174315,0.100056
8,G4,RandomForestRegressor,"(DecisionTreeRegressor(max_depth=10, max_featu...","{'max_depth': 10, 'max_features': 'log2', 'min...",0.148557,0.136937,0.155709,0.100181
9,G4,KNeighborsRegressor,"KNeighborsRegressor(n_neighbors=2, p=3)","{'n_neighbors': 2, 'p': 3, 'weights': 'uniform'}",0.169331,0.162204,0.265572,0.092885
7,G4,DecisionTreeRegressor,"DecisionTreeRegressor(max_depth=100, max_featu...","{'max_depth': 100, 'max_features': 'sqrt', 'mi...",0.178873,0.159632,0.39112,0.086957
5,G4,LinearRegression,LinearRegression(),{},0.196826,0.182161,0.164214,0.154151


## Get the best model for each filter efficiency

In [13]:
best_models = results_df.sort_values(by='val_loss')

# Get results for unique column filters
best_models_unique = best_models.drop_duplicates(subset='model_type')
best_models_unique

Unnamed: 0,model_type,model,best_model,best_params,val_loss,mean_test_loss,std_test_loss,median_test_loss
6,G4,SVR,"SVR(C=1000, kernel='poly')","{'C': 1000, 'kernel': 'poly'}",0.141799,0.142712,0.174315,0.100056
18,F7,RandomForestRegressor,"(DecisionTreeRegressor(max_depth=100, max_feat...","{'max_depth': 100, 'max_features': 'log2', 'mi...",0.179751,0.14932,0.179244,0.094756
11,M5,SVR,SVR(C=1),"{'C': 1, 'kernel': 'rbf'}",0.312444,0.261551,0.230983,0.210373
2,general,DecisionTreeRegressor,"DecisionTreeRegressor(max_depth=10, max_featur...","{'max_depth': 10, 'max_features': 'log2', 'min...",0.390854,0.22463,0.818783,0.118824


## Save the best models

In [18]:
for model_data in best_models_unique.to_dict(orient='records'):
    model_name = model_data['model_type'].lower()

    with open(os.path.join('..', 'models', f'best_model_{model_name}.pkl'), 'wb') as f:
        pickle.dump(model_data['best_model'], f)

In [30]:
features = X_train.columns
features

Index(['qty', 'filter_efficiency_G4', 'filter_efficiency_M5',
       'filter_efficiency_G3', 'filter_efficiency_F7', 'filter_efficiency_G2',
       'filter_efficiency_F9', 'filter_efficiency_M6', 'filter_efficiency_H14',
       'filter_efficiency_F8', 'filter_efficiency_H13', 'Length', 'Height',
       'Gutter'],
      dtype='object')