In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
# Read the three per-brand CSV exports and stack them into a single frame.
brand_csv_paths = [
    'data/df_Gucci_canada_US.csv',
    'data/df_LV_canada_UK.csv',
    'data/df_LP_canada_US.csv',
]
data_Gucci, data_LV, data_LP = (pd.read_csv(csv_path) for csv_path in brand_csv_paths)

# ignore_index=True discards the per-file row labels and renumbers from 0.
data = pd.concat(
    [data_Gucci, data_LV, data_LP],
    ignore_index=True,
)

In [3]:
# Persist the combined frame to disk (row index omitted), then show it
# as the cell's output.
data.to_csv(path_or_buf='Final_data.csv', index=False)
data

Unnamed: 0,website_name,competence_date,country_code,currency_code,brand,category1_code,category2_code,category3_code,product_code,title,itemurl,imageurl,full_price,price,full_price_eur,price_eur,flg_discount,Category,Group
0,Gucci,2023-11-17,CAN,CAD,GUCCI,MAKE-UP,LIPS,N.A.,6478599PL139216,216 My Cousin Rachel Rouge De Beauté Brillant,https://www.gucci.com/ca/en/pr/beauty/make-up/...,http://media.gucci.com/style/White_South_0_160...,61.0,61.0,41.07,41.07,0,MAKEUP,OTHER
1,Gucci,2023-11-17,CAN,CAD,GUCCI,FINE-JEWELRY,FINE-JEWELRY-FOR-WOMEN,N.A.,702394J85H08268,Gucci Link to Love baguette tourmaline necklace,https://www.gucci.com/ca/en/pr/jewelry-watches...,http://media.gucci.com/style/White_South_0_160...,5630.0,5630.0,3791.24,3791.24,0,ACCESSORIES,WOMEN
2,Gucci,2023-11-17,CAN,CAD,GUCCI,MAKE-UP,FACE,N.A.,7026019PRD99005,05 Gucci Blush De Beauté,https://www.gucci.com/ca/en/pr/beauty/make-up/...,http://media.gucci.com/style/White_South_0_160...,67.0,67.0,45.11,45.11,0,MAKEUP,OTHER
3,Gucci,2023-11-17,CAN,CAD,GUCCI,HANDBAGS,SHOULDER-BAGS-FOR-WOMEN,N.A.,702721U3ZDT3760,Gucci Diana small tote bag,https://www.gucci.com/ca/en/pr/women/handbags/...,http://media.gucci.com/style/White_South_0_160...,5155.0,5155.0,3471.38,3471.38,0,BAGS,WOMEN
4,Gucci,2023-11-17,CAN,CAD,GUCCI,FINE-JEWELRY,FINE-JEWELRY-FOR-WOMEN,N.A.,745649J85008000,Gucci Link to Love double earrings,https://www.gucci.com/ca/en/pr/jewelry-watches...,http://media.gucci.com/style/White_South_0_160...,7670.0,7670.0,5164.98,5164.98,0,ACCESSORIES,WOMEN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28576,Loro Piana,2023-11-17,USA,USD,LORO PIANA,WOMAN,TROUSERS-AND-SHORTS,N.A.,FAN4351,G,https://us.loropiana.com/en/p/woman/trousers-a...,https://media.loropiana.com/HYBRIS/FAN/FAN4351...,9625.0,9625.0,8879.15,8879.15,0,CLOTHING,WOMEN
28577,Loro Piana,2023-11-17,USA,USD,LORO PIANA,WOMAN,ACCESSORIES,HATS,FAN4625,E,https://us.loropiana.com/en/p/woman/accessorie...,https://media.loropiana.com/HYBRIS/FAN/FAN4625...,750.0,750.0,691.88,691.88,0,ACCESSORIES,WOMEN
28578,Loro Piana,2023-11-17,USA,USD,LORO PIANA,WOMAN,LEATHER-GOODS,TOP-HANDLE-AND-TOTES,FAN4833,L,https://us.loropiana.com/en/p/fall-winter/woma...,https://media.loropiana.com/HYBRIS/FAN/FAN4833...,5325.0,5325.0,4912.36,4912.36,0,BAGS,WOMEN
28579,Loro Piana,2023-11-17,USA,USD,LORO PIANA,MAN,KNITWEAR,N.A.,FAN4943,S,https://us.loropiana.com/p/holiday-season/man/...,https://media.loropiana.com/HYBRIS/FAN/FAN4943...,6600.0,6600.0,6088.56,6088.56,0,CLOTHING,MEN


In [4]:
# Columns of `data` used as model inputs.
feature_columns = [
    'website_name',
    'country_code',
    'brand',
    'product_code',
    'title',
    'Category',
    'Group',
]
X = data[feature_columns]

# Regression target: the `full_price_eur` column.
y = data['full_price_eur']

In [5]:
# Encode every string feature as integer codes so the regressors can use it.
#
# X was selected out of `data`, so take an independent copy first: writing
# into the selection directly raises SettingWithCopyWarning and makes it
# ambiguous which frame is being modified.
X = X.copy()

# Columns holding strings that must be converted to numeric form.
string_features = ['website_name', 'country_code', 'brand', 'product_code', 'title', 'Category', 'Group']

# One fitted encoder per column, kept so codes can be mapped back to the
# original strings later with `label_encoders[col].inverse_transform(...)`.
label_encoders = {}

for feature in string_features:
    label_encoder = LabelEncoder()
    # Plain column assignment replaces the column, so it takes the encoder's
    # integer dtype. `X.loc[:, feature] = ...` sets values in place and, on
    # recent pandas versions, can leave the column as `object` dtype.
    X[feature] = label_encoder.fit_transform(X[feature])
    label_encoders[feature] = label_encoder

# NOTE(review): the integer codes impose an arbitrary ordering on the
# categories; confirm that is acceptable for the distance- and
# linear-based models used later.
X


Unnamed: 0,website_name,country_code,brand,product_code,title,Category,Group
0,0,0,0,2053,162,11,5
1,0,0,0,3303,4124,0,6
2,0,0,0,3329,60,11,5
3,0,0,0,3346,4039,1,6
4,0,0,0,5620,4134,0,6
...,...,...,...,...,...,...,...
28576,1,2,1,8658,3193,3,6
28577,1,2,1,8703,2667,0,6
28578,1,2,1,8723,4974,1,6
28579,1,2,1,8730,9018,3,4


In [6]:
def initializeModels():
    """Create one untuned instance of each regressor under comparison.

    Returns:
        tuple: ``(linear regression, k-nearest neighbours, support vector
        regressor, random forest)``, in that order. Every estimator except
        the linear regression uses its default constructor arguments.
    """
    return (
        LinearRegression(fit_intercept=True),
        KNeighborsRegressor(),
        SVR(),
        RandomForestRegressor(),
    )

In [7]:
def getBestParams(models, X, y):
    """Grid-search hyper-parameters for each model with 3-fold cross-validation.

    The search grid is chosen from the estimator's class rather than from
    its position in ``models``, so the caller may pass the supported models
    in any order (or only some of them) without mispairing model and grid.

    Args:
        models: Iterable of unfitted estimators. Supported classes are
            ``KNeighborsRegressor``, ``SVR`` and ``RandomForestRegressor``.
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target values passed to ``GridSearchCV.fit``.

    Returns:
        tuple: ``(best_params, results)`` where ``best_params`` is a list of
        best-parameter dicts in the same order as ``models`` and ``results``
        is a human-readable report string.

    Raises:
        ValueError: If a model has no parameter grid defined. This is raised
            before any search starts so no fitting time is wasted.
    """
    # Search space per estimator class.
    param_grids = {
        KNeighborsRegressor: {'n_neighbors': [1, 3, 5, 7, 9, 12]},
        SVR: {
            'C': [0.1, 1, 10],
            'epsilon': [0.1, 0.2, 0.5, 0.7, 0.8, 0.9, 1],
            'kernel': ['linear', 'rbf', 'poly']},
        RandomForestRegressor: {
            'n_estimators': [50, 100, 150, 200, 250, 300, 350, 400],
            'max_depth': [4, 6, 8, 10, 12, 16, 18, 20],
            'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8]},
    }

    def _grid_for(model):
        # Return the grid registered for the model's class, or fail loudly.
        for estimator_cls, grid in param_grids.items():
            if isinstance(model, estimator_cls):
                return grid
        raise ValueError(f"No parameter grid defined for {model}")

    # Resolve every grid up front so an unsupported model is reported
    # before any (potentially very long) search has run.
    searches = [(model, _grid_for(model)) for model in models]

    best_params = []
    report_lines = []

    for model, grid in searches:
        print("Fetching best parameters for", model)
        # Both metrics are scored; the winner is chosen (and refit) on
        # negative RMSE.
        grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=3, scoring=['neg_root_mean_squared_error', 'r2'], refit='neg_root_mean_squared_error', verbose=0, n_jobs=-1)
        grid_search.fit(X, y)

        # Mean cross-validated R^2 of the parameter set that won on RMSE.
        best_r2 = grid_search.cv_results_['mean_test_r2'][grid_search.best_index_]

        report_lines.append(f"Results for {model}:\n")
        report_lines.append(f"Negative RMS error for {model}: {grid_search.best_score_}\n")
        report_lines.append(f"R-squared for {model}: {best_r2}\n")

        best_params.append(grid_search.best_params_)

    print()
    return best_params, "".join(report_lines)
  

In [8]:
def defineModels(params=None):
    """Build the final scaled pipelines using the tuned hyper-parameters.

    Args:
        params: Optional list of best-parameter dicts ordered as
            ``[k-nearest neighbours, SVR, random forest]``. This is the
            order ``getBestParams`` produces for the tunable models returned
            by ``initializeModels``. When omitted, the module-level
            ``best_params`` is used.

    Returns:
        tuple: ``(linear regression, k-nearest neighbours, random forest,
        SVR)`` pipelines, each preceded by a ``StandardScaler`` step.
    """
    if params is None:
        # Fall back to the notebook-level variable set by the tuning cell.
        params = best_params

    # Positions follow the tuning order: 0 = KNN, 1 = SVR, 2 = random forest.
    knn_params, svr_params, forest_params = params[0], params[1], params[2]

    model_linear_regression = make_pipeline(
        StandardScaler(),
        LinearRegression(fit_intercept=True))

    model_knn_neighbours = make_pipeline(
        StandardScaler(),
        KNeighborsRegressor(n_neighbors=knn_params['n_neighbors']))

    model_random_forest = make_pipeline(
        StandardScaler(),
        RandomForestRegressor(
            n_estimators=forest_params['n_estimators'],
            max_depth=forest_params['max_depth'],
            min_samples_leaf=forest_params['min_samples_leaf']))

    model_sv_reg = make_pipeline(
        StandardScaler(),
        SVR(
            C=svr_params['C'],
            epsilon=svr_params['epsilon'],
            kernel=svr_params['kernel']))

    return model_linear_regression, model_knn_neighbours, model_random_forest, model_sv_reg



In [9]:
# Evaluate models
def evaluate_models(models, X, y, num_test=1, show_progress=False):
    """Fit and score each model on repeated random train/test splits.

    For every model, ``num_test`` fresh splits are drawn with
    ``train_test_split`` (default split settings), the model is refit on the
    training part, and the mean squared error and R^2 on the test part are
    averaged over all runs and printed. Nothing is returned.

    Args:
        models: Iterable of estimators exposing ``fit`` and ``predict``.
        X: Feature matrix.
        y: Target values.
        num_test: Number of random splits to average over.
        show_progress: When true, print a progress line on every 20th run
            (runs 0, 20, 40, ...).
    """
    for estimator in models:
        print(f"Evaluating {estimator} for {num_test} test(s)...")

        # Running totals: index 0 accumulates MSE, index 1 accumulates R^2.
        totals = np.zeros(2, dtype=np.float64)

        for run in range(num_test):
            # New random partition for this run.
            X_train, X_test, y_train, y_test = train_test_split(X, y)

            # Refit on the training part and predict the held-out part.
            estimator.fit(X_train, y_train)
            y_pred = estimator.predict(X_test)

            totals += (
                mean_squared_error(y_test, y_pred),
                r2_score(y_test, y_pred),
            )

            if show_progress and run % 20 == 0:
                print(f"\tTest {run} completed")

        # Average over the runs; lower MSE and higher R^2 are better.
        averages = totals / num_test
        print(f"Results for {estimator}:")
        print(f"\tMean Squared Error: {averages[0]}")
        print(f"\tR-squared: {averages[1]}\n")

In [None]:

# Tune the hyper-parameters of every model that has something to tune and
# build a text report of the outcome.
# NOTE(review): the report is only printed here; nothing is written to a
# file or results folder yet.

# Fixed seed so the train/validation split is the same on every
# Restart & Run All.
SPLIT_SEED = 42

# Linear regression has no hyper-parameters to search, so drop it. It is
# the first estimator returned by initializeModels().
models = list(initializeModels())[1:]

print("Fetching best parameters...")

# Hold out 20% of the rows; the search only ever sees the training part.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=SPLIT_SEED)

best_params, results = getBestParams(models, X_train, y_train)
results += "\n"

# Append one "model: best parameters" line per tuned model.
for model, params in zip(models, best_params):
    results += f"{model}: {params}\n"

# Print once, after the report is complete (not once per model).
print(results)
    


Fetching best parameters...
Fetching best parameters for KNeighborsRegressor()
Fetching best parameters for SVR()
