In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.svm import LinearSVR, SVR
from sklearn.metrics import mean_squared_error

## Use this DF from here on with the filled Price

In [2]:
# Modelar dataset: the first file column becomes the index and the column at
# position 1 is parsed as dates.
df = pd.read_csv(
    filepath_or_buffer='./data/Modelar_UH2021_filled_precio.txt',
    index_col=0,
    parse_dates=[1],
)

  mask |= (ar1 == a)


In [3]:
# Estimar dataset: read with the same layout as the Modelar file (first column
# as index, column at position 1 parsed as dates).
df_est = pd.read_csv(
    filepath_or_buffer='./data/Estimar_UH2021_filled_precio.txt',
    index_col=0,
    parse_dates=[1],
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Feature engineering

### Modelar

In [4]:
# Remove exact duplicate rows before deriving any feature.
df = df.drop_duplicates()

# Numeric code for every known stock state; any other value falls back to
# np.select's default of 0.
estado_codes = (("No Rotura", 1), ("Transito", 0), ("Rotura", -1))
antiguedad_int = df["antiguedad"].astype("Int64")

df = df.assign(
    estado_num=np.select(
        [df["estado"] == label for label, _ in estado_codes],
        [code for _, code in estado_codes],
    ),
    weekday=df["fecha"].dt.weekday,
    # Age expressed relative to the smallest age present in this dataset.
    antiguedad_std=antiguedad_int - antiguedad_int.min(),
    categoria_dos=df["categoria_dos"].astype("Int64"),
).drop(columns=["estado", "antiguedad"])

In [5]:
# Cyclical features: project weekday and position-in-year onto the unit circle
# so that the end of a cycle sits next to its beginning.
fecha = df["fecha"]

weekday_angle = fecha.dt.weekday * (2*np.pi/7)
df["weekday_sin"] = np.sin(weekday_angle)
df["weekday_cos"] = np.cos(weekday_angle)

# Continuous month value: the month number plus the fraction of the month elapsed.
month_progress = fecha.dt.month + (fecha.dt.day / fecha.dt.days_in_month)
month_angle = (month_progress - 1) * (2*np.pi/12)
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

### Estimar

In [6]:
# Remove exact duplicate rows before deriving any feature.
df_est = df_est.drop_duplicates()

# Encode the stock state with the SAME codes used for the Modelar dataset.
# 'Rotura' must be listed explicitly: otherwise it would fall back to
# np.select's default (0) and be indistinguishable from 'Transito', while the
# models are trained with 'Rotura' encoded as -1.
conditions = [
    (df_est["estado"] == 'No Rotura'),
    (df_est["estado"] == 'Transito'),
    (df_est["estado"] == 'Rotura'),
]
values = [1, 0, -1]
df_est["estado_num"] = np.select(conditions, values)

df_est["weekday"] = df_est["fecha"].dt.weekday

# Values that cannot be parsed as numbers become missing (errors='coerce')
# before the cast to the nullable integer dtype.
df_est["antiguedad"] = pd.to_numeric(df_est["antiguedad"], errors='coerce')
df_est["antiguedad"] = df_est["antiguedad"].astype('Int64')
# NOTE(review): the offset is this dataset's own minimum, whereas the Modelar
# feature is offset by the Modelar minimum; both features only share the same
# scale if the two minima coincide - TODO confirm.
df_est["antiguedad_std"] = df_est["antiguedad"] - df_est["antiguedad"].min()
df_est["categoria_dos"] = pd.to_numeric(df_est["categoria_dos"], errors='coerce')
df_est["categoria_dos"] = df_est["categoria_dos"].astype("Int64")

df_est = df_est.drop(columns=["estado", "antiguedad"])

In [7]:
# Cyclical features for the Estimar dataset: weekday and position-in-year are
# projected onto the unit circle.
fecha_est = df_est["fecha"]

weekday_angle_est = fecha_est.dt.weekday * (2*np.pi/7)
df_est["weekday_sin"] = np.sin(weekday_angle_est)
df_est["weekday_cos"] = np.cos(weekday_angle_est)

# Continuous month value: the month number plus the fraction of the month elapsed.
month_progress_est = fecha_est.dt.month + (fecha_est.dt.day / fecha_est.dt.days_in_month)
month_angle_est = (month_progress_est - 1) * (2*np.pi/12)
df_est["month_sin"] = np.sin(month_angle_est)
df_est["month_cos"] = np.cos(month_angle_est)

### Further tuning

In [8]:
# Only use data before the pattern change
#df = df[df.fecha < datetime.datetime(2016,1,24)]
#df_est = df_est[df_est.fecha < datetime.datetime(2016,1,24)]
# Only use data after the pattern change
#df = df[df.fecha > datetime.datetime(2016,1,25)]
#df_est = df_est[df_est.fecha > datetime.datetime(2016,1,25)]

In [9]:
# Drop the columns that are not used as model inputs, in both datasets
# (weekday is still represented by its sin/cos encoding).
df, df_est = (
    frame.drop(["fecha", "id", "weekday"], axis="columns")
    for frame in (df, df_est)
)

In [10]:
# Discard every row that still has a missing value, in both datasets.
df, df_est = (frame.dropna() for frame in (df, df_est))

### Split dataset into categories

In [11]:
# One subset per categoria_uno. The Modelar subset is capped at
# number_samples_desired randomly drawn rows (fixed seed); the Estimar subset
# keeps every row of the category.
number_samples_desired = 1000

list_categoria_uno = sorted(df["categoria_uno"].unique())
data_cat = []
data_est_cat = []

for categoria in list_categoria_uno:
    modelar_rows = df[df["categoria_uno"] == categoria]
    number_samples_available = len(modelar_rows)
    if number_samples_desired > number_samples_available:
        print(f"Only {number_samples_available} samples for categoria_uno = {categoria}")

    # Never ask for more rows than the category actually has.
    number_samples = min(number_samples_desired, number_samples_available)

    # Modelar
    data_cat.append(
        modelar_rows.sample(n=number_samples, random_state=0).drop(columns="categoria_uno")
    )
    # Estimar
    estimar_rows = df_est[df_est["categoria_uno"] == categoria]
    data_est_cat.append(estimar_rows.drop(columns="categoria_uno"))

Only 487 samples for categoria_uno = D


## Train / test split

In [12]:
# Per-category features/target and their 90 % / 10 % train/test partitions.
# Every list is aligned with list_categoria_uno.
X, y = [], []
X_train, X_test = [], []
y_train, y_test = [], []

for category_data in data_cat:
    # Every column except the target is a feature (columns.difference returns
    # them in sorted order).
    feature_columns = category_data.columns.difference(["unidades_vendidas"])
    features = category_data[feature_columns]
    target = category_data["unidades_vendidas"]
    X.append(features)
    y.append(target)

    features_train, features_test, target_train, target_test = train_test_split(
        features, target, test_size=0.10, random_state=0
    )
    X_train.append(features_train)
    X_test.append(features_test)
    y_train.append(target_train)
    y_test.append(target_test)

print(X[0].shape, y[0].shape)
print(len(X_train[0]), len(y_train[0]), len(X_test[0]))
print(X_train[0].shape, y_train[0].shape, X_test[0].shape)

(1000, 11) (1000,)
900 900 100
(900, 11) (900,) (100, 11)


In [13]:
# Display the training features of the first category as a visual sanity check.
X_train[0]

Unnamed: 0,antiguedad_std,campaña,categoria_dos,dia_atipico,estado_num,month_cos,month_sin,precio,visitas,weekday_cos,weekday_sin
1561636,3139,0,127,-1,1,9.121663e-01,-0.409820,21.70,11,0.623490,-0.781831
3349378,1027,0,304,0,1,-9.884683e-01,-0.151428,39.73,200,-0.222521,-0.974928
2916798,1315,0,82,0,1,-7.363257e-01,0.676627,35.72,90,-0.222521,0.974928
2257463,351,0,82,0,1,5.309397e-01,0.847410,74.72,240,-0.222521,-0.974928
2477286,1022,0,127,0,1,8.435107e-02,0.996436,16.15,75,-0.222521,-0.974928
...,...,...,...,...,...,...,...,...,...,...,...
1422197,1078,1,236,1,1,7.547096e-01,-0.656059,45.35,428,-0.900969,0.433884
6400,370,0,82,0,1,-8.746197e-01,0.484810,47.74,0,1.000000,0.000000
1821418,951,0,82,-1,-1,9.948693e-01,0.101168,56.25,16,-0.222521,0.974928
2520748,635,0,255,0,1,6.123234e-17,1.000000,8.82,5,-0.900969,0.433884


## Normalization of selected features

In [14]:
# Columns of the train and test datasets that are passed through the
# Standard Scaler/Robust Scaler.
# Candidates deliberately left out of the scaler: fecha, id, campaña,
# categoria_uno, dia_atipico, estado_num, month_cos, month_sin, weekday,
# weekday_cos, weekday_sin. Add a name to the list below to scale it as well.
selected_columns = ["antiguedad_std", "categoria_dos", "precio", "visitas"]

In [15]:
# The scaler is fitted once, on ALL the rows of df, and shared by every category.
# NOTE(review): df also contains the rows that end up in the per-category test
# partitions, so the scaler statistics are not independent of the hold-out data.
scaler = StandardScaler().fit(df.loc[:, selected_columns])

X_train_scaled, X_train_non_scaled = [], []
X_test_scaled, X_test_non_scaled = [], []

for split_frames, scaled_out, non_scaled_out in (
    (X_train, X_train_scaled, X_train_non_scaled),
    (X_test, X_test_scaled, X_test_non_scaled),
):
    for frame in split_frames:
        # Columns that bypass the scaler (returned in sorted order).
        untouched = frame[frame.columns.difference(selected_columns)]
        rescaled = scaler.transform(frame.loc[:, selected_columns])

        non_scaled_out.append(untouched)
        # Final matrix layout: untouched columns first, scaled columns last.
        scaled_out.append(np.concatenate([untouched, rescaled], axis=1))

## Train models and predict validation samples

In [17]:
print("Start training: ", datetime.datetime.now())

# One independent regressor per categoria_uno, each evaluated on its own
# hold-out partition. Both lists are aligned with list_categoria_uno.
predictor = []
y_predicted = []

for train_features, train_target, test_features in zip(X_train_scaled, y_train, X_test_scaled):
    # Alternative kept for reference:
    #   SVR(kernel='rbf', C=2., tol=1e-5, max_iter=100000)
    model = LinearSVR(random_state=0, tol=1e-5, max_iter=100000)
    print(model)

    model.fit(train_features, train_target)
    predictor.append(model)
    y_predicted.append(model.predict(test_features))

print("End training: ", datetime.datetime.now())

Start training:  2021-03-11 12:02:33.809829
LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=100000,
          random_state=0, tol=1e-05, verbose=0)
LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=100000,
          random_state=0, tol=1e-05, verbose=0)
LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=100000,
          random_state=0, tol=1e-05, verbose=0)
LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=100000,
          random_state=0, tol=1e-05, verbose=0)
LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
          intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=100000,
          random_state=0, tol=1e-05, verbose=0)
LinearSVR(C=1.0, dual

## Evaluation

In [18]:
def metrica_atmira(y_test, y_predicted):
    """Compute the score to minimise: ``0.7 * rRMSE + 0.3 * (1 - CF)``.

    * ``rRMSE`` is the root mean squared error divided by the mean of ``y_test``.
    * ``CF`` is the fraction of samples whose prediction is greater than or
      equal to the true value.

    The RMSE is computed with numpy instead of
    ``mean_squared_error(..., squared=False)``, whose ``squared`` argument is
    no longer accepted by recent scikit-learn releases. Both inputs are
    converted to flat float arrays first, so they are compared by position
    and the index of a pandas Series is ignored.

    Parameters
    ----------
    y_test : array-like
        True values (list, numpy array or pandas Series).
    y_predicted : array-like
        Predicted values, with the same number of elements as ``y_test``.

    Returns
    -------
    float
        The value of the metric. The division by the mean of ``y_test`` is not
        guarded: a zero mean yields ``inf``/``nan`` following numpy semantics.

    Raises
    ------
    ValueError
        If the inputs are empty or do not have the same number of elements.

    The intermediate quantities (rmse, mean, rrmse and CF) are printed.
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    forecast = np.asarray(y_predicted, dtype=float).ravel()
    if actual.size != forecast.size:
        raise ValueError(
            f"y_test and y_predicted must have the same length "
            f"({actual.size} != {forecast.size})"
        )
    if actual.size == 0:
        raise ValueError("y_test and y_predicted must not be empty")

    # A negative value means there was more demand than predicted; a positive
    # (or zero) value counts towards CF.
    diferencia = forecast - actual

    rmse = np.sqrt(np.mean(diferencia ** 2))
    y_mean = actual.mean()
    rrmse = rmse / y_mean
    CF = np.sum(diferencia >= 0) / actual.size
    metrica_minimitzar = (0.7 * rrmse) + (0.3 * (1 - CF))
    print("rmse = ", rmse)
    print("y_mean = ", y_mean)
    print("rrmse = ", rrmse)
    print("CF =", CF)
    return metrica_minimitzar

In [19]:
# Reconstruct joint y_test and y_predicted
y_test_reconst = pd.Series()
y_predicted_reconst = np.array([])
for index in range(len(list_categoria_uno)):
    y_test_reconst = y_test_reconst.append(y_test[index])
    y_predicted_reconst = np.concatenate((y_predicted_reconst,y_predicted[index]))
    print(list_categoria_uno[index], len(y_test[index]), len(y_predicted[index]), 
          len(y_test_reconst), len(y_predicted_reconst))

A 100 100 100 100
B 100 100 200 200
C 100 100 300 300
D 49 49 349 349
E 100 100 449 449
F 100 100 549 549
G 100 100 649 649
H 100 100 749 749
I 100 100 849 849
K 100 100 949 949
L 100 100 1049 1049
N 100 100 1149 1149
O 100 100 1249 1249


In [20]:
# Score the pooled hold-out predictions of all categories; the returned metric
# value is shown as the cell output.
metrica_atmira(y_test_reconst, y_predicted_reconst)

rmse =  11.67899295726826
y_mean =  3.566853482786229
rrmse =  3.274312503620215
CF = 0.477982385908727


2.448624036761532