In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

path = "/content/drive/MyDrive/proyecto series de tiempo/data/"

In [4]:
train = pd.read_csv(path + 'train.csv', parse_dates=['Date'])
test_series = pd.read_csv(path + 'test.csv', parse_dates=['Date'])
stores = pd.read_csv(path + 'stores.csv')
features = pd.read_csv(path + 'features.csv', parse_dates=['Date'])

In [5]:
data = train.merge(features, on=['Store', 'Date']).merge(stores, on='Store')

In [6]:
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year
data['IsHoliday'] = data['IsHoliday_x']
data = data.drop(['IsHoliday_x', 'IsHoliday_y'], axis=1)
data = data.fillna(0)
data['Type_num'] = data['Type'].map({'A':3,'B':2,'C':1})

In [7]:
data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type,Size,Month,Year,IsHoliday,Type_num
0,1,1,2010-02-05,24924.5,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2,2010,False,3
1,1,2,2010-02-05,50605.27,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2,2010,False,3
2,1,3,2010-02-05,13740.12,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2,2010,False,3
3,1,4,2010-02-05,39954.04,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2,2010,False,3
4,1,5,2010-02-05,32229.38,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,2,2010,False,3


In [8]:
data.shape

(421570, 19)

# Regresión

Se probarán algunos algoritmos de regresión utilizando las características que se tienen junto a la serie de tiempo

In [9]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Se utilizarán las características: Store, Dept, Temperature, Fuel_Price, Markdown1, MarkDown2, Markdown3, Markdown5, CPI, Type_num, Month e IsHoliday. Tomando en cuenta la información obtenida del EDA para evitar colinealidad en los modelos.

In [10]:
features = ['Store','Dept','Temperature','Fuel_Price','MarkDown1','MarkDown2',
            'MarkDown3','MarkDown5','CPI','Type_num','Month','IsHoliday']
data_features = data[features].copy().values
sales = data['Weekly_Sales'].copy().values

Para la realización de los experimentos se van a utilizar el 70% de los datos para el entrenamiento y el 30% restante como datos de prueba

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data_features, sales, test_size=0.3, random_state=1234)

In [12]:
print("Dimensiones de conjuntos de entrenamiento y prueba")
print("X_train", X_train.shape)
print("X_test", X_test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)

Dimensiones de conjuntos de entrenamiento y prueba
X_train (295099, 12)
X_test (126471, 12)
y_train (295099,)
y_test (126471,)


Los datos se estandarizan para un entrenamiento más estable

In [13]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [14]:
# # Para cv (tarda mucho)
# modelos = {"SVR":SVR(), "RandomForestRegressor":RandomForestRegressor(),
#             "LinearRegression":LinearRegression()}
# resultados = []
# for modelo in modelos.values():
#   kf = KFold(n_splits=10, random_state=1234, shuffle=True)
#   cv_resultados = cross_val_score(modelo, X_train_scaled, y_train ,cv=kf)
#   resultados.append(cv_resultados)

# plt.boxplot(resultados, labels=modelos.keys())
# plt.show()

In [15]:
rfr = RandomForestRegressor()
rfr.fit(X_train_scaled, y_train)

In [16]:
rfr_train_pred = rfr.predict(X_train_scaled)
rfr_test_pred = rfr.predict(X_test_scaled)

In [17]:
print("MAE train data:", mean_absolute_error(y_train, rfr_train_pred))
print("MAE test data:", mean_absolute_error(y_test, rfr_test_pred))

MAE train data: 619.6686073111056
MAE test data: 1689.9066954021082


In [18]:
print("RMSE train data:", np.sqrt(mean_squared_error(y_train, rfr_train_pred)))
print("RMSE test data:", np.sqrt(mean_squared_error(y_test, rfr_test_pred)))

RMSE train data: 1656.358959718375
RMSE test data: 4472.746640311278


In [19]:
print("R2 Score train data:", r2_score(y_train, rfr_train_pred))
print("R2 Score test data:", r2_score(y_test, rfr_test_pred))

R2 Score train data: 0.9946778630783634
R2 Score test data: 0.9612675442469368


Parece ser que RandomForestRegressor obtiene buenos resultados, sobretodo en R2

In [20]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

In [21]:
lr_train_pred = lr.predict(X_train_scaled)
lr_test_pred = lr.predict(X_test_scaled)

In [22]:
print("MAE train data:", mean_absolute_error(y_train, lr_train_pred))
print("MAE test data:", mean_absolute_error(y_test, lr_test_pred))

MAE train data: 14787.46238288759
MAE test data: 14824.9521121359


In [23]:
print("RMSE train data:", np.sqrt(mean_squared_error(y_train, lr_train_pred)))
print("RMSE test data:", np.sqrt(mean_squared_error(y_test, lr_test_pred)))

RMSE train data: 21987.811321037647
RMSE test data: 22000.68056212827


In [24]:
print("R2 Score train data:", r2_score(y_train, lr_train_pred))
print("R2 Score test data:", r2_score(y_test, lr_test_pred))

R2 Score train data: 0.06213328272302421
R2 Score test data: 0.06287253155950245


No muy buenos resultados en comparación a RandomForestRegressor

In [25]:
poly_features = PolynomialFeatures(degree=3)
X_poly = poly_features.fit_transform(X_train_scaled)
X_poly_test = poly_features.transform(X_test_scaled)

In [26]:
pr = LinearRegression()
pr.fit(X_poly, y_train)

In [27]:
pr_train_pred = pr.predict(X_poly)
pr_test_pred = pr.predict(X_poly_test)

In [28]:
print("MAE train data:", mean_absolute_error(y_train, pr_train_pred))
print("MAE test data:", mean_absolute_error(y_test, pr_test_pred))

MAE train data: 12413.080364555097
MAE test data: 19470616647.502502


In [29]:
print("RMSE train data:", np.sqrt(mean_squared_error(y_train, pr_train_pred)))
print("RMSE test data:", np.sqrt(mean_squared_error(y_test, pr_test_pred)))

RMSE train data: 19802.828802332337
RMSE test data: 46619515800.643776


In [30]:
print("R2 Score train data:", r2_score(y_train, pr_train_pred))
print("R2 Score test data:", r2_score(y_test, pr_test_pred))

R2 Score train data: 0.23926817231315012
R2 Score test data: -4207866513201.4717


Parece ser que RandomForestRegressor es el modelo que logró obtener mejores predicciones, comparando con Linear y Polynomial Regression