# PI DATA - BigMart

In [None]:
import pandas as pd
import pandas_profiling as ppr

import plotly.graph_objects as go
import plotly.express as px

import datetime as dt
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression


## Importación de los datos

In [None]:
# Read the BigMart train and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Train_BigMart.csv', 'data/Test_BigMart.csv')
)

## Exploración de los datos

In [None]:
def profile_data(df):
    """Build a pandas-profiling report for ``df``.

    Parameters
    ----------
    df : object
        Data to profile; it is passed unchanged to ``ppr.ProfileReport``.

    Returns
    -------
    The ``ProfileReport`` created with the title "Pandas Profiling Report".
    """
    return ppr.ProfileReport(df, title="Pandas Profiling Report")

Se utilizó la biblioteca pandas_profiling para revisar de manera detallada el conjunto de datos. 

In [None]:
# Build the profiling report for the training data; as the last expression
# of the cell, the returned report is what the notebook displays.
profile_data(train_df)

### Observaciones

- Item_Identifier cuenta con alta cardinalidad. Se observa que un mismo item se repite hasta 10 veces, probablemente porque se vende hasta en 10 tiendas distintas.

- Item_Weight cuenta con el 17.2% de datos faltantes, por lo que se trabajará en completar la información.

- Item_Fat_Content tiene problemas con las etiquetas de los datos, estas no están estandarizadas.

- Item_Visibility cuenta con un 6.2% de ceros y además tiene una distribución asimétrica con cola a la derecha. Este atributo puede normalizarse.

- Item_Type cuenta con 16 categorías. Se intentará agrupar las categorías para disminuir su número y mejorar el rendimiento de la predicción.

- Item_MRP tiene una alta correlación (Pearson y Phik ) con Item_Outlet_Sales

- Outlet_Identifier tiene una alta correlación (Phik) con Item_Outlet_Sales

- Outlet_Establishment_Year requiere ser transformada a años de antigüedad de la tienda.

- Outlet_Size contiene un 28.3% de datos faltantes

- Outlet_Location_Type sin comentarios

- Outlet_Type sin comentarios

- Item_Outlet_Sales será la variable objetivo a predecir.


## Gráficas de distribución

In [None]:
# Horizontal bar chart: number of rows per Item_Identifier.
item_identifier_counts = train_df['Item_Identifier'].value_counts()
item_identifier_graph = go.Figure(
    data=[
        go.Bar(
            x=item_identifier_counts.values,
            y=item_identifier_counts.index,
            orientation='h',
            marker_color='#5579C6',
        )
    ]
)
item_identifier_graph.update_layout(
    title="Item Identifier",
    xaxis_title="Count",
    yaxis_title="Item_Identifier",
)
item_identifier_graph.show()

In [None]:
# Histogram of the raw Item_Weight values.
item_weight_graph = go.Figure(
    data=[go.Histogram(x=train_df['Item_Weight'], marker_color='#5579C6')]
)
item_weight_graph.update_layout(
    title="Item Weight Histogram",
    xaxis_title="Item_Weight",
    yaxis_title="Count",
)
item_weight_graph.show()

In [None]:
# Horizontal box plot of the raw Item_Weight values.
item_weight_boxplot = go.Figure(
    data=[go.Box(x=train_df['Item_Weight'], marker_color='#5579C6')]
)
item_weight_boxplot.update_layout(title="Item Weight Boxplot")
item_weight_boxplot.show()

In [None]:
# Horizontal bar chart of the Item_Fat_Content label frequencies.
item_fat_content_counts = train_df['Item_Fat_Content'].value_counts()
item_fat_content_graph = go.Figure( go.Bar(
    y=item_fat_content_counts.index,
    x=item_fat_content_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# The bars are horizontal: counts run along x and the categories along y,
# so the axis titles must be assigned accordingly.
item_fat_content_graph.update_layout(title="Item Fat Content",
                                     xaxis_title="Count",
                                     yaxis_title="Item_Fat_Content")
item_fat_content_graph.show()

In [None]:
# Histogram of the raw Item_Visibility values.
item_visibility_graph = go.Figure(
    data=[go.Histogram(x=train_df['Item_Visibility'], marker_color='#5579C6')]
)
item_visibility_graph.update_layout(
    title="Item Visibility",
    xaxis_title="Item_Visibility",
    yaxis_title="Count",
)
item_visibility_graph.show()

In [None]:
# Horizontal box plot of the raw Item_Visibility values.
item_visibility_boxplot = go.Figure(
    data=[go.Box(x=train_df['Item_Visibility'], marker_color='#5579C6')]
)
item_visibility_boxplot.update_layout(title="Item Visibility Boxplot")
item_visibility_boxplot.show()

In [None]:
# Horizontal bar chart of the Item_Type frequencies.
item_type_counts = train_df['Item_Type'].value_counts()
item_type_bar = go.Figure( go.Bar(
    y=item_type_counts.index,
    x=item_type_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
item_type_bar.update_layout(title="Item Type",
                            xaxis_title="Count",
                            yaxis_title="Item_Type")
item_type_bar.show()

In [None]:
# Histogram of Item_MRP.
item_mrp_histogram = go.Figure(
    data=[go.Histogram(x=train_df['Item_MRP'], marker_color='#5579C6')]
)
item_mrp_histogram.update_layout(
    title="Item MRP Histogram",
    xaxis_title="Item_MRP",
    yaxis_title="Count",
)
item_mrp_histogram.show()

In [None]:
# Horizontal box plot of Item_MRP.
item_mrp_boxplot = go.Figure(
    data=[go.Box(x=train_df['Item_MRP'], marker_color='#5579C6')]
)
item_mrp_boxplot.update_layout(title="Item MRP Boxplot")
item_mrp_boxplot.show()

In [None]:
# Horizontal bar chart of the number of rows per Outlet_Identifier.
outlet_identifier_counts = train_df['Outlet_Identifier'].value_counts()
outlet_identifier_bar = go.Figure( go.Bar(
    y=outlet_identifier_counts.index,
    x=outlet_identifier_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, outlets on the y axis.
outlet_identifier_bar.update_layout(title="Outlet Identifier",
                                    xaxis_title="Count",
                                    yaxis_title="Outlet_Identifier")
outlet_identifier_bar.show()

In [None]:
# Histogram of Outlet_Establishment_Year.
outlet_establishment_year_histogram = go.Figure(
    data=[
        go.Histogram(
            x=train_df['Outlet_Establishment_Year'],
            marker_color='#5579C6',
        )
    ]
)
outlet_establishment_year_histogram.update_layout(
    title="Outlet Establishment Year Histogram",
    xaxis_title="Outlet_Establishment_Year",
    yaxis_title="Count",
)
outlet_establishment_year_histogram.show()

In [None]:
# Horizontal bar chart of the Outlet_Size frequencies (missing values are
# not counted by value_counts).
outlet_size_counts = train_df['Outlet_Size'].value_counts()
outlet_size_bar = go.Figure( go.Bar(
    y=outlet_size_counts.index,
    x=outlet_size_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
outlet_size_bar.update_layout(title="Outlet Size",
                              xaxis_title="Count",
                              yaxis_title="Outlet_Size")
outlet_size_bar.show()

In [None]:
# Horizontal bar chart of the Outlet_Location_Type frequencies.
outlet_location_type_counts = train_df['Outlet_Location_Type'].value_counts()
outlet_location_type_bar = go.Figure( go.Bar(
    y=outlet_location_type_counts.index,
    x=outlet_location_type_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
outlet_location_type_bar.update_layout(title="Outlet Location Type",
                                       xaxis_title="Count",
                                       yaxis_title="Outlet_Location_Type")
outlet_location_type_bar.show()

In [None]:
# Horizontal bar chart of the Outlet_Type frequencies.
outlet_type_counts = train_df['Outlet_Type'].value_counts()
outlet_type_bar = go.Figure( go.Bar(
    y=outlet_type_counts.index,
    x=outlet_type_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
outlet_type_bar.update_layout(title="Outlet Type",
                              xaxis_title="Count",
                              yaxis_title="Outlet_Type")
outlet_type_bar.show()

In [None]:
# Histogram of the target variable Item_Outlet_Sales.
item_outlet_sales_histogram = go.Figure(
    data=[go.Histogram(x=train_df['Item_Outlet_Sales'], marker_color='#5579C6')]
)
item_outlet_sales_histogram.update_layout(
    title="Item Outlet Sales Histogram",
    xaxis_title="Item_Outlet_Sales",
    yaxis_title="Count",
)
item_outlet_sales_histogram.show()

In [None]:
# Horizontal box plot of the target variable Item_Outlet_Sales.
item_outlet_sales_boxplot = go.Figure(
    data=[go.Box(x=train_df['Item_Outlet_Sales'], marker_color='#5579C6')]
)
item_outlet_sales_boxplot.update_layout(title="Item Outlet Sales Boxplot")
item_outlet_sales_boxplot.show()


### Observaciones

- Item_Visibility contiene outliers que deben ser tratados.

## Limpieza de datos

### Item_Weight

Se reemplazaron los datos faltantes por la mediana.

In [None]:
# Impute missing Item_Weight values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form operates on an
# intermediate object and does not update train_df under pandas
# copy-on-write (it already emits a FutureWarning in pandas 2.x).
item_weight_median = train_df['Item_Weight'].median()
train_df['Item_Weight'] = train_df['Item_Weight'].fillna(item_weight_median)
# Histogram of Item_Weight after imputation.
item_weight_histogram = go.Figure( go.Histogram(
    x=train_df['Item_Weight'],
    marker_color='#5579C6'
    ))
item_weight_histogram.update_layout(title="Item Weight Histogram",
                                xaxis_title="Item_Weight",
                                yaxis_title="Count")
item_weight_histogram.show()
# Box plot of Item_Weight after imputation.
item_weight_boxplot = go.Figure( go.Box(
    x=train_df['Item_Weight'],
    marker_color='#5579C6'
    ))
item_weight_boxplot.update_layout(title="Item Weight Boxplot")
item_weight_boxplot.show()


### Item_Fat_Content

Se renombraron las etiquetas para encajarlas en las dos categorías existentes.

In [None]:
# Collapse the inconsistent Item_Fat_Content spellings into the two real
# categories. Labels that are already canonical need no entry in the mapping.
fat_content_labels = {
    "low fat": "Low Fat",
    "LF": "Low Fat",
    "reg": "Regular",
    "reg ": "Regular",
}
train_df["Item_Fat_Content"] = train_df["Item_Fat_Content"].replace(fat_content_labels)
# Horizontal bar chart of the cleaned label frequencies.
item_fat_content_counts = train_df['Item_Fat_Content'].value_counts()
item_fat_content_bar = go.Figure( go.Bar(
    y=item_fat_content_counts.index,
    x=item_fat_content_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
item_fat_content_bar.update_layout(title="Item Fat Content",
                                   xaxis_title="Count",
                                   yaxis_title="Item_Fat_Content")
item_fat_content_bar.show()


### Item_Visibility

Se reemplazaron los valores en cero (tratados como datos faltantes) por el promedio de visibilidad del producto en todas las tiendas.

In [None]:
# Mean Item_Visibility of each product, broadcast back onto every row of
# that product as a new column.
visibility_by_item = train_df.groupby('Item_Identifier')['Item_Visibility']
train_df['Item_Visibility_avg'] = visibility_by_item.transform('mean')
# Histogram of the per-product averages.
item_visibility_avg_histogram = go.Figure(
    data=[go.Histogram(x=train_df['Item_Visibility_avg'], marker_color='#5579C6')]
)
item_visibility_avg_histogram.update_layout(
    title="Item Visibility Avg Histogram",
    xaxis_title="Item_Visibility_avg",
    yaxis_title="Count",
)
item_visibility_avg_histogram.show()

In [None]:
# Replace zero Item_Visibility values with the product's mean visibility.
# The column is selected *before* aggregating: groupby(...).mean() on the
# whole frame tries to average the string columns too and raises TypeError
# on pandas >= 2.0.
item_visibility_avg = train_df.groupby("Item_Identifier")["Item_Visibility"].mean()
# Vectorised replacement: only the rows whose visibility is exactly 0 are
# overwritten, using each row's Item_Identifier to look up the mean.
zero_visibility_rows = train_df["Item_Visibility"] == 0
train_df.loc[zero_visibility_rows, "Item_Visibility"] = (
    train_df.loc[zero_visibility_rows, "Item_Identifier"].map(item_visibility_avg)
)
# Histogram of Item_Visibility after the replacement.
item_visibility_new_histogram = go.Figure( go.Histogram(
    x=train_df['Item_Visibility'],
    marker_color='#5579C6'
    ))
item_visibility_new_histogram.update_layout(title="Item Visibility Histogram",
                                xaxis_title="Item_Visibility",
                                yaxis_title="Count")
item_visibility_new_histogram.show()

### Outlet_Size

Se reemplazaron los valores nulos por la moda según el outlet_type

In [None]:
# Inspect the Outlet_Size mode(s) of every outlet in the training data.
# This cell belongs to the train_df cleaning section, so it inspects
# train_df (the original looked at test_df here).
# NOTE(review): confirm that inspecting test_df was not intentional.
# An empty list means the outlet has no recorded Outlet_Size at all.
outlet_size_mode_by_outlet_identifier = {
    outlet_id: train_df.loc[
        train_df['Outlet_Identifier'] == outlet_id, 'Outlet_Size'
    ].mode().tolist()
    for outlet_id in train_df['Outlet_Identifier'].unique()
}
print(outlet_size_mode_by_outlet_identifier)

In [None]:
# Impute missing Outlet_Size with the most frequent size of the same
# Outlet_Type.
# Series.mode() returns a Series (possibly with several tied values), which
# is not a valid scalar aggregate; take the first mode explicitly and fall
# back to a missing value when a group has no recorded size at all.
outlet_size_mode = train_df.groupby('Outlet_Type')['Outlet_Size'].agg(
    lambda sizes: sizes.mode().iat[0] if sizes.notna().any() else None
)
print(outlet_size_mode)

# Fill only the missing entries, looking up each row's Outlet_Type in the
# per-type mode. Assigning back avoids the chained fillna(inplace=True),
# which does not update train_df under pandas copy-on-write, and removes the
# need for a "None" string sentinel.
train_df["Outlet_Size"] = train_df["Outlet_Size"].fillna(
    train_df["Outlet_Type"].map(outlet_size_mode)
)

# Horizontal bar chart of Outlet_Size after imputation.
outlet_size_counts = train_df['Outlet_Size'].value_counts()
outlet_size_bar = go.Figure( go.Bar(
    y=outlet_size_counts.index,
    x=outlet_size_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
outlet_size_bar.update_layout(title="Outlet Size",
                              xaxis_title="Count",
                              yaxis_title="Outlet_Size")
outlet_size_bar.show()


# Análisis Bivariable

In [None]:
# Scatter plot of sales against item weight.
item_outlet_sales_item_weight_scatter = go.Figure(
    data=[
        go.Scatter(
            x=train_df['Item_Weight'],
            y=train_df['Item_Outlet_Sales'],
            mode='markers',
            marker_color='#5579C6',
        )
    ]
)
item_outlet_sales_item_weight_scatter.update_layout(
    title="Item Outlet Sales vs Item Weight Scatter Plot",
    xaxis_title="Item_Weight",
    yaxis_title="Item_Outlet_Sales",
)
item_outlet_sales_item_weight_scatter.show()


In [None]:
# Mean Item_Outlet_Sales per Item_Fat_Content.
# Select the sales column before aggregating: groupby(...).mean() on the
# whole frame also tries to average string columns and raises TypeError on
# pandas >= 2.0.
item_fat_content_main_by_item_outlet_sales = train_df.groupby('Item_Fat_Content')["Item_Outlet_Sales"].mean()
# Horizontal bar chart of the group means.
item_fat_content_main_by_item_outlet_sales_bar = go.Figure( go.Bar(
    y=item_fat_content_main_by_item_outlet_sales.index,
    x=item_fat_content_main_by_item_outlet_sales.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# The title said "Main"; the chart shows the mean.
item_fat_content_main_by_item_outlet_sales_bar.update_layout(title="Item Outlet Sales Mean by Item Fat Content",
                                xaxis_title="Item_Outlet_Sales",
                                yaxis_title="Item_Fat_Content")
item_fat_content_main_by_item_outlet_sales_bar.show()

In [None]:
# Scatter plot of sales against item visibility.
item_visibility_item_outlet_sales_scatter = go.Figure(
    data=[
        go.Scatter(
            x=train_df['Item_Visibility'],
            y=train_df['Item_Outlet_Sales'],
            mode='markers',
            marker_color='#5579C6',
        )
    ]
)
item_visibility_item_outlet_sales_scatter.update_layout(
    title="Item Visibility vs Item Outlet Sales Scatter Plot",
    xaxis_title="Item_Visibility",
    yaxis_title="Item_Outlet_Sales",
)
item_visibility_item_outlet_sales_scatter.show()

In [None]:
# Mean Item_Outlet_Sales per Item_Type.
# Select the sales column before aggregating: groupby(...).mean() on the
# whole frame also tries to average string columns and raises TypeError on
# pandas >= 2.0.
item_outlet_sales_item_type_media = train_df.groupby('Item_Type')["Item_Outlet_Sales"].mean()
# Horizontal bar chart of the group means.
item_outlet_sales_item_type_media_bar = go.Figure( go.Bar(
    y=item_outlet_sales_item_type_media.index,
    x=item_outlet_sales_item_type_media.values,
    marker_color='#5579C6',
    orientation='h'
    ))
item_outlet_sales_item_type_media_bar.update_layout(title="Item Outlet Sales Mean by Item Type ",
                                xaxis_title="Item_Outlet_Sales",
                                yaxis_title="Item_Type")
item_outlet_sales_item_type_media_bar.show()

In [None]:
# Scatter plot of sales against the item's list price (MRP).
item_mrp_item_outlet_sales_scatter = go.Figure(
    data=[
        go.Scatter(
            x=train_df['Item_MRP'],
            y=train_df['Item_Outlet_Sales'],
            mode='markers',
            marker_color='#5579C6',
        )
    ]
)
item_mrp_item_outlet_sales_scatter.update_layout(
    title="Item MRP vs Item Outlet Sales Scatter Plot",
    xaxis_title="Item_MRP",
    yaxis_title="Item_Outlet_Sales",
)
item_mrp_item_outlet_sales_scatter.show()

In [None]:
# 3-D scatter of price vs. sales, separated and coloured by Outlet_Type.
# plotly.express is already imported as `px` in the notebook's import cell,
# so the redundant mid-notebook import was dropped.
fig = px.scatter_3d(train_df, x='Item_MRP', y='Item_Outlet_Sales', z='Outlet_Type',
              color='Outlet_Type')
fig.show()

In [None]:
# 3-D scatter of price vs. sales, separated and coloured by Outlet_Size.
fig = px.scatter_3d(
    data_frame=train_df,
    x='Item_MRP',
    y='Item_Outlet_Sales',
    z='Outlet_Size',
    color='Outlet_Size',
)
fig.show()

In [None]:
# Mean Item_Outlet_Sales per Outlet_Identifier.
# Select the sales column before aggregating: groupby(...).mean() on the
# whole frame also tries to average string columns and raises TypeError on
# pandas >= 2.0.
item_outlet_sales_mean_by_outlet_identifier = train_df.groupby('Outlet_Identifier')["Item_Outlet_Sales"].mean()
# Horizontal bar chart of the group means.
item_outlet_sales_mean_by_outlet_identifier_bar = go.Figure( go.Bar(
    y=item_outlet_sales_mean_by_outlet_identifier.index,
    x=item_outlet_sales_mean_by_outlet_identifier.values,
    marker_color='#5579C6',
    orientation='h'
    ))
item_outlet_sales_mean_by_outlet_identifier_bar.update_layout(title="Item Outlet Sales Mean by Outlet Identifier",
                                xaxis_title="Item_Outlet_Sales",
                                yaxis_title="Outlet_Identifier")
item_outlet_sales_mean_by_outlet_identifier_bar.show()
                                                                                    

In [None]:
# Mean Item_Outlet_Sales per Outlet_Establishment_Year.
# Select the sales column before aggregating: groupby(...).mean() on the
# whole frame also tries to average string columns and raises TypeError on
# pandas >= 2.0.
item_outlet_sales_mean_by_outlet_establishment_year = train_df.groupby('Outlet_Establishment_Year')["Item_Outlet_Sales"].mean()
# Horizontal bar chart of the group means.
item_outlet_sales_mean_by_outlet_establishment_year_bar = go.Figure( go.Bar(
    y=item_outlet_sales_mean_by_outlet_establishment_year.index,
    x=item_outlet_sales_mean_by_outlet_establishment_year.values,
    marker_color='#5579C6',
    orientation='h'
    ))
item_outlet_sales_mean_by_outlet_establishment_year_bar.update_layout(title="Item Outlet Sales Mean by Outlet Establishment Year",
                                xaxis_title="Item_Outlet_Sales",
                                yaxis_title="Outlet_Establishment_Year")
item_outlet_sales_mean_by_outlet_establishment_year_bar.show()

In [None]:
# Mean Item_Outlet_Sales per Outlet_Size.
# Select the sales column before aggregating: groupby(...).mean() on the
# whole frame also tries to average string columns and raises TypeError on
# pandas >= 2.0.
item_outlet_sales_mean_by_outlet_size = train_df.groupby('Outlet_Size')["Item_Outlet_Sales"].mean()
# Horizontal bar chart of the group means.
item_outlet_sales_mean_by_outlet_size_bar = go.Figure( go.Bar(
    y=item_outlet_sales_mean_by_outlet_size.index,
    x=item_outlet_sales_mean_by_outlet_size.values,
    marker_color='#5579C6',
    orientation='h'
    ))
item_outlet_sales_mean_by_outlet_size_bar.update_layout(title="Item Outlet Sales Mean by Outlet Size",
                                xaxis_title="Item_Outlet_Sales",
                                yaxis_title="Outlet_Size")
item_outlet_sales_mean_by_outlet_size_bar.show()

In [None]:
# Mean Item_Outlet_Sales per Outlet_Location_Type.
# Select the sales column before aggregating: groupby(...).mean() on the
# whole frame also tries to average string columns and raises TypeError on
# pandas >= 2.0.
item_outlet_sales_mean_by_outlet_location_type = train_df.groupby('Outlet_Location_Type')["Item_Outlet_Sales"].mean()
# Horizontal bar chart of the group means.
item_outlet_sales_mean_by_outlet_location_type_bar = go.Figure( go.Bar(
    y=item_outlet_sales_mean_by_outlet_location_type.index,
    x=item_outlet_sales_mean_by_outlet_location_type.values,
    marker_color='#5579C6',
    orientation='h'
    ))
item_outlet_sales_mean_by_outlet_location_type_bar.update_layout(title="Item Outlet Sales Mean by Outlet Location Type",
                                xaxis_title="Item_Outlet_Sales",
                                yaxis_title="Outlet_Location_Type")
item_outlet_sales_mean_by_outlet_location_type_bar.show()


In [None]:
# Mean Item_Outlet_Sales per Outlet_Type.
# Select the sales column before aggregating: groupby(...).mean() on the
# whole frame also tries to average string columns and raises TypeError on
# pandas >= 2.0.
item_outlet_sales_mean_by_outlet_type = train_df.groupby('Outlet_Type')["Item_Outlet_Sales"].mean()
# Horizontal bar chart of the group means.
item_outlet_sales_mean_by_outlet_type_bar = go.Figure( go.Bar(
    y=item_outlet_sales_mean_by_outlet_type.index,
    x=item_outlet_sales_mean_by_outlet_type.values,
    marker_color='#5579C6',
    orientation='h'
    ))
item_outlet_sales_mean_by_outlet_type_bar.update_layout(title="Item Outlet Sales Mean by Outlet Type",
                                xaxis_title="Item_Outlet_Sales",
                                yaxis_title="Outlet_Type")
item_outlet_sales_mean_by_outlet_type_bar.show()

In [None]:
# 3-D scatter of Outlet_Size x Outlet_Type with sales on the vertical axis
# and as the colour scale.
fig = px.scatter_3d(
    data_frame=train_df,
    x='Outlet_Size',
    y='Outlet_Type',
    z='Item_Outlet_Sales',
    color='Item_Outlet_Sales',
)
fig.show()

### Ingeniería de Características

### Item_Type

In [None]:
# Derive a coarse product group from the first two characters of
# Item_Identifier, using vectorised string slicing instead of a per-row lambda.
train_df["Item_Type_Id"] = train_df["Item_Identifier"].str[:2]
# Horizontal bar chart of the Item_Type_Id frequencies.
item_type_id_counts = train_df['Item_Type_Id'].value_counts()
item_type_id_bar = go.Figure( go.Bar(
    y=item_type_id_counts.index,
    x=item_type_id_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
item_type_id_bar.update_layout(title="Item Type Id",
                               xaxis_title="Count",
                               yaxis_title="Item_Type_Id")
item_type_id_bar.show()

### Outlet_Establishment_Year 

In [None]:
# Outlet_Age = years elapsed between the establishment year and the year in
# which this cell is executed.
current_year = dt.datetime.now().year
print(current_year)
train_df["Outlet_Age"] = current_year - train_df["Outlet_Establishment_Year"]
# Vertical bar chart of the Outlet_Age frequencies.
outlet_age_counts = train_df['Outlet_Age'].value_counts()
outlet_age_bar = go.Figure(
    data=[
        go.Bar(
            x=outlet_age_counts.index,
            y=outlet_age_counts.values,
            marker_color='#5579C6',
        )
    ]
)
outlet_age_bar.update_layout(
    title="Outlet Age",
    xaxis_title="Outlet_Age",
    yaxis_title="Count",
)
# Treat the ages as category labels rather than a numeric axis.
outlet_age_bar.update_xaxes(type='category')
outlet_age_bar.show()

In [None]:
# Display the current column index of train_df as the cell output.
train_df.columns

In [None]:
# Replace each categorical column with integer codes. The same LabelEncoder
# instance is re-fitted on every column, so after the loop it only holds the
# classes of the last column.
label = LabelEncoder()
vars_label_encoder = [
    'Item_Fat_Content',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Type_Id',
]
for col in vars_label_encoder:
    encoded_column = label.fit_transform(train_df[col])
    train_df[col] = encoded_column
train_df.head()

In [None]:
# One-hot encode the integer-coded categorical columns; each becomes a set
# of "<column>_<code>" indicator columns.
vars_dummy = [
    'Item_Fat_Content',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Type_Id',
]
train_df = pd.get_dummies(data=train_df, columns=vars_dummy)
train_df.head()


In [None]:
# Print the column names, non-null counts and dtypes of train_df.
train_df.info()

## Entrenamiento del modelo

### Selección de variables

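Tomando en cuenta los coeficientes de correlación de Phik se establece que las variables listadas en `cols_for_model` se utilizarán como predictores y que `Item_Outlet_Sales` será la variable objetivo.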

In [None]:
# Feature columns fed to the models: four numeric columns followed by the
# one-hot indicator columns, named "<prefix>_<code>" with codes 0..n-1.
cols_for_model = ['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Age'] + [
    '{}_{}'.format(dummy_prefix, level)
    for dummy_prefix, n_levels in (
        ('Item_Fat_Content', 2),
        ('Outlet_Identifier', 10),
        ('Outlet_Size', 3),
        ('Outlet_Location_Type', 3),
        ('Outlet_Type', 4),
        ('Item_Type_Id', 3),
    )
    for level in range(n_levels)
]
# Column the models are trained to predict.
target = 'Item_Outlet_Sales'

In [None]:
# Feature matrix and target vector, then an 80/20 train/validation split
# with a fixed seed.
X, y = train_df[cols_for_model], train_df[target]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=1
)  # stratify=y
print(*(split_part.shape for split_part in (X_train, X_val, y_train, y_val)))

In [None]:
# Fit an ordinary least-squares regression on the training split
# (fit() returns the estimator itself).
regresion = LinearRegression().fit(X_train, y_train)
print(regresion)

In [None]:
# The fitted regression coefficients are read from the .coef_ attribute:
print('Coeficientes: \n', regresion.coef_)

In [None]:
# Predict on the validation split and report MSE, RMSE and R^2.
prediccion = regresion.predict(X_val)

mse_validacion = mean_squared_error(y_val, prediccion)
print('Error cuadrático medio: {}'.format(mse_validacion))
print('Raíz del error cuadrático medio: {}'.format(math.sqrt(mse_validacion)))
print('El coeficiente de determinación es: {}'.format(r2_score(y_val, prediccion)))

In [None]:
# Scatter of the observed validation sales (x) against the model's
# predictions (y). The figure previously had no title or axis labels, so it
# could not be read on its own.
fig = go.Figure(data=[go.Scatter(x=y_val, y=prediccion, mode='markers')])
fig.update_layout(title="Item Outlet Sales: real vs predicho",
                  xaxis_title="Real",
                  yaxis_title="Predicho")
fig.show()


In [None]:
# Bar chart of the linear-regression coefficients, labelled by feature name.
feature_importances = pd.Series(data=regresion.coef_, index=X_train.columns)
fig = go.Figure()
fig.add_trace(go.Bar(x=feature_importances.index, y=feature_importances.values))
fig.update_layout(title='Feature Importances')
fig.show()


In [None]:

def modelos(prueba_X, prueba_Y, entrenamiento_X, entrenamiento_Y,
            algoritmo, tipo):
    """Fit ``algoritmo``, report its metrics on the evaluation data and plot the results.

    Parameters
    ----------
    prueba_X : DataFrame
        Features used for evaluation.
    prueba_Y : Series or array-like
        Observed target values for ``prueba_X``.
    entrenamiento_X : DataFrame
        Features used for fitting; its column names label the coefficients.
    entrenamiento_Y : Series or array-like
        Target values used for fitting.
    algoritmo : estimator
        Object exposing ``fit``/``predict`` and either ``coef_`` or
        ``feature_importances_`` once fitted.
    tipo : str
        ``"lineal"`` to read ``coef_`` or ``"compleja"`` to read
        ``feature_importances_``.

    Returns
    -------
    DataFrame
        Column ``0`` holds the coefficient/importance values and column
        ``"importancia"`` holds the matching feature names.

    Raises
    ------
    ValueError
        If ``tipo`` is neither ``"lineal"`` nor ``"compleja"``.
    """
    # Validate `tipo` up front: previously an unknown value only failed after
    # fitting and plotting, with an unbound-local error on `coef`.
    atributo_por_tipo = {"lineal": "coef_", "compleja": "feature_importances_"}
    if tipo not in atributo_por_tipo:
        raise ValueError(
            'tipo debe ser "lineal" o "compleja", se recibió: {!r}'.format(tipo))

    print (algoritmo)
    algoritmo.fit(entrenamiento_X, entrenamiento_Y)
    prediccion = algoritmo.predict(prueba_X)

    # Compute the MSE once and derive the RMSE from it.
    mse = mean_squared_error(prueba_Y, prediccion)
    print('Error cuadrático medio: {}'.format(mse))
    print('Raíz del error cuadrático medio: {}'.format(math.sqrt(mse)))
    print('El coeficiente de determinación es: {}'.format(
        r2_score(prueba_Y, prediccion)))

    # Pair observed and predicted values by position (predictions carry no
    # index), without copying the whole feature matrix.
    data = pd.DataFrame({
        "real": pd.Series(prueba_Y).reset_index(drop=True),
        "predicho": pd.Series(prediccion).reset_index(drop=True),
    })

    # Line plot of observed vs predicted values per evaluation row. The x
    # axis is the row position and the y axis the target value, so the axis
    # titles describe those rather than "Real"/"Predicho".
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=data.index, y=data["real"],
                mode="lines", name="real"))
    fig.add_trace(go.Scatter(x=data.index, y=data["predicho"],
                mode="lines", name="predicho"))
    fig.update_layout(title='Real vs Predicho',
                                xaxis_title="Observación",
                                yaxis_title="Valor")
    fig.show()

    # Column 0 holds the values; "importancia" holds the feature names (the
    # column names are kept as-is for existing callers).
    coef = pd.DataFrame(getattr(algoritmo, atributo_por_tipo[tipo]))
    coef["importancia"] = entrenamiento_X.columns
    # Bar chart of the coefficient / importance of every feature.
    fig = go.Figure(data=[go.Bar(x=coef["importancia"],
                                y=coef[0],
                                marker_color='rgb(0,0,0)',
                                marker_line_color='rgb(0,0,0)',
                                marker_line_width=1.5,
                                opacity=0.6)])
    fig.update_layout(title='Importancia de los parámetros')
    fig.show()

    return coef

In [None]:
# Evaluate a plain linear regression with the shared helper.
lr = LinearRegression()
coeficientes = modelos(
    prueba_X=X_val,
    prueba_Y=y_val,
    entrenamiento_X=X_train,
    entrenamiento_Y=y_train,
    algoritmo=lr,
    tipo="lineal",
)

In [None]:
# Evaluate an L1-regularised linear model (default settings).
from sklearn.linear_model import Lasso
ls = Lasso()
coeficientes = modelos(
    prueba_X=X_val,
    prueba_Y=y_val,
    entrenamiento_X=X_train,
    entrenamiento_Y=y_train,
    algoritmo=ls,
    tipo="lineal",
)

In [None]:
# Evaluate an L2-regularised linear model with alpha=1000.
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1000)
coeficientes = modelos(
    prueba_X=X_val,
    prueba_Y=y_val,
    entrenamiento_X=X_train,
    entrenamiento_Y=y_train,
    algoritmo=ridge,
    tipo="lineal",
)

In [None]:
# Evaluate a random forest. The estimator is stochastic, so a fixed
# random_state (the same seed used for the train/validation split) makes the
# reported metrics and importances reproducible across runs.
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=1)
coeficientes = modelos(X_val, y_val, X_train, y_train, rf, "compleja")

In [None]:
# Evaluate AdaBoost. The estimator is stochastic, so a fixed random_state
# (the same seed used for the train/validation split) makes the reported
# metrics and importances reproducible across runs.
from sklearn.ensemble import AdaBoostRegressor
adb = AdaBoostRegressor(random_state=1)
coeficientes = modelos(X_val, y_val, X_train, y_train, adb, "compleja")

# Aplicación del modelo

In [None]:
# Build the profiling report for the test data; as the last expression of
# the cell, the returned report is what the notebook displays.
profile_data(test_df)

In [None]:
# Impute missing Item_Weight values in the test set with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form operates on an
# intermediate object and does not update test_df under pandas
# copy-on-write (it already emits a FutureWarning in pandas 2.x).
# NOTE(review): the median is computed from test_df itself, not from the
# training data — confirm this is intended.
test_item_weight_median = test_df['Item_Weight'].median()
test_df['Item_Weight'] = test_df['Item_Weight'].fillna(test_item_weight_median)
# Histogram of Item_Weight after imputation.
item_weight_histogram = go.Figure( go.Histogram(
    x=test_df['Item_Weight'],
    marker_color='#5579C6'
    ))
item_weight_histogram.update_layout(title="Item Weight Histogram",
                                xaxis_title="Item_Weight",
                                yaxis_title="Count")
item_weight_histogram.show()
# Box plot of Item_Weight after imputation.
item_weight_boxplot = go.Figure( go.Box(
    x=test_df['Item_Weight'],
    marker_color='#5579C6'
    ))
item_weight_boxplot.update_layout(title="Item Weight Boxplot")
item_weight_boxplot.show()


In [None]:
# Collapse the inconsistent Item_Fat_Content spellings of the test set into
# the two real categories. Labels that are already canonical need no entry
# in the mapping.
fat_content_labels = {
    "low fat": "Low Fat",
    "LF": "Low Fat",
    "reg": "Regular",
    "reg ": "Regular",
}
test_df["Item_Fat_Content"] = test_df["Item_Fat_Content"].replace(fat_content_labels)
# Horizontal bar chart of the cleaned label frequencies.
test_item_fat_content_counts = test_df['Item_Fat_Content'].value_counts()
item_fat_content_bar = go.Figure( go.Bar(
    y=test_item_fat_content_counts.index,
    x=test_item_fat_content_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
item_fat_content_bar.update_layout(title="Item Fat Content",
                                   xaxis_title="Count",
                                   yaxis_title="Item_Fat_Content")
item_fat_content_bar.show()

In [None]:
# Replace zero Item_Visibility values in the test set with the product's
# mean visibility.
# The column is selected *before* aggregating: groupby(...).mean() on the
# whole frame tries to average the string columns too and raises TypeError
# on pandas >= 2.0.
item_visibility_avg = test_df.groupby("Item_Identifier")["Item_Visibility"].mean()
# Vectorised replacement: only the rows whose visibility is exactly 0 are
# overwritten, using each row's Item_Identifier to look up the mean.
test_zero_visibility_rows = test_df["Item_Visibility"] == 0
test_df.loc[test_zero_visibility_rows, "Item_Visibility"] = (
    test_df.loc[test_zero_visibility_rows, "Item_Identifier"].map(item_visibility_avg)
)
# Histogram of Item_Visibility after the replacement.
item_visibility_new_histogram = go.Figure( go.Histogram(
    x=test_df['Item_Visibility'],
    marker_color='#5579C6'
    ))
item_visibility_new_histogram.update_layout(title="Item Visibility Histogram",
                                xaxis_title="Item_Visibility",
                                yaxis_title="Count")
item_visibility_new_histogram.show()

In [None]:
# Impute missing Outlet_Size in the test set with the most frequent size of
# the same Outlet_Type.
# Series.mode() returns a Series (possibly with several tied values), which
# is not a valid scalar aggregate; take the first mode explicitly and fall
# back to a missing value when a group has no recorded size at all.
outlet_size_mode = test_df.groupby('Outlet_Type')['Outlet_Size'].agg(
    lambda sizes: sizes.mode().iat[0] if sizes.notna().any() else None
)
print(outlet_size_mode)

# Fill only the missing entries, looking up each row's Outlet_Type in the
# per-type mode. Assigning back avoids the chained fillna(inplace=True),
# which does not update test_df under pandas copy-on-write, and removes the
# need for a "None" string sentinel.
test_df["Outlet_Size"] = test_df["Outlet_Size"].fillna(
    test_df["Outlet_Type"].map(outlet_size_mode)
)

# Horizontal bar chart of Outlet_Size after imputation.
test_outlet_size_counts = test_df['Outlet_Size'].value_counts()
outlet_size_bar = go.Figure( go.Bar(
    y=test_outlet_size_counts.index,
    x=test_outlet_size_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
outlet_size_bar.update_layout(title="Outlet Size",
                              xaxis_title="Count",
                              yaxis_title="Outlet_Size")
outlet_size_bar.show()


In [None]:
# Derive a coarse product group from the first two characters of
# Item_Identifier, using vectorised string slicing instead of a per-row lambda.
test_df["Item_Type_Id"] = test_df["Item_Identifier"].str[:2]
# Horizontal bar chart of the Item_Type_Id frequencies.
test_item_type_id_counts = test_df['Item_Type_Id'].value_counts()
item_type_id_bar = go.Figure( go.Bar(
    y=test_item_type_id_counts.index,
    x=test_item_type_id_counts.values,
    marker_color='#5579C6',
    orientation='h'
    ))
# Horizontal bars: counts are on the x axis, categories on the y axis.
item_type_id_bar.update_layout(title="Item Type Id",
                               xaxis_title="Count",
                               yaxis_title="Item_Type_Id")
item_type_id_bar.show()

In [None]:
# Outlet_Age = years elapsed between the establishment year and the year in
# which this cell is executed.
current_year = dt.datetime.now().year
print(current_year)
test_df["Outlet_Age"] = current_year - test_df["Outlet_Establishment_Year"]
# Vertical bar chart of the Outlet_Age frequencies.
test_outlet_age_counts = test_df['Outlet_Age'].value_counts()
outlet_age_bar = go.Figure(
    data=[
        go.Bar(
            x=test_outlet_age_counts.index,
            y=test_outlet_age_counts.values,
            marker_color='#5579C6',
        )
    ]
)
outlet_age_bar.update_layout(
    title="Outlet Age",
    xaxis_title="Outlet_Age",
    yaxis_title="Count",
)
# Treat the ages as category labels rather than a numeric axis.
outlet_age_bar.update_xaxes(type='category')
outlet_age_bar.show()

In [None]:
# Replace each categorical column of the test set with integer codes. The
# encoder is fitted again on the test data for every column.
# NOTE(review): the codes only match the training ones if both sets contain
# the same categories — confirm.
label = LabelEncoder()
vars_label_encoder = [
    'Item_Fat_Content',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Type_Id',
]
for col in vars_label_encoder:
    encoded_column = label.fit_transform(test_df[col])
    test_df[col] = encoded_column
test_df.head()

In [None]:
# One-hot encode the integer-coded categorical columns of the test set; each
# becomes a set of "<column>_<code>" indicator columns.
vars_dummy = [
    'Item_Fat_Content',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
    'Item_Type_Id',
]
test_df = pd.get_dummies(data=test_df, columns=vars_dummy)
test_df.head()


In [None]:
# Select from the prepared test frame the feature columns listed in
# cols_for_model (raises KeyError if any of them is missing).
X_test = test_df[cols_for_model]

In [None]:
# Predict sales for the test features with the fitted `regresion` model,
# attach the predictions to a copy of the features and export them to CSV.
prediction = regresion.predict(X_test)
print(prediction)
result = X_test.assign(prediction_sales=prediction)
print(result)
result.to_csv("result.csv", index=False)