# Random Forest

## Importar Librerias

In [72]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder

In [73]:
# Cargar datos
file_path = '/content/Bases de Datos Estructurada.csv'
df = pd.read_csv(file_path, delimiter=';')
df.head()

Unnamed: 0,fecha,dia,enviados_totales,cobrados_totales,vendidos_totales,devueltos_totales,suscripciones_totales,tiendas_conveniencia,tiendas_barrio,sectoristas,promocion_bin,cantidad_paginas,clasificacion_titular
0,1/02/2022,Sunday,94993.0,93185.0,81248.0,13256.0,865.0,629.0,1134.0,92365.0,,,
1,1/03/2022,Monday,110810.0,108409.0,101669.0,11186.0,865.0,664.0,1377.0,107904.0,,,
2,1/04/2022,Tuesday,97543.0,95151.0,86815.0,6499.0,865.0,618.0,1376.0,94684.0,,,
3,1/05/2022,Wednesday,96234.0,93842.0,79160.0,25826.0,864.0,620.0,1395.0,93355.0,,,
4,1/06/2022,Thursday,96254.0,93851.0,84055.0,13686.0,862.0,661.0,1399.0,93332.0,,,


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   fecha                  1090 non-null   object 
 1   dia                    1090 non-null   object 
 2   enviados_totales       1084 non-null   float64
 3   cobrados_totales       1084 non-null   float64
 4   vendidos_totales       1084 non-null   float64
 5   devueltos_totales      1084 non-null   float64
 6   suscripciones_totales  1084 non-null   float64
 7   tiendas_conveniencia   1084 non-null   float64
 8   tiendas_barrio         1084 non-null   float64
 9   sectoristas            1084 non-null   float64
 10  promocion_bin          0 non-null      float64
 11  cantidad_paginas       0 non-null      float64
 12  clasificacion_titular  0 non-null      float64
dtypes: float64(11), object(2)
memory usage: 110.8+ KB


In [75]:
# Convertir la columna 'fecha' a tipo datetime
df['fecha'] = pd.to_datetime(df['fecha'], errors='coerce', dayfirst=False)  # Ajusta `dayfirst=True` si el formato es DD/MM/YYYY
# Convertir la columna dia a categoria
df['dia'] = df['dia'].astype('category')

In [76]:
df = df[['fecha','dia','vendidos_totales', 'devueltos_totales', 'suscripciones_totales', 'tiendas_conveniencia', 'tiendas_barrio', 'sectoristas']]

In [77]:
numeric_cols = df.select_dtypes(include=['float64'])
imputer = KNNImputer(n_neighbors=5)
imputed_data = imputer.fit_transform(numeric_cols)
df[numeric_cols.columns] = imputed_data
df = df.dropna(subset=['fecha'])
print(f"Number of rows in DataFrame: {len(df)}")
print(f"Number of non-missing 'fecha' entries: {df['fecha'].notna().sum()}")
print(df.info())

Number of rows in DataFrame: 1090
Number of non-missing 'fecha' entries: 1090
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1090 entries, 0 to 1089
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   fecha                  1090 non-null   datetime64[ns]
 1   dia                    1090 non-null   category      
 2   vendidos_totales       1090 non-null   float64       
 3   devueltos_totales      1090 non-null   float64       
 4   suscripciones_totales  1090 non-null   float64       
 5   tiendas_conveniencia   1090 non-null   float64       
 6   tiendas_barrio         1090 non-null   float64       
 7   sectoristas            1090 non-null   float64       
dtypes: category(1), datetime64[ns](1), float64(6)
memory usage: 61.2 KB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[numeric_cols.columns] = imputed_data


In [78]:
if 'fecha' in df.columns:
    df['año'] = df['fecha'].dt.year
    df['mes'] = df['fecha'].dt.month
    df['día'] = df['fecha'].dt.day
    df = df.drop(columns=['fecha'])

In [79]:
# OneHotEncoder para columnas categóricas (como 'dia')
if 'dia' in df.columns:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
    dia_encoded = encoder.fit_transform(df[['dia']])
    dia_encoded_df = pd.DataFrame(dia_encoded, columns=encoder.get_feature_names_out(['dia']), index=df.index)
    df = pd.concat([df.drop(columns=['dia']), dia_encoded_df], axis=1)

In [80]:
# Seleccionar variables independientes (X) y dependiente (y)
X = df.drop(columns=['vendidos_totales'])  # Variable objetivo
y = df['vendidos_totales']

In [81]:
# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
# Crear el modelo Random Forest
rf_model = RandomForestRegressor(n_estimators=1000, random_state=42)

# Entrenar el modelo
rf_model.fit(X_train, y_train)

# Hacer predicciones
y_pred = rf_model.predict(X_test)

In [83]:
# Evaluar el modelo
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("R2 Score:", r2)

# Importancia de las variables
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("\nImportancia de las variables:")
print(feature_importances)

Mean Squared Error (MSE): 18725913.247541487
R2 Score: 0.8969454229391658

Importancia de las variables:
                  Feature  Importance
4             sectoristas    0.637267
1   suscripciones_totales    0.185729
0       devueltos_totales    0.133522
6                     mes    0.011454
2    tiendas_conveniencia    0.008784
3          tiendas_barrio    0.007254
8              dia_Monday    0.006830
7                     día    0.005528
5                     año    0.001399
11           dia_Thursday    0.001047
13          dia_Wednesday    0.000625
10             dia_Sunday    0.000235
12            dia_Tuesday    0.000180
9            dia_Saturday    0.000144
