In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_excel('../assets/consumo_material_clean.xlsx')
df.head()

Unnamed: 0,CODIGO,FECHAPEDIDO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,ORIGEN,TGL,PRODUCTO
0,E99808,01/01/23,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,1-2-60,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3
1,B41691,01/02/16,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,0-10-1,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...
2,E64543,01/02/16,71961/16,403770.0,20,5,12.1,48.4,Compra menor,0-4-111,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18
3,E65007,01/02/16,72773/16,20415.0,100,50,215.325,430.65,Concurso,0-10-1,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11
4,E64911,01/02/17,86159/17,20701.0,300,300,792.0,792.0,Concurso,0-6-1,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6


In [3]:
df['ORIGEN'] = df['ORIGEN'].str.replace('--', '-')
df[['REGION', 'HOSPITAL', 'DEPARTAMENTO']] = df['ORIGEN'].str.split('-', expand=True)
df = df.drop(["ORIGEN"],axis=1)

In [4]:
df_limpio = df.dropna(subset=['TGL'])
df.dropna(subset=['TGL'], inplace=True)

In [5]:
# Categorical variables
df['FECHAPEDIDO'] = pd.to_datetime(df['FECHAPEDIDO'], format='%d/%m/%y')
df['CODIGO'] = df['CODIGO'].astype('category')
df['PRODUCTO'] = df['PRODUCTO'].astype('category')
df['NUMERO'] = df['NUMERO'].astype('category')
df['REFERENCIA'] = df['REFERENCIA'].astype('category')
df['TIPOCOMPRA'] = df['TIPOCOMPRA'].astype('category')
#df['ORIGEN'] = df['ORIGEN'].astype('category')
df['TGL'] = df['TGL'].astype('category')
df['REGION'] = df['REGION'].astype('category')
df['HOSPITAL'] = df['HOSPITAL'].astype('category')
df['DEPARTAMENTO'] = df['DEPARTAMENTO'].astype('category')

# Numeric variables
df['CANTIDADCOMPRA'] = df['CANTIDADCOMPRA'].astype('int')
df['UNIDADESCONSUMOCONTENIDAS'] = df['UNIDADESCONSUMOCONTENIDAS'].astype('int')
df['PRECIO'] = df['PRECIO'].astype('float')
df['IMPORTELINEA'] = df['IMPORTELINEA'].astype('float')

df.head()

Unnamed: 0,CODIGO,FECHAPEDIDO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,TGL,PRODUCTO,REGION,HOSPITAL,DEPARTAMENTO
0,E99808,2023-01-01,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,1,2,60
1,B41691,2016-02-01,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,0,10,1
2,E64543,2016-02-01,71961/16,403770.0,20,5,12.1,48.4,Compra menor,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18,0,4,111
3,E65007,2016-02-01,72773/16,20415.0,100,50,215.325,430.65,Concurso,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11,0,10,1
4,E64911,2017-02-01,86159/17,20701.0,300,300,792.0,792.0,Concurso,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6,0,6,1


In [6]:
new_df = df[['PRODUCTO', 'FECHAPEDIDO', 'TIPOCOMPRA', 'CANTIDADCOMPRA', 'IMPORTELINEA','TGL','HOSPITAL']].copy()

new_df['MES'] = new_df['FECHAPEDIDO'].dt.month
new_df['AÑO'] = new_df['FECHAPEDIDO'].dt.year
new_df = new_df.drop('FECHAPEDIDO', axis=1)

print(new_df.dtypes)

new_df.head()

PRODUCTO          category
TIPOCOMPRA        category
CANTIDADCOMPRA       int64
IMPORTELINEA       float64
TGL               category
HOSPITAL          category
MES                  int32
AÑO                  int32
dtype: object


Unnamed: 0,PRODUCTO,TIPOCOMPRA,CANTIDADCOMPRA,IMPORTELINEA,TGL,HOSPITAL,MES,AÑO
0,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,Compra menor,60,375.54,TRANSITO,2,1,2023
1,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,Compra menor,40,411.214916,ALMACENABLE,10,2,2016
2,APOSITO DE HIDROFIBRA / CINTA-18,Compra menor,20,48.4,TRANSITO,4,2,2016
3,APOSITO DE ESPUMA POLIURETANO / SACRO-11,Concurso,100,430.65,ALMACENABLE,10,2,2016
4,APOSITO C/ CARBON Y PLATA-6,Concurso,300,792.0,ALMACENABLE,6,2,2017


In [7]:
new_df = new_df.groupby(['AÑO', 'MES', 'PRODUCTO', 'TIPOCOMPRA','HOSPITAL','TGL',], observed=True).agg({'CANTIDADCOMPRA': 'sum'}).reset_index()

new_df.to_csv('../assets/new_df.csv', index=False)


In [8]:
split_year = 2023
train = new_df.loc[new_df['AÑO'] < split_year]
test = new_df.loc[new_df['AÑO'] >= split_year]

In [9]:
train.to_csv('../assets/train.csv', index=False)
test.to_csv('../assets/test.csv', index=False)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics import r2_score

# Identificar columnas numéricas y categóricas
num_cols = ['AÑO', 'MES','HOSPITAL', ]
cat_cols = ['PRODUCTO', 'TIPOCOMPRA', 'TGL'] # Pueden necesitar codificación
target_cols = ['CANTIDADCOMPRA']

# Dividir los datos de entrenamiento en características y objetivo
X_train = test.drop(columns=target_cols)
y_train = test[target_cols]

# Preprocesamiento: codificación One-Hot para variables categóricas y escalado para variables numéricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

# Modelo: Bosque Aleatorio para regresión multivariable
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Crear un pipeline con el preprocesamiento y el modelo
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Entrenamiento del modelo
pipeline.fit(X_train, y_train)

# Evaluación del modelo en el conjunto de entrenamiento (usando RMSE)
y_pred_train = pipeline.predict(X_train)
#rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
mse=mean_squared_error(y_train,y_pred_train)
#rmse_train
mse

r_squared_train = r2_score(y_train, y_pred_train)
print(f"R-squared en el conjunto de entrenamiento: {r_squared_train}")

y_pred_test = pipeline.predict(X_train)
test['Predicciones_CANTIDADCOMPRA'] = y_pred_train
test.to_csv('test_con_predicciones.csv', index=False)

  return fit_method(estimator, *args, **kwargs)


R-squared en el conjunto de entrenamiento: 0.9679207234528602


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicciones_CANTIDADCOMPRA'] = y_pred_train
