# **DATATHON 2023: NTT-DATA CHALLENGE**

## **Requirements**:

In [1]:
%pip install -r requirements.txt 

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## **Imports**

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima_model import ARIMA
import matplotlib.pyplot as plt
import seaborn as sns

## **Main Program**

Read dataset

In [3]:
df = pd.read_excel('../assets/consumo_material_clean.xlsx')
df.head()

Unnamed: 0,CODIGO,FECHAPEDIDO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,ORIGEN,TGL,PRODUCTO
0,E99808,01/01/23,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,1-2-60,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3
1,B41691,01/02/16,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,0-10-1,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...
2,E64543,01/02/16,71961/16,403770.0,20,5,12.1,48.4,Compra menor,0-4-111,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18
3,E65007,01/02/16,72773/16,20415.0,100,50,215.325,430.65,Concurso,0-10-1,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11
4,E64911,01/02/17,86159/17,20701.0,300,300,792.0,792.0,Concurso,0-6-1,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6


Convert date to datetime:

In [4]:
# Categorical variables
df['FECHAPEDIDO'] = pd.to_datetime(df['FECHAPEDIDO'], format='%d/%m/%y')
df['CODIGO'] = df['CODIGO'].astype('category')
df['PRODUCTO'] = df['PRODUCTO'].astype('category')
df['NUMERO'] = df['NUMERO'].astype('category')
df['REFERENCIA'] = df['REFERENCIA'].astype('category')
df['TIPOCOMPRA'] = df['TIPOCOMPRA'].astype('category')
df['ORIGEN'] = df['ORIGEN'].astype('category')
df['TGL'] = df['TGL'].astype('category')

# Numeric variables
df['CANTIDADCOMPRA'] = df['CANTIDADCOMPRA'].astype('int')
df['UNIDADESCONSUMOCONTENIDAS'] = df['UNIDADESCONSUMOCONTENIDAS'].astype('int')
df['PRECIO'] = df['PRECIO'].astype('float')
df['IMPORTELINEA'] = df['IMPORTELINEA'].astype('float')

df.head()

Unnamed: 0,CODIGO,FECHAPEDIDO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,ORIGEN,TGL,PRODUCTO
0,E99808,2023-01-01,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,1-2-60,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3
1,B41691,2016-02-01,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,0-10-1,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...
2,E64543,2016-02-01,71961/16,403770.0,20,5,12.1,48.4,Compra menor,0-4-111,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18
3,E65007,2016-02-01,72773/16,20415.0,100,50,215.325,430.65,Concurso,0-10-1,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11
4,E64911,2017-02-01,86159/17,20701.0,300,300,792.0,792.0,Concurso,0-6-1,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6


New dataset with the important variables to train/predict

In [5]:
new_df = df[['PRODUCTO', 'FECHAPEDIDO', 'TIPOCOMPRA', 'CANTIDADCOMPRA', 'IMPORTELINEA']].copy()

new_df['MES'] = new_df['FECHAPEDIDO'].dt.month
new_df['AÑO'] = new_df['FECHAPEDIDO'].dt.year
new_df = new_df.drop('FECHAPEDIDO', axis=1)

print(new_df.dtypes)

new_df.head()

PRODUCTO          category
TIPOCOMPRA        category
CANTIDADCOMPRA       int64
IMPORTELINEA       float64
MES                  int32
AÑO                  int32
dtype: object


Unnamed: 0,PRODUCTO,TIPOCOMPRA,CANTIDADCOMPRA,IMPORTELINEA,MES,AÑO
0,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,Compra menor,60,375.54,1,2023
1,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,Compra menor,40,411.214916,2,2016
2,APOSITO DE HIDROFIBRA / CINTA-18,Compra menor,20,48.4,2,2016
3,APOSITO DE ESPUMA POLIURETANO / SACRO-11,Concurso,100,430.65,2,2016
4,APOSITO C/ CARBON Y PLATA-6,Concurso,300,792.0,2,2017


Group by product, year, month and type of purchase

In [6]:
new_df = new_df.groupby(['AÑO', 'MES', 'PRODUCTO', 'TIPOCOMPRA'], observed=True).agg({'CANTIDADCOMPRA': 'sum', 'IMPORTELINEA': 'sum'}).reset_index()

new_df.to_csv('../assets/new_df.csv', index=False)


Split train and test datasets

In [7]:
split_year = 2023
train = new_df.loc[new_df['AÑO'] < split_year]
test = new_df.loc[new_df['AÑO'] >= split_year]

In [8]:
train.to_csv('../assets/train.csv', index=False)
test.to_csv('../assets/test.csv', index=False)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np

# Identificar columnes numèriques i categòriques
num_cols = ['AÑO', 'MES']
cat_cols = ['PRODUCTO', 'TIPOCOMPRA']  # Podrien necessitar codificació
target_col = 'CANTIDADCOMPRA'

# Dividir les dades d'entrenament en característiques i objectiu
X_train = train.drop(columns=[target_col, 'IMPORTELINEA'])
y_train = train[target_col]

# Preprocessament: codificació One-Hot per a variables categòriques i escalat per a variables numèriques
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

# Model: Boscos Aleatoris per a regressió multivariable
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Crear un pipeline amb el preprocessament i el model
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', model)])

# Entrenament del model
pipeline.fit(X_train, y_train)

# Avaluació del model en el conjunt d'entrenament (utilitzant RMSE)
y_pred_train = pipeline.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))

rmse_train


512.5163841156719

Preparar los datos de prueba

In [10]:
X_test = test.drop(columns=target_cols)

# Realizar predicciones en el conjunto de prueba
y_pred_test = pipeline.predict(X_test)

# Convertir las predicciones a un DataFrame para facilitar su visualización
y_pred_test_df = pd.DataFrame(y_pred_test, columns=target_cols)

y_pred_test_df.head()

NameError: name 'target_cols' is not defined