# **DATATHON 2023: NTT-DATA CHALLENGE**

## **Requirements**:

In [11]:
%pip install -r requirements.txt 




## **Imports**

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## **Main Program**

Read dataset

In [44]:
# Load the cleaned material-consumption workbook and preview its first rows.
df = pd.read_excel(io='../assets/consumo_material_clean.xlsx')
df.head()

Unnamed: 0,CODIGO,FECHAPEDIDO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,ORIGEN,TGL,PRODUCTO
0,E99808,01/01/23,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,1-2-60,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3
1,B41691,01/02/16,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,0-10-1,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...
2,E64543,01/02/16,71961/16,403770.0,20,5,12.1,48.4,Compra menor,0-4-111,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18
3,E65007,01/02/16,72773/16,20415.0,100,50,215.325,430.65,Concurso,0-10-1,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11
4,E64911,01/02/17,86159/17,20701.0,300,300,792.0,792.0,Concurso,0-6-1,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6


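Preprocessing: drop incomplete rows, parse the order date, split `ORIGEN` into its three components and set the column dtypes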

In [45]:
# Drop rows with NaN values
df = df.dropna()

# Parse the order date; the strings are read as day/month/two-digit year
df['FECHAPEDIDO'] = pd.to_datetime(df['FECHAPEDIDO'], format='%d/%m/%y')

# Split "ORIGEN" into "REGION", "HOSPITAL" and "DEPARTAMENTO".
# Doubled hyphens are collapsed first (presumably so the split does not yield an
# empty extra field). regex=False keeps the replacement literal regardless of
# the pandas version's default for `regex`.
df['ORIGEN'] = df['ORIGEN'].str.replace('--', '-', regex=False)
df[['REGION', 'HOSPITAL', 'DEPARTAMENTO']] = df['ORIGEN'].str.split('-', expand=True)
df = df.drop(["ORIGEN"], axis=1)

# Categorical variables
categorical = ['CODIGO', 'PRODUCTO', 'NUMERO', 'REFERENCIA', 'TIPOCOMPRA', 'REGION', 'HOSPITAL', 'DEPARTAMENTO', 'TGL']
df[categorical] = df[categorical].astype('category')

# Numeric variables: apply both declared groups (the integer group was
# previously declared but never cast). NaNs were dropped above, so the
# integer cast cannot fail on missing values.
numerical_int = ['CANTIDADCOMPRA', 'UNIDADESCONSUMOCONTENIDAS']
numerical_float = ['PRECIO', 'IMPORTELINEA']
df[numerical_int] = df[numerical_int].astype('int64')
df[numerical_float] = df[numerical_float].astype('float')

df.head()

Unnamed: 0,CODIGO,FECHAPEDIDO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,TGL,PRODUCTO,REGION,HOSPITAL,DEPARTAMENTO
0,E99808,2023-01-01,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,1,2,60
1,B41691,2016-02-01,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,0,10,1
2,E64543,2016-02-01,71961/16,403770.0,20,5,12.1,48.4,Compra menor,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18,0,4,111
3,E65007,2016-02-01,72773/16,20415.0,100,50,215.325,430.65,Concurso,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11,0,10,1
4,E64911,2017-02-01,86159/17,20701.0,300,300,792.0,792.0,Concurso,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6,0,6,1


New dataset with the important variables to train/predict

In [46]:
# Build the modelling frame: keep the relevant columns and replace the order
# date by its month and year components.
new_df = (
    df[['PRODUCTO', 'FECHAPEDIDO', 'TIPOCOMPRA', 'REGION', 'HOSPITAL', 'DEPARTAMENTO', 'TGL',
        'CANTIDADCOMPRA', 'UNIDADESCONSUMOCONTENIDAS', 'PRECIO', 'IMPORTELINEA']]
    .assign(**{
        'MES': lambda frame: frame['FECHAPEDIDO'].dt.month,
        'AÑO': lambda frame: frame['FECHAPEDIDO'].dt.year,
    })
    .drop(columns='FECHAPEDIDO')
)

# Line amount divided by the purchased quantity
new_df['PRECIOUNIDAD'] = new_df['IMPORTELINEA'].div(new_df['CANTIDADCOMPRA'])

print(new_df.dtypes)

new_df.head()

CODIGO                       category
NUMERO                       category
REFERENCIA                   category
CANTIDADCOMPRA                  int64
UNIDADESCONSUMOCONTENIDAS       int64
PRECIO                        float64
IMPORTELINEA                  float64
TIPOCOMPRA                   category
TGL                          category
PRODUCTO                     category
REGION                       category
HOSPITAL                     category
DEPARTAMENTO                 category
MES                             int32
AÑO                             int32
PRECIOUNIDAD                  float64
dtype: object


Unnamed: 0,CODIGO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,TGL,PRODUCTO,REGION,HOSPITAL,DEPARTAMENTO,MES,AÑO,PRECIOUNIDAD
0,E99808,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,1,2,60,1,2023,6.259
1,B41691,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,0,10,1,2,2016,10.280373
2,E64543,71961/16,403770.0,20,5,12.1,48.4,Compra menor,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18,0,4,111,2,2016,2.42
3,E65007,72773/16,20415.0,100,50,215.325,430.65,Concurso,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11,0,10,1,2,2016,4.3065
4,E64911,86159/17,20701.0,300,300,792.0,792.0,Concurso,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6,0,6,1,2,2017,2.64


Group by year, month, product, hospital, type of purchase and TGL, summing the purchased quantity

In [16]:
# Total purchased quantity per (year, month, product, hospital, purchase type, TGL).
# Only CANTIDADCOMPRA is aggregated: the other numeric columns were dropped right
# after aggregation anyway, so the resulting frame is the same. Because the cell
# no longer references those dropped columns, it can be re-run safely.
new_df = (
    new_df
    .groupby(['AÑO', 'MES', 'PRODUCTO', 'HOSPITAL', 'TIPOCOMPRA', 'TGL'], observed=True)['CANTIDADCOMPRA']
    .sum()
    .reset_index()
)

# Persist the aggregated frame
new_df.to_csv('../assets/new_df.csv', index=False)


Split into train and test datasets by year: rows before 2023 are used for training, rows from 2023 onwards for testing

In [28]:
# First year that belongs to the test set; earlier years form the training set.
split_year = 2023
train = new_df[new_df['AÑO'].lt(split_year)]
test = new_df[new_df['AÑO'].ge(split_year)]

In [29]:
# Write both splits to disk without the index column.
for split_frame, csv_path in ((train, '../assets/train.csv'), (test, '../assets/test.csv')):
    split_frame.to_csv(csv_path, index=False)

Random Forest Regressor

In [47]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Feature columns, grouped by the preprocessing each group receives
categorical_cols = ['TIPOCOMPRA', 'PRODUCTO', 'HOSPITAL', 'TGL']
numerical_cols = ['AÑO', 'MES']

# One transformer per group: standardise the numeric columns and one-hot encode
# the categorical ones (handle_unknown='ignore' tolerates unseen categories)
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group to its transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Features and target for the training and the testing split
X_train, y_train = train.drop(columns='CANTIDADCOMPRA'), train['CANTIDADCOMPRA']
X_test, y_test = test.drop(columns='CANTIDADCOMPRA'), test['CANTIDADCOMPRA']

# NOTE: neither of these two names is referenced in the rest of this cell
from imblearn.over_sampling import SMOTE
from collections import Counter

# Preprocessing followed by the regressor, as a single estimator
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Hyperparameter grid; the "regressor__" prefix targets the pipeline step
param_grid = {
    'regressor__n_estimators': [50, 100, 150],
    'regressor__max_depth': [10, 20, 30, None],
}

# 3-fold cross-validated grid search scored by negative MSE, using all cores
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")

# Best pipeline found by the search, used to predict the test split
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Test-set metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'MSE with Random Forest: {mse_rf}')
print(f'RMSE with Random Forest: {rmse_rf}')
print(f'R2 score with Random Forest: {r2_rf}')

Distribución original de clases: Counter({200: 84, 100: 77, 10: 71, 5: 40, 300: 37, 20: 37, 60: 35, 30: 29, 1000: 28, 600: 25, 12: 25, 50: 24, 120: 23, 150: 23, 40: 22, 1200: 19, 180: 19, 70: 19, 350: 18, 80: 17, 400: 17, 15: 16, 900: 14, 25: 14, 140: 13, 2000: 13, 110: 13, 4: 13, 3000: 12, 6: 12, 160: 11, 130: 11, 700: 11, 800: 10, 24: 9, 320: 9, 240: 9, 5000: 9, 3: 9, 90: 9, 7000: 8, 210: 8, 290: 8, 3200: 8, 1008: 7, 220: 7, 4000: 7, 1800: 7, 500: 7, 35: 7, 7: 7, 560: 7, 450: 6, 1500: 6, 360: 6, 380: 6, 260: 5, 2700: 5, 9: 5, 280: 5, 250: 5, 170: 5, 310: 5, 1280: 5, 270: 4, 420: 4, 3400: 4, 45: 4, 135: 4, 72: 4, 3300: 4, 8: 4, 1250: 4, 440: 4, 2: 4, 75: 3, 650: 3, 330: 3, 460: 3, 2016: 3, 530: 3, 1700: 3, 1600: 3, 2500: 3, 340: 3, 65: 3, 84: 3, 480: 3, 1120: 3, 2240: 3, 230: 3, 1090: 2, 195: 2, 370: 2, 850: 2, 48: 2, 1030: 2, 1220: 2, 1100: 2, 36: 2, 780: 2, 1260: 2, 540: 2, 630: 2, 750: 2, 3600: 2, 108: 2, 430: 2, 1550: 2, 1300: 2, 6300: 2, 2750: 2, 115: 2, 435: 2, 1350: 2, 8000: 2,

ValueError: could not convert string to float: 'BOMBA DE UN SOLO USO / TERAPIA PRESIÓN NEGATIVA-40'

In [31]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Separate the features from the target variable
X_train, y_train = train.drop(columns='CANTIDADCOMPRA'), train['CANTIDADCOMPRA']
X_test, y_test = test.drop(columns='CANTIDADCOMPRA'), test['CANTIDADCOMPRA']

# Preprocessing: scale the numeric columns, one-hot encode the categorical ones
categorical_features = ['PRODUCTO', 'HOSPITAL', 'TIPOCOMPRA', 'TGL']
numerical_features = ['AÑO', 'MES']

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Candidate models, each paired with the hyperparameter grid to search.
# The "regressor__" prefix addresses the pipeline step named 'regressor'.
models_params = {
    "XGBRegressor": {
        "model": XGBRegressor(random_state=0),
        "params": {
            "regressor__n_estimators": [50, 100, 200],
            "regressor__learning_rate": [0.01, 0.1],
            "regressor__max_depth": [3, 5, 10, 20],
        },
    },
}

# Function to evaluate the models
def evaluate_model(model, params, X_train, y_train, X_test, y_test,
                   cv=5, scoring='neg_mean_squared_error'):
    """Tune ``model`` with a cross-validated grid search and score the best estimator.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    params : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Training features and target; used for the search and the train metric.
    X_test, y_test
        Held-out features and target; used only for the test metrics.
    cv : optional
        Cross-validation setting forwarded to ``GridSearchCV``. Defaults to 5.
    scoring : optional
        Single scoring metric forwarded to ``GridSearchCV``. Defaults to
        ``'neg_mean_squared_error'``.

    Returns
    -------
    dict
        ``"MSE Train"``, ``"MSE Test"``, ``"MAE Test"`` and ``"R2 Score"``
        (R2 is computed on the test split) for the best estimator found.
    """
    grid_search = GridSearchCV(model, params, cv=cv, scoring=scoring)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Predict on both splits so the train and test errors can be compared
    y_pred_train = best_model.predict(X_train)
    y_pred_test = best_model.predict(X_test)

    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    r2 = r2_score(y_test, y_pred_test)
    return {
        "MSE Train": mse_train,
        "MSE Test": mse_test,
        "MAE Test": mae_test,
        "R2 Score": r2
    }

# Evaluate every configured model inside a preprocessing + regressor pipeline
results = {
    name: evaluate_model(
        Pipeline(steps=[('preprocessor', preprocessor), ('regressor', mp['model'])]),
        mp['params'],
        X_train, y_train, X_test, y_test,
    )
    for name, mp in models_params.items()
}

# Show the results: one header line per model, then one line per metric
for model_name, metrics in results.items():
    print(f"Modelo: {model_name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")


Modelo: XGBRegressor
MSE Train: 170574.9051897159
MSE Test: 882126.9163475028
MAE Test: 407.8388120599573
R2 Score: 0.5274249621102094
