# **DATATHON 2023: NTT-DATA CHALLENGE**

## **Requirements**:

## **Main Program**

Read dataset

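Preprocessing: drop incomplete rows, parse the order date, split `ORIGEN` into region / hospital / department, and set the column dtypes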

In [45]:
# Drop rows with NaN values. The explicit copy detaches the result from the
# frame it was filtered from, so the column assignments below operate on an
# independent frame and do not trigger SettingWithCopyWarning.
df = df.dropna().copy()

# Parse the order date, written as day/month/two-digit-year
df['FECHAPEDIDO'] = pd.to_datetime(df['FECHAPEDIDO'], format='%d/%m/%y')

# Split "ORIGEN" into "REGION", "HOSPITAL" and "DEPARTAMENTO".
# Guarded so that re-running this cell after "ORIGEN" has already been
# split and removed does not fail with a KeyError on this step.
if 'ORIGEN' in df.columns:
    df[['REGION', 'HOSPITAL', 'DEPARTAMENTO']] = (
        df['ORIGEN']
        .str.replace('--', '-', regex=False)  # literal match: collapse doubled separators
        .str.split('-', expand=True)
    )
    df = df.drop(columns='ORIGEN')

# Categorical variables
categorical = ['CODIGO', 'PRODUCTO', 'NUMERO', 'REFERENCIA', 'TIPOCOMPRA', 'REGION', 'HOSPITAL', 'DEPARTAMENTO', 'TGL']
df[categorical] = df[categorical].astype('category')

# Numeric variables. The integer columns are cast explicitly as well: rows with
# NaN were removed above, so the cast cannot fail on missing values.
numerical_int = ['CANTIDADCOMPRA', 'UNIDADESCONSUMOCONTENIDAS']
numerical_float = ['PRECIO', 'IMPORTELINEA']
df[numerical_int] = df[numerical_int].astype('int64')
df[numerical_float] = df[numerical_float].astype('float')

df.head()

Unnamed: 0,CODIGO,FECHAPEDIDO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,TGL,PRODUCTO,REGION,HOSPITAL,DEPARTAMENTO
0,E99808,2023-01-01,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,1,2,60
1,B41691,2016-02-01,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,0,10,1
2,E64543,2016-02-01,71961/16,403770.0,20,5,12.1,48.4,Compra menor,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18,0,4,111
3,E65007,2016-02-01,72773/16,20415.0,100,50,215.325,430.65,Concurso,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11,0,10,1
4,E64911,2017-02-01,86159/17,20701.0,300,300,792.0,792.0,Concurso,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6,0,6,1


New dataset with the important variables to train/predict

In [46]:
# Build the modelling frame in one chain: keep the relevant columns, derive the
# calendar fields from the order date, drop the raw date, then add the
# per-purchased-unit price (line amount divided by purchased quantity).
new_df = (
    df[[
        'PRODUCTO', 'FECHAPEDIDO', 'TIPOCOMPRA', 'REGION', 'HOSPITAL',
        'DEPARTAMENTO', 'TGL', 'CANTIDADCOMPRA', 'UNIDADESCONSUMOCONTENIDAS',
        'PRECIO', 'IMPORTELINEA',
    ]]
    .assign(**{
        'MES': lambda frame: frame['FECHAPEDIDO'].dt.month,
        'AÑO': lambda frame: frame['FECHAPEDIDO'].dt.year,
    })
    .drop(columns='FECHAPEDIDO')
    .assign(PRECIOUNIDAD=lambda frame: frame['IMPORTELINEA'].div(frame['CANTIDADCOMPRA']))
)

# Show the resulting dtypes, then preview the first rows
print(new_df.dtypes)

new_df.head()

CODIGO                       category
NUMERO                       category
REFERENCIA                   category
CANTIDADCOMPRA                  int64
UNIDADESCONSUMOCONTENIDAS       int64
PRECIO                        float64
IMPORTELINEA                  float64
TIPOCOMPRA                   category
TGL                          category
PRODUCTO                     category
REGION                       category
HOSPITAL                     category
DEPARTAMENTO                 category
MES                             int32
AÑO                             int32
PRECIOUNIDAD                  float64
dtype: object


Unnamed: 0,CODIGO,NUMERO,REFERENCIA,CANTIDADCOMPRA,UNIDADESCONSUMOCONTENIDAS,PRECIO,IMPORTELINEA,TIPOCOMPRA,TGL,PRODUCTO,REGION,HOSPITAL,DEPARTAMENTO,MES,AÑO,PRECIOUNIDAD
0,E99808,1595724/23,178567.1,60,10,62.59,375.54,Compra menor,TRANSITO,APOSITO DE FIBRAS DE POLIACRILATO C/PLATA-3,1,2,60,1,2023,6.259
1,B41691,72714/16,400403.0,40,10,102.803729,411.214916,Compra menor,ALMACENABLE,SOLUCION P/ LIMPIEZA Y DESCONTAMINACION DE HER...,0,10,1,2,2016,10.280373
2,E64543,71961/16,403770.0,20,5,12.1,48.4,Compra menor,TRANSITO,APOSITO DE HIDROFIBRA / CINTA-18,0,4,111,2,2016,2.42
3,E65007,72773/16,20415.0,100,50,215.325,430.65,Concurso,ALMACENABLE,APOSITO DE ESPUMA POLIURETANO / SACRO-11,0,10,1,2,2016,4.3065
4,E64911,86159/17,20701.0,300,300,792.0,792.0,Concurso,ALMACENABLE,APOSITO C/ CARBON Y PLATA-6,0,6,1,2,2017,2.64


Group by year, month, product, hospital, type of purchase and TGL, summing the purchased quantity

In [16]:
# Total purchased quantity per (year, month, product, hospital, purchase type, TGL).
# Only the summed CANTIDADCOMPRA is kept, so it is the only column aggregated:
# the other aggregates were discarded immediately afterwards. Because the cell
# now reads just the grouping keys and CANTIDADCOMPRA, it can also be re-run on
# the already-aggregated frame without raising a KeyError for missing columns.
new_df = (
    new_df
    .groupby(['AÑO', 'MES', 'PRODUCTO', 'HOSPITAL', 'TIPOCOMPRA', 'TGL'], observed=True)
    .agg({'CANTIDADCOMPRA': 'sum'})
    .reset_index()
)

# Persist the aggregated frame
new_df.to_csv('../assets/new_df.csv', index=False)


Split train and test datasets

In [28]:
# Temporal hold-out: rows strictly before `split_year` are used for training,
# rows from `split_year` onwards for testing.
split_year = 2023
train = new_df[new_df['AÑO'].lt(split_year)]
test = new_df[new_df['AÑO'].ge(split_year)]

In [29]:
# Write both splits to the assets folder, without the index column
train.to_csv(path_or_buf='../assets/train.csv', index=False)
test.to_csv(path_or_buf='../assets/test.csv', index=False)

Random Forest Regressor

In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_sample_weight

# Feature groups: categoricals are one-hot encoded, calendar fields are standardized
categorical_cols = ['TIPOCOMPRA', 'PRODUCTO', 'HOSPITAL', 'TGL']
numerical_cols = ['AÑO', 'MES']

# One transformer per feature group; categories unseen during fit are ignored at predict time
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Column-wise preprocessing applied inside the pipeline
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Features / target for the training and the hold-out split
X_train, y_train = train.drop(columns='CANTIDADCOMPRA'), train['CANTIDADCOMPRA']
X_test, y_test = test.drop(columns='CANTIDADCOMPRA'), test['CANTIDADCOMPRA']

# Per-row weights that balance the training years against each other
weights = compute_sample_weight(class_weight='balanced', y=X_train['AÑO'])

# Preprocessing followed by the forest, so each CV fold fits its own transformers
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Hyperparameter grid explored by the search
param_grid = {
    'regressor__n_estimators': [50, 100, 150],
    'regressor__max_depth': [10, 20, 30, None],
}

# 3-fold grid search scored by (negated) mean squared error, using all cores
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# The `regressor__` prefix routes the sample weights to the forest step
grid_search.fit(X_train, y_train, regressor__sample_weight=weights)

print(f"Best parameters: {grid_search.best_params_}")

# Best pipeline found by the search, and its predictions on the hold-out split
best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Hold-out metrics
mse_rf = mean_squared_error(y_test, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f'MSE with Random Forest: {mse_rf}')
print(f'RMSE with Random Forest: {rmse_rf}')
print(f'R2 score with Random Forest: {r2_rf}')


Best parameters: {'regressor__max_depth': 30, 'regressor__n_estimators': 100}
MSE with Random Forest: 900554.8369206262
RMSE with Random Forest: 948.9756777286899
R2 score with Random Forest: 0.5175527145892607
