# **DATATHON 2023: NTT-DATA CHALLENGE**

## **Requirements**

In [None]:
%pip install -r requirements.txt 

## **Imports**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## **Main Program**

Read dataset

In [None]:
df = pd.read_excel('../assets/consumo_material_clean.xlsx')
df.head()

Preprocessing

In [None]:
# Drop rows with NaN values
df.dropna(inplace=True)

# Date format
df['FECHAPEDIDO'] = pd.to_datetime(df['FECHAPEDIDO'], format='%d/%m/%y')

# Split "ORIGEN" into "REGION", "HOSPITAL" and "DEPARTAMENTO"
df['ORIGEN'] = df['ORIGEN'].str.replace('--', '-')
df[['REGION', 'HOSPITAL', 'DEPARTAMENTO']] = df['ORIGEN'].str.split('-', expand=True)
df = df.drop(["ORIGEN"], axis=1)

# Categorical variables
categorical = ['CODIGO', 'PRODUCTO', 'NUMERO', 'REFERENCIA', 'TIPOCOMPRA', 'REGION', 'HOSPITAL', 'DEPARTAMENTO', 'TGL']
df[categorical] = df[categorical].astype('category')

# Numeric variables
numerical_int = ['CANTIDADCOMPRA', 'UNIDADESCONSUMOCONTENIDAS']
numerical_float = ['PRECIO', 'IMPORTELINEA']
df[numerical_float] = df[numerical_float].astype('float')

df.head()

In [None]:
new_df = df[['PRODUCTO', 'FECHAPEDIDO', 'TIPOCOMPRA', 'CANTIDADCOMPRA', 'IMPORTELINEA', 'TGL','HOSPITAL']].copy()

new_df['MES'] = new_df['FECHAPEDIDO'].dt.month
new_df['AÑO'] = new_df['FECHAPEDIDO'].dt.year
new_df = new_df.drop('FECHAPEDIDO', axis=1)

print(new_df.dtypes)

new_df.head()

In [None]:
new_df = new_df.groupby(['AÑO', 'MES', 'PRODUCTO', 'TIPOCOMPRA','HOSPITAL','TGL',], observed=True).agg({'CANTIDADCOMPRA': 'sum'}).reset_index()

new_df.to_csv('../assets/new_df.csv', index=False)


In [None]:
split_year = 2023
train = new_df.loc[new_df['AÑO'] < split_year]
test = new_df.loc[new_df['AÑO'] >= split_year]

In [None]:
train.to_csv('../assets/train.csv', index=False)
test.to_csv('../assets/test.csv', index=False)

In [86]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Identificar columnas numéricas y categóricas
num_cols = ['AÑO', 'MES','HOSPITAL', ]
cat_cols = ['PRODUCTO', 'TIPOCOMPRA', 'TGL'] # Pueden necesitar codificación
target_cols = ['CANTIDADCOMPRA']

# Dividir los datos de entrenamiento en características y objetivo
X_train = train.drop(columns=target_cols)
y_train = train[target_cols]

X_test = test.drop(columns=target_cols)
y_test = test[target_cols]

# Preprocesamiento: codificación One-Hot para variables categóricas y escalado para variables numéricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])

# Modelo: Bosque Aleatorio para regresión multivariable
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])
pipeline.fit(X_train, y_train)
y_pred_test = pipeline.predict(X_test)

mse_test = mean_squared_error(y_test, y_pred_test)
r_squared_test = r2_score(y_test, y_pred_test)
print(f"R^2 TEST: {r_squared_test}, MSE TEST: {mse_test}")

test['Predicciones_CANTIDADCOMPRA'] = y_pred_test
test.to_csv('test_con_predicciones.csv', index=False)

  return fit_method(estimator, *args, **kwargs)


R^2 TEST: 0.5645043777081817, MSE TEST: 812913.0393567446


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predicciones_CANTIDADCOMPRA'] = y_pred_test
