<a href="https://colab.research.google.com/github/KiaraSN/lab-aws-sagemaker-canvas-estoque/blob/main/SageMake_Canvas_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
...
import matplotlib.pyplot as plt


In [None]:
# Path to the source dataset, relative to the notebook's working directory.
csv_path = "dio/dataset-500-curso-sagemaker-canvas-dio.csv"
# Read the CSV into the DataFrame that the following cells operate on.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# As written this is a list holding only the Ellipsis object — presumably a
# placeholder for the real candidate target-column names; TODO confirm.
candidate_names = [...]
# Lower-cased copies of the DataFrame's column names (presumably for
# case-insensitive matching against candidate_names — verify).
columns_lower = [c.lower() for c in df.columns]
# We look for common names and, if none exists, we take the last numeric column.
# NOTE(review): no code in this cell performs that lookup — confirm it was not lost.


In [None]:
# Column that the regression model is trained to predict.
target_col = 'QUANTIDADE_ESTOQUE'
# Display summary statistics of the target as this cell's output.
target_series = df[target_col]
target_series.describe()

Unnamed: 0,QUANTIDADE_ESTOQUE
count,500.0
mean,33.35
std,26.869079
min,0.0
25%,7.0
50%,32.0
75%,56.0
max,97.0


In [None]:
# Columns whose lower-cased name contains one of these date-related substrings.
date_keywords = ['date', 'data', 'mes', 'dia', 'year', 'ano']
date_candidates = [
    col
    for col in df.columns
    if any(keyword in col.lower() for keyword in date_keywords)
]

# Parse DIA as a datetime, derive one column per calendar part, and then
# remove the raw DIA column. The guard skips all of this when DIA is absent.
if 'DIA' in df.columns:
    df['DIA'] = pd.to_datetime(df['DIA'])
    dia_parts = df['DIA'].dt
    # weekday follows the pandas convention: Monday=0, Sunday=6.
    for part in ('year', 'month', 'day', 'weekday'):
        df[f'DIA_{part}'] = getattr(dia_parts, part)
    df = df.drop(columns=['DIA'])

In [None]:
# df_proc is just another name for df (no copy is made). The original author
# added this alias as a stop-gap for a NameError; it should eventually hold
# the output of a dedicated preprocessing step.
df_proc = df
# Target vector: the single column named by target_col.
y = df_proc[target_col]
# Feature matrix: every remaining column.
X = df_proc.drop(columns=[target_col])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np

# Split the feature columns by dtype: numeric ones vs. object (string-like) ones.
numeric_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include='object').columns.tolist()

# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

# Object columns: fill missing values with the most frequent value, then
# one-hot encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group to its transformer. Columns that belong to neither
# group are passed through unchanged (remainder='passthrough').
column_steps = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees; the fixed seed makes training repeatable.
forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# End-to-end estimator: preprocess the features, then fit the regressor.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', forest_regressor),
    ]
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Fit on the training rows, then predict the held-out rows
# (Pipeline.fit returns the pipeline itself, so the calls can be chained).
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from numpy import sqrt

# Mean absolute error between the held-out targets and the predictions.
mae = mean_absolute_error(y_test, y_pred)
# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
# Coefficient of determination of the predictions.
r2 = r2_score(y_test, y_pred)

In [None]:
import os

# Ensure the output directory exists before anything is written to it.
os.makedirs('/mnt/data', exist_ok=True)

# Single-column table of predictions, in the order returned by model.predict.
results = pd.DataFrame(y_pred, columns=['predictions'])

# Write the predictions without the DataFrame index.
results.to_csv("/mnt/data/predictions.csv", index=False)

In [35]:
# Gather the evaluation metrics into one nested report structure.
report = {'metrics': dict(mae=mae, rmse=rmse, r2=r2)}

# Print each metric rounded to four decimal places.
metrics = report['metrics']
print(f"MAE (Mean Absolute Error): {metrics['mae']:.4f}")
print(f"RMSE (Root Mean Squared Error): {metrics['rmse']:.4f}")
print(f"R2 (R-squared): {metrics['r2']:.4f}")

MAE (Mean Absolute Error): 7.1322
RMSE (Root Mean Squared Error): 10.3389
R2 (R-squared): 0.8696
