<a href="https://colab.research.google.com/github/KiaraSN/lab-aws-sagemaker-canvas-estoque/blob/main/SageMake_canvas_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# train_model.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import json

# 1. Load
df = pd.read_csv("/content/dio/dataset-500-curso-sagemaker-canvas-dio.csv")

# 2. Choose target
target = "QUANTIDADE_ESTOQUE"

# 3. Parse date (DIA)
df["DIA"] = pd.to_datetime(df["DIA"], dayfirst=True, errors='coerce')
df["DIA_year"] = df["DIA"].dt.year
df["DIA_month"] = df["DIA"].dt.month
df["DIA_day"] = df["DIA"].dt.day
df["DIA_weekday"] = df["DIA"].dt.weekday
df = df.drop(columns=["DIA"])

# 4. Features / target
X = df.drop(columns=[target])
y = df[target]

numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# 5. Preprocessing & pipeline
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])
categorical_transformer = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                                   ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

preprocessor = ColumnTransformer([('num', numeric_transformer, numeric_features),
                                  ('cat', categorical_transformer, categorical_features)], remainder='drop')

model = Pipeline([('preproc', preprocessor),
                  ('rf', RandomForestRegressor(n_estimators=100, random_state=42))])

# 6. Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Fit
model.fit(X_train, y_train)

# 8. Predict + metrics
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# 9. Save results
results = X_test.reset_index(drop=True)
results["y_true"] = y_test.reset_index(drop=True)
results["y_pred"] = y_pred
results.to_csv("predictions.csv", index=False)

report = {
    "dataset_shape": df.shape,
    "target": target,
    "metrics": {"mae": mae, "rmse": rmse, "r2": r2}
}
with open("model_report.json", "w") as f:
    json.dump(report, f, indent=2)
print("Done. Metrics:", report)

  df["DIA"] = pd.to_datetime(df["DIA"], dayfirst=True, errors='coerce')


Done. Metrics: {'dataset_shape': (500, 7), 'target': 'QUANTIDADE_ESTOQUE', 'metrics': {'mae': 7.1322, 'rmse': np.float64(10.338926346579706), 'r2': 0.869556470709836}}


In [6]:
print(f"MAE (Mean Absolute Error): {report['metrics']['mae']:.4f}")
print(f"RMSE (Root Mean Squared Error): {report['metrics']['rmse']:.4f}")
print(f"R2 (R-squared): {report['metrics']['r2']:.4f}")

MAE (Mean Absolute Error): 7.1322
RMSE (Root Mean Squared Error): 10.3389
R2 (R-squared): 0.8696
