In [None]:
# Start/attach to your cluster, then point this Spark session at ADLS Gen2
# using a fixed SAS token and load the engineered feature table.
storage_account_name = "< your storage_account_name>"
sas_token = "<your SAS token>"  # placeholder only -- never commit a real token

# Per-account ABFS settings applied below: the auth type, the SAS token
# provider class, and the token value itself.
abfs_settings = (
    (f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net", "SAS"),
    (f"fs.azure.sas.token.provider.type.{storage_account_name}.dfs.core.windows.net",
     "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider"),
    (f"fs.azure.sas.fixed.token.{storage_account_name}.dfs.core.windows.net", sas_token),
)
for conf_key, conf_value in abfs_settings:
    spark.conf.set(conf_key, conf_value)

# Read the feature parquet from the container and take a quick look at it:
# first five rows, then the column schema.
feature_path = f"abfss://<your first container name>@{storage_account_name}.dfs.core.windows.net/cpg_features.parquet"
df_features = spark.read.parquet(feature_path)
df_features.show(5)
df_features.printSchema()


In [None]:
from pyspark.sql.functions import col

# Time-based split: rows strictly before the cutoff are used for training,
# rows on or after it for testing. Both filters share ONE cutoff so the two
# sets cannot overlap (an overlapping test window would leak training rows
# into the evaluation and inflate the reported metrics).
split_date = "2023-03-01"
train_df = df_features.filter(col("sales_date") < split_date)
test_df = df_features.filter(col("sales_date") >= split_date)

# Each count() triggers a Spark job; printed as a sanity check on the split.
print("Train rows:", train_df.count())
print("Test rows:", test_df.count())


In [None]:
# Materialise both Spark splits as pandas DataFrames (train first, then test).
# toPandas() collects every row onto the driver, so the splits must fit in
# driver memory.
train_pd, test_pd = (split_sdf.toPandas() for split_sdf in (train_df, test_df))

In [None]:
# Column the models predict.
target_col = "sales_qty"

# Predictor columns fed to both models, in this order.
feature_cols = [
    "day_of_week",
    "week_of_year",
    "month",
    "year",
    "is_holiday",
    "is_promo_active",
    "rolling_7d_qty",
]

# Slice each pandas split into a feature matrix (X) and a target vector (y).
X_train, y_train = train_pd[feature_cols], train_pd[target_col]
X_test, y_test = test_pd[feature_cols], test_pd[target_col]

In [None]:
# USING SKLEARN
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fail fast with a clear message when there is nothing to evaluate on.
if X_test is None or not X_test.shape[0]:
    raise ValueError("X_test is empty. Please provide a valid test dataset.")

# Build the forest (100 trees, fixed seed for repeatable results) and train it
# on the training split; fit() returns the fitted estimator itself.
rf_params = {"n_estimators": 100, "random_state": 42}
model = RandomForestRegressor(**rf_params).fit(X_train, y_train)

# Score the held-out split.
y_pred = model.predict(X_test)

# Report mean squared error and R^2 on the test split.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}, R2: {r2:.2f}")

In [None]:
#USING TENSORFLOW
%pip install --upgrade typing_extensions

import tensorflow as tf
from tensorflow.keras import layers

model_tf = tf.keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(len(feature_cols),)),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)  # single numeric output
])

model_tf.compile(optimizer='adam', loss='mse')

# Convert X_train, y_train to numpy arrays
X_train_np = X_train.values
y_train_np = y_train.values
X_test_np = X_test.values
y_test_np = y_test.values

model_tf.fit(X_train_np, y_train_np, validation_split=0.1, epochs=10, batch_size=32)

# Evaluate
loss = model_tf.evaluate(X_test_np, y_test_np)
print("Test MSE:", loss)

In [None]:
import joblib
import os

# Serialise the fitted random-forest model to the driver's local disk first.
local_path = "/tmp/cpg_forecast_rf.pkl"
joblib.dump(model, local_path)

# Then copy from the local driver to ADLS via dbutils.
# The destination URI is built from `container_name`; the previous f-string
# contained `{your first container name}`, which is not a valid expression and
# made the whole cell fail with a SyntaxError.
# NOTE(review): the "model-store" container must already exist in the storage
# account -- change `container_name` if the model should live elsewhere.
container_name = "model-store"
storage_path = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/cpg_forecast_rf.pkl"

# The "file:" scheme tells dbutils the source is on the driver's local
# filesystem; the destination is the abfss:// URI built above.
dbutils.fs.cp(f"file:{local_path}", storage_path)