In [31]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")

# Load data SQL

In [32]:
def load_sql(query: str) -> pd.DataFrame:
    load_dotenv()
    db_user= os.getenv("DB_USER")
    db_password= os.getenv("DB_PASSWORD")
    db_host= os.getenv("DB_HOST")
    db_name= os.getenv("DB_NAME")
    engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}/{db_name}')
    with engine.connect() as conn:
        df = pd.read_sql_query(text(query), conn)
        return df

pd.set_option('display.float_format', lambda x: '%.2f' % x)
df = load_sql("SELECT * FROM raw.pharmacy_sales;")
df = df.sort_values(by=["distributor", "product_name", "year", "city", "month"])
df.head()

Unnamed: 0,distributor,customer_name,city,country,latitude,longitude,channel,sub_channel,product_name,product_class,quantity,price,sales,month,year,sales_rep_name,manager,sales_team
207687,Bashirian-Kassulke,Rogahn-Klein Pharma Plc,Leinfelden-Echterdingen,Germany,48.69,9.14,Pharmacy,Institution,Abatatriptan,Antibiotics,2.0,742.0,1484.0,February,2020,Stella Given,Alisha Cordwell,Charlie
187350,Bashirian-Kassulke,Runolfsson-Halvorson Pharm,Rheinberg,Germany,51.55,6.6,Pharmacy,Retail,Abranatal Lysoprosate,Antiseptics,15826.0,681.0,10777506.0,August,2019,Mary Gerrard,Britanny Bold,Delta
254078,Bashirian-Kassulke,Hane Ltd Pharmaceutical Ltd,Aichach,Germany,48.45,11.13,Hospital,Private,Abranatal Lysoprosate,Antiseptics,432.0,681.0,294192.0,December,2020,Anne Wu,Britanny Bold,Delta
175417,Bashirian-Kassulke,Doyle-Tillman Pharmaceutical Limited,Zirndorf,Germany,49.45,10.95,Pharmacy,Institution,Acantaine,Antibiotics,50.0,66.0,3300.0,June,2019,Thompson Crawford,James Goodwill,Alfa
246485,Bashirian-Kassulke,"Langworth, Olson and Satterfield Pharmacy",Meschede,Germany,51.35,8.28,Hospital,Government,Aciprex,Antipiretics,150.0,421.0,63150.0,November,2020,Thompson Crawford,James Goodwill,Alfa


In [33]:
df[['quantity', 'sales']].head(10)

Unnamed: 0,quantity,sales
207687,2.0,1484.0
187350,15826.0,10777506.0
254078,432.0,294192.0
175417,50.0,3300.0
246485,150.0,63150.0
232401,20.0,8420.0
53000,2500.0,1695000.0
254079,320.0,216960.0
158400,60.0,1440.0
188559,2000.0,48000.0


# Feature Engineering

In [34]:
features = (df.groupby(["distributor",
                    "channel",
                    "sub_channel",
                    "city",
                    "product_name",
                    "product_class",
                    "sales_team",
                    "year",
                    "month",]).agg(
                total_quantity=("quantity", "sum"),
                total_sales=("sales", "sum"),
                avg_price=("price", "mean"),
                    ).reset_index())

# Clean outliers
sales_upper_bound = features["total_sales"].quantile(0.90)
features["total_sales_clean"] = features["total_sales"].clip(lower=0, upper=sales_upper_bound)

# Split data

In [35]:
# Convert month to digit
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

features["year"] = features["year"].astype(int)
features["month"] = features["month"].map(month_mapping)

features = features.sort_values(by=["distributor", "year", "month"])

def time_split(df, split_date=(2018, 9)):
    train = df[
        (df["year"] < split_date[0]) |
        ((df["year"] == split_date[0]) & (df["month"] < split_date[1]))
    ].copy()
    test = df[
        (df["year"] > split_date[0]) |
        ((df["year"] == split_date[0]) & (df["month"] >= split_date[1]))
    ].copy()
    return train, test

train_raw, test_raw = time_split(features)

# Feature engineering

In [36]:
def create_lag_features(df, is_train=True):
    df = df.copy()

    # Sort values
    df = df.sort_values(by=["distributor", "year", "month"])
    grp = df.groupby(["distributor"])

    # Lag features - use values from previous months of q1, q2, q3, q4
    df["lag_3m_sales"] = grp["total_sales_clean"].shift(3)
    df["lag_6m_sales"] = grp["total_sales_clean"].shift(6)
    df["lag_9m_sales"] = grp["total_sales_clean"].shift(9)
    df["lag_12m_sales"] = grp["total_sales_clean"].shift(12)

    # Rolling features
    df["rolling_avg_sales_3m"] = grp["total_sales_clean"].transform(
        lambda x: x.rolling(window=3, min_periods=1).mean()
    )
    # Growth percentage
    df["sales_growth_pct"] = grp["total_sales_clean"].transform(
        lambda x: x.pct_change().shift(1) * 100
    )
    # Clean NaN
    df = df.replace([np.inf, -np.inf], np.nan)

    if is_train:
        df = df.fillna(0)
    else:
        df = df.fillna(0)

    return df

train_fe = create_lag_features(train_raw, is_train=True)
test_fe = create_lag_features(test_raw, is_train=False)

# Verify values
print("Check for data leakage:")
print(f"Train date range: {train_fe['year'].min()}-{train_fe['month'].min()} to {train_fe['year'].max()}-{train_fe['month'].max()}")
print(f"Test date range: {test_fe['year'].min()}-{test_fe['month'].min()} to {test_fe['year'].max()}-{test_fe['month'].max()}")
print(f"\nTrain shape: {train_fe.shape}")
print(f"Test shape: {test_fe.shape}")

Check for data leakage:
Train date range: 2017-1 to 2018-12
Test date range: 2018-1 to 2020-12

Train shape: (118439, 19)
Test shape: (135203, 19)


# Data Encoding

In [37]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = ["distributor", "channel", "sub_channel", "city", 
                    "product_name", "product_class", "sales_team"]

# Inisialisasi dictionary untuk menyimpan encoder
encoders = {}

# Fit encoder pada TRAIN saja, lalu transform train dan test
for col in categorical_cols:
    le = LabelEncoder()
    train_fe[col] = le.fit_transform(train_fe[col].astype(str))
    
    # Untuk test, handle unseen labels
    test_fe[col] = test_fe[col].astype(str).map(
        lambda x: le.transform([x])[0] if x in le.classes_ else -1
    )
    encoders[col] = le

# Prepare final features

In [38]:
final_features = ["avg_price", "month", "year", "distributor", 
                  "product_class", "city", "rolling_avg_sales_3m", 
                  "lag_3m_sales", "sales_growth_pct"]

X_train = train_fe[final_features]
y_train = train_fe["total_sales_clean"]

X_test = test_fe[final_features]
y_test = test_fe["total_sales_clean"]

# Filter out zero values untuk menghindari MAPE ekstrem
mask = (y_test > 1000) & (y_test < 100000)
X_test_filtered = X_test[mask]
y_test_filtered = y_test[mask]

print(f"Original test samples: {len(y_test)}")
print(f"Filtered test samples: {len(y_test_filtered)}")
print(f"Filtered test y range: {y_test_filtered.min():.2f} - {y_test_filtered.max():.2f}")

Original test samples: 135203
Filtered test samples: 113171
Filtered test y range: 1002.00 - 76800.00


# Scaling

In [39]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test_filtered)

X_train_df = pd.DataFrame(X_train_scaled, columns=final_features, index=X_train.index)
X_test_df = pd.DataFrame(X_test_scaled, columns=final_features, index=X_test_filtered.index)

# Allign y_train with X_train_df index
y_train_aligned = y_train.loc[X_train_df.index]

print(f"X_train shape: {X_train_df.shape}")
print(f"y_train shape: {y_train_aligned.shape}")
print(f"X_test shape: {X_test_df.shape}")
print(f"y_test shape: {y_test_filtered.shape}")

X_train shape: (118439, 9)
y_train shape: (118439,)
X_test shape: (113171, 9)
y_test shape: (113171,)


# Training & Evaluation Machine learning models

In [40]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

models = {
    "Linear Regression": LinearRegression(),
    "XGBoost Regressor": XGBRegressor(objective='reg:squarederror', random_state=42, n_estimators=100),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10, n_jobs=-1),
    "CatBoost Regressor": CatBoostRegressor(iterations=200, depth=6, verbose=0, random_state=42)
}

def evaluate_model(y_true, y_pred):
    y_pred_clipped = np.clip(y_pred, 0, None)
    
    mae = mean_absolute_error(y_true, y_pred_clipped)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred_clipped))
    r2 = r2_score(y_true, y_pred_clipped)
    
    # Hitung MAPE secara manual untuk kontrol lebih baik
    mape = np.mean(np.abs((y_true - y_pred_clipped) / y_true)) * 100
    
    return {
        "MAE": mae,
        "RMSE": rmse,
        "R2 Score": r2,
        "MAPE (%)": mape
    }

results = {}
for model_name, model in models.items():
    print(f"Training {model_name}...")
    model.fit(X_train_df, y_train_aligned)
    y_pred = model.predict(X_test_df)
    results[model_name] = evaluate_model(y_test_filtered, y_pred)

results_df = pd.DataFrame(results).T.sort_values(by="MAE")
print(f"\nResults (Fixed):\n{results_df.to_string()}")

Training Linear Regression...
Training XGBoost Regressor...
Training Random Forest Regressor...
Training CatBoost Regressor...

Results (Fixed):
                             MAE     RMSE  R2 Score  MAPE (%)
CatBoost Regressor      10657.24 16064.19      0.61    147.13
XGBoost Regressor       10772.44 16305.41      0.60    147.95
Random Forest Regressor 10803.21 16262.56      0.60    151.52
Linear Regression       14498.39 19877.66      0.41    218.37
