In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")

In [2]:
def load_sql(query: str) -> pd.DataFrame:
    """Run a SQL query against the configured PostgreSQL database.

    Connection settings are read from the environment (after loading a
    ``.env`` file via ``load_dotenv``): ``DB_USER``, ``DB_HOST`` and
    ``DB_NAME`` are required, ``DB_PASSWORD`` is optional.

    Args:
        query: SQL text to execute; it is wrapped in ``sqlalchemy.text``.

    Returns:
        The query result as a DataFrame.

    Raises:
        RuntimeError: if a required environment variable is not set.
    """
    # Local import so the cell does not depend on an edit to the import cell.
    from urllib.parse import quote

    load_dotenv()
    db_user = os.getenv("DB_USER")
    db_password = os.getenv("DB_PASSWORD")
    db_host = os.getenv("DB_HOST")
    db_name = os.getenv("DB_NAME")

    # An unset variable would otherwise be formatted into the URL as the
    # literal text "None" and only fail later with an opaque connection error.
    required = {"DB_USER": db_user, "DB_HOST": db_host, "DB_NAME": db_name}
    missing = [name for name, value in required.items() if value is None]
    if missing:
        raise RuntimeError(
            "Missing database environment variable(s): " + ", ".join(missing)
        )

    # Percent-encode the credentials: characters such as '@', ':' or '/'
    # inside a user name or password would otherwise corrupt the URL.
    credentials = quote(db_user, safe="")
    if db_password is not None:
        credentials += ":" + quote(db_password, safe="")

    engine = create_engine(f"postgresql://{credentials}@{db_host}/{db_name}")
    try:
        with engine.connect() as conn:
            return pd.read_sql_query(text(query), conn)
    finally:
        # A fresh engine is created on every call; release its connection
        # pool so repeated calls do not accumulate open connections.
        engine.dispose()

# Render floats with two decimals in every DataFrame display.
pd.set_option("display.float_format", "{:.2f}".format)

# Pull the raw sales table and order it for the preview below.
# "month" still holds month names at this point, so ordering by it is
# alphabetical rather than chronological.
df = load_sql("SELECT * FROM raw.pharmacy_sales;").sort_values(
    by=["distributor", "product_name", "year", "city", "month"]
)
df.head()

Unnamed: 0,distributor,customer_name,city,country,latitude,longitude,channel,sub_channel,product_name,product_class,quantity,price,sales,month,year,sales_rep_name,manager,sales_team
207687,Bashirian-Kassulke,Rogahn-Klein Pharma Plc,Leinfelden-Echterdingen,Germany,48.69,9.14,Pharmacy,Institution,Abatatriptan,Antibiotics,2.0,742.0,1484.0,February,2020,Stella Given,Alisha Cordwell,Charlie
187350,Bashirian-Kassulke,Runolfsson-Halvorson Pharm,Rheinberg,Germany,51.55,6.6,Pharmacy,Retail,Abranatal Lysoprosate,Antiseptics,15826.0,681.0,10777506.0,August,2019,Mary Gerrard,Britanny Bold,Delta
254078,Bashirian-Kassulke,Hane Ltd Pharmaceutical Ltd,Aichach,Germany,48.45,11.13,Hospital,Private,Abranatal Lysoprosate,Antiseptics,432.0,681.0,294192.0,December,2020,Anne Wu,Britanny Bold,Delta
175417,Bashirian-Kassulke,Doyle-Tillman Pharmaceutical Limited,Zirndorf,Germany,49.45,10.95,Pharmacy,Institution,Acantaine,Antibiotics,50.0,66.0,3300.0,June,2019,Thompson Crawford,James Goodwill,Alfa
246485,Bashirian-Kassulke,"Langworth, Olson and Satterfield Pharmacy",Meschede,Germany,51.35,8.28,Hospital,Government,Aciprex,Antipiretics,150.0,421.0,63150.0,November,2020,Thompson Crawford,James Goodwill,Alfa


In [3]:
# 2. AGGREGATE (SIMPLIFIED)
# One row per distributor / product / year / month with summed volume and
# revenue, the mean price, and the number of distinct cities and channels.
features = (
    df.groupby(["distributor", "product_name", "year", "month"])
    .agg(
        total_quantity=pd.NamedAgg(column="quantity", aggfunc="sum"),
        total_sales=pd.NamedAgg(column="sales", aggfunc="sum"),
        avg_price=pd.NamedAgg(column="price", aggfunc="mean"),
        num_cities=pd.NamedAgg(column="city", aggfunc="nunique"),
        num_channels=pd.NamedAgg(column="channel", aggfunc="nunique"),
    )
    .reset_index()
)

# Clean outliers
# Cap monthly sales at their 90th percentile and floor them at zero.
# NOTE(review): the percentile is taken over every aggregated row, i.e.
# before the later train/test split - confirm that using the later months
# to set the cap is acceptable.
sales_upper_bound = features["total_sales"].quantile(0.90)
features["total_sales_clean"] = features["total_sales"].clip(0, sales_upper_bound)

In [4]:
# 3. CONVERT MONTH & SORT
month_mapping = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}

# Translate month names to 1..12. Values that are already numeric (which is
# what the column holds if this cell is executed a second time) are kept
# as-is, so re-running the cell no longer turns every month into NaN.
month_number = features["month"].map(month_mapping)
month_number = month_number.fillna(
    pd.to_numeric(features["month"], errors="coerce")
)

# Fail loudly on anything that is not a known month: a NaN month would
# otherwise slip through silently into the time split and lag features.
unrecognised = ~month_number.isin(list(range(1, 13)))
if unrecognised.any():
    bad_values = features.loc[unrecognised, "month"].unique()[:10].tolist()
    raise ValueError(
        f"Unrecognised month values in features['month']: {bad_values}"
    )

features["month"] = month_number.astype("int64")
features = features.sort_values(by=["distributor", "product_name", "year", "month"])

In [5]:
# 4. TIME SPLIT
def time_split(df, split_date=(2018, 9)):
    """Split a frame chronologically on a (year, month) cutoff.

    Args:
        df: frame with comparable "year" and "month" columns.
        split_date: ``(year, month)`` pair; the cutoff month itself goes to
            the second (test) frame.

    Returns:
        ``(train, test)`` - independent copies holding the rows strictly
        before the cutoff and the rows at or after it, respectively.
    """
    cutoff_year = split_date[0]
    cutoff_month = split_date[1]
    years = df["year"]
    months = df["month"]
    in_cutoff_year = years == cutoff_year

    is_before = (years < cutoff_year) | (in_cutoff_year & (months < cutoff_month))
    is_on_or_after = (years > cutoff_year) | (in_cutoff_year & (months >= cutoff_month))

    return df[is_before].copy(), df[is_on_or_after].copy()

train_raw, test_raw = time_split(features)

# A row whose year/month comparisons are false on both sides of the cutoff
# (for example a missing year) lands in neither frame; make that visible
# instead of silently shrinking the data.
assert len(train_raw) + len(test_raw) == len(features), (
    f"{len(features) - len(train_raw) - len(test_raw)} row(s) were assigned to "
    "neither train nor test - check 'year' and 'month' for missing values"
)

In [6]:
# 5. LAG FEATURES (SIMPLIFIED)
def lag_features(df, is_train=True):
    df = df.copy()
    df = df.sort_values(by=["distributor", "product_name", "year", "month"])
    grp = df.groupby(["distributor", "product_name"])
    
    # Lag features
    df["lag_1m_sales"] = grp["total_sales_clean"].shift(1)
    df["lag_3m_sales"] = grp["total_sales_clean"].shift(3)
    df["lag_6m_sales"] = grp["total_sales_clean"].shift(6)
    
    # Rolling
    df["rolling_avg_3m"] = grp["total_sales_clean"].transform(
        lambda x: x.shift(1).rolling(window=3, min_periods=1).mean()
    )
    
    # Growth
    df["sales_growth_pct"] = grp["total_sales_clean"].transform(
        lambda x: x.pct_change().shift(1) * 100
    )
    
    # Seasonal
    df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
    df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
    
    # Clean NaN
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0)
    
    return df

# Each frame is processed on its own: the earliest rows of every
# (distributor, product_name) group in a frame have no preceding rows to
# shift from, so their lag columns come back as 0 after the NaN fill.
train_fe = lag_features(df=train_raw, is_train=True)
test_fe = lag_features(df=test_raw, is_train=False)

In [7]:
# 6. CHECK DATA QUALITY
# Report the frame sizes and how much of the training lag columns is zero.
# Zero is also the fill value for lags with no history, so these shares
# include both true zeros and filled gaps.
print("\n=== DATA QUALITY CHECK ===")
print(f"Train shape: {train_fe.shape}")
print(f"Test shape: {test_fe.shape}")

print("\nPercentage of zeros in lag features:")
zero_share = (
    train_fe[["lag_1m_sales", "lag_3m_sales", "rolling_avg_3m"]].eq(0).sum()
    / len(train_fe)
    * 100
)
for col, zero_pct in zero_share.items():
    print(f"  {col}: {zero_pct:.1f}%")


=== DATA QUALITY CHECK ===
Train shape: (32836, 17)
Test shape: (27742, 17)

Percentage of zeros in lag features:
  lag_1m_sales: 17.1%
  lag_3m_sales: 42.6%
  rolling_avg_3m: 16.7%
