In [5]:
import pandas as pd
from minio import Minio
from io import BytesIO
import os
from dotenv import load_dotenv
import warnings
import numpy as np
warnings.filterwarnings("ignore")

# Load data inference from MinIO

In [2]:
env_path = os.path.join("..", ".env")
load_dotenv(env_path)

# Create MinIO client environment
client = Minio(
    f"{os.getenv('MINIO_HOST')}:{os.getenv('MINIO_PORT')}",
    access_key=os.getenv("MINIO_ACCESS_KEY"),
    secret_key=os.getenv("MINIO_SECRET_KEY"),
    secure=False
)

In [3]:
# Load from gold bucket
def load_from_minio(bucket_name: str) -> pd.DataFrame:
    objects = list(client.list_objects(bucket_name))

    if not objects:
        print(f"No objects found in bucket '{bucket_name}'.")
        return None
    
    # Get latest file
    latest = sorted(objects, key=lambda x: x.last_modified, reverse=True)[0]

    # Read data
    response = client.get_object(bucket_name, latest.object_name)
    df = pd.read_parquet(BytesIO(response.read()))

    # Close response
    response.close()
    response.release_conn()

    print(f"loaded {latest.object_name} from bucket '{bucket_name}'")
    return df

In [4]:
pd.set_option('display.max_columns', None)
df = load_from_minio("gold")
df

KeyboardInterrupt: 

In [5]:
load_from_minio("silver")

loaded pharmacy_sales_20260112_210851_9.parquet from bucket 'silver'


Unnamed: 0,distributor,customer_name,city,country,latitude,longitude,channel,sub_channel,product_name,product_class,quantity,price,sales,month,year,sales_rep_name,manager,sales_team
0,Beier,"Zieme, Doyle and Kunze",Lublin,Poland,51.2333,22.5667,Hospital,Private,Kinenadryl,Antipiretics,3.0,782.0,2346.0,August,2018,Morris Garcia,Tracy Banks,Bravo
1,Beier,"Heathcote, Grant and Witting Pharm",Bielsko-Biała,Poland,49.8225,19.0444,Pharmacy,Institution,Abobozolid,Antimalarial,100.0,75.0,7500.0,August,2018,Abigail Thompson,Tracy Banks,Bravo
2,Beier,Lockman-Welch Pharm,Bytom,Poland,50.3470,18.9230,Hospital,Government,Aggretisol,Antimalarial,25.0,262.0,6550.0,August,2018,Sheila Stones,Britanny Bold,Delta
3,Beier,Hansen Group Pharm,Gryfów Śląski,Poland,51.0308,15.4202,Hospital,Private,Afinitasol,Antipiretics,20.0,286.0,5720.0,August,2018,Mary Gerrard,Britanny Bold,Delta
4,Beier,Rutherford and Sons,Gdynia,Poland,54.5189,18.5319,Pharmacy,Retail,Rebedase Aplencor,Mood Stabilizers,8.0,180.0,1440.0,September,2018,Anne Wu,Britanny Bold,Delta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Gerlach LLC,"Hammes, Bashirian and Pacocha Pharmacy",Schwelm,Germany,51.2904,7.2972,Pharmacy,Institution,Pazofenac,Mood Stabilizers,2.0,123.0,246.0,January,2017,Abigail Thompson,Tracy Banks,Bravo
996,Gerlach LLC,Kozey-Emmerich,Würselen,Germany,50.8247,6.1275,Hospital,Government,Effigine Propraprex,Antimalarial,3.0,377.0,1131.0,January,2017,Stella Given,Alisha Cordwell,Charlie
997,Gerlach LLC,"Koelpin, Luettgen and Abernathy Pharmaceutical...",Mörfelden-Walldorf,Germany,49.9896,8.5661,Hospital,Private,Temasone Thiobucil,Antimalarial,1.0,482.0,482.0,January,2017,Morris Garcia,Tracy Banks,Bravo
998,Gerlach LLC,"Hills, Stroman and Ernser",Rheda-Wiedenbrück,Germany,51.8417,8.3000,Pharmacy,Retail,Araxetine,Antibiotics,5.0,450.0,2250.0,January,2017,Jessica Smith,Britanny Bold,Delta


In [6]:
load_from_minio("bronze")

loaded pharmacy_sales_20260112_210851_9.parquet from bucket 'bronze'


Unnamed: 0,distributor,customer_name,city,country,latitude,longitude,channel,sub_channel,product_name,product_class,quantity,price,sales,month,year,sales_rep_name,manager,sales_team
0,Beier,"Zieme, Doyle and Kunze",Lublin,Poland,51.2333,22.5667,Hospital,Private,Kinenadryl,Antipiretics,3.0,782.0,2346.0,August,2018,Morris Garcia,Tracy Banks,Bravo
1,Beier,"Heathcote, Grant and Witting Pharm",Bielsko-Biała,Poland,49.8225,19.0444,Pharmacy,Institution,Abobozolid,Antimalarial,100.0,75.0,7500.0,August,2018,Abigail Thompson,Tracy Banks,Bravo
2,Beier,Lockman-Welch Pharm,Bytom,Poland,50.3470,18.9230,Hospital,Government,Aggretisol,Antimalarial,25.0,262.0,6550.0,August,2018,Sheila Stones,Britanny Bold,Delta
3,Beier,Hansen Group Pharm,Gryfów Śląski,Poland,51.0308,15.4202,Hospital,Private,Afinitasol,Antipiretics,20.0,286.0,5720.0,August,2018,Mary Gerrard,Britanny Bold,Delta
4,Beier,Rutherford and Sons,Gdynia,Poland,54.5189,18.5319,Pharmacy,Retail,Rebedase Aplencor,Mood Stabilizers,8.0,180.0,1440.0,September,2018,Anne Wu,Britanny Bold,Delta
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Gerlach LLC,"Hammes, Bashirian and Pacocha Pharmacy",Schwelm,Germany,51.2904,7.2972,Pharmacy,Institution,Pazofenac,Mood Stabilizers,2.0,123.0,246.0,January,2017,Abigail Thompson,Tracy Banks,Bravo
996,Gerlach LLC,Kozey-Emmerich,Würselen,Germany,50.8247,6.1275,Hospital,Government,Effigine Propraprex,Antimalarial,3.0,377.0,1131.0,January,2017,Stella Given,Alisha Cordwell,Charlie
997,Gerlach LLC,"Koelpin, Luettgen and Abernathy Pharmaceutical...",Mörfelden-Walldorf,Germany,49.9896,8.5661,Hospital,Private,Temasone Thiobucil,Antimalarial,1.0,482.0,482.0,January,2017,Morris Garcia,Tracy Banks,Bravo
998,Gerlach LLC,"Hills, Stroman and Ernser",Rheda-Wiedenbrück,Germany,51.8417,8.3000,Pharmacy,Retail,Araxetine,Antibiotics,5.0,450.0,2250.0,January,2017,Jessica Smith,Britanny Bold,Delta


In [7]:
load_from_minio("analytics")

loaded sales_analytics_20260112_211119.parquet from bucket 'analytics'


Unnamed: 0,distributor,city,year,month,total_quantity,total_sales,avg_price,product_count,sales_growth_pct_avg
0,Bashirian-Kassulke,Cloppenburg,2017,1,3272.0,1534568.0,469.0,1,0.0
1,Bashirian-Kassulke,Ehingen an der Donau,2017,3,250.0,76750.0,307.0,1,0.0
2,Bashirian-Kassulke,Geretsried,2017,3,250.0,192000.0,768.0,1,0.0
3,Bashirian-Kassulke,Homburg,2017,3,2500.0,265000.0,106.0,1,0.0
4,Bashirian-Kassulke,Kirchheim unter Teck,2017,1,6886.0,1287682.0,187.0,1,0.0
...,...,...,...,...,...,...,...,...,...
20842,Welch-Langworth,Sömmerda,2017,4,170.0,126820.0,746.0,1,0.0
20843,Welch-Langworth,Wiesloch,2017,4,3200.0,83200.0,26.0,1,0.0
20844,Welch-Langworth,Wolfsburg,2017,4,320.0,48960.0,153.0,1,0.0
20845,Welch-Langworth,Wuppertal,2017,1,44.0,10056.0,136.5,2,0.0


In [8]:
# Read chunk parquet
def read_parquet_in_chunks(bucket_name: str, chunk_size: int):
    objects = list(client.list_objects(bucket_name))

    if not objects:
        print(f"No objects found in bucket '{bucket_name}'.")
        return
    
    for obj in objects:
        response = client.get_object(bucket_name, obj.object_name)
        df = pd.read_parquet(BytesIO(response.read()), engine='pyarrow')

        # Close response
        response.close()
        response.release_conn()

        for start in range(0, len(df), chunk_size):
            yield df[start:start + chunk_size]

for chunk in read_parquet_in_chunks("gold", chunk_size=max(1, len(df)//5)):
    print(chunk)
    break  # Remove this line to process all chunks

pd.DataFrame(chunk)

           distributor   channel sub_channel       city          product_name  \
0        Carter-Conn    Pharmacy      Retail  Biskupiec            Symbiroban   
1        Carter-Conn    Hospital     Private    Brwinów  Morphizolid Tianalin   
2        Carter-Conn    Hospital     Private  Brzeszcze   Pulmodiol Adalaxime   
3        Carter-Conn    Pharmacy      Retail    Brzozów             Victomine   
4        Carter-Conn    Hospital     Private      Bytów    Exotropin Empizine   
...                ...       ...         ...        ...                   ...   
2991        Smith Inc   Pharmacy      Retail       Żary     Dexacilin Triline   
2992        Smith Inc   Pharmacy      Retail       Żary      Docstryl Rivacin   
2993        Smith Inc   Pharmacy      Retail       Żary             Penitrana   
2994        Smith Inc   Hospital  Government    Żychlin            Cyclovital   
2995  Stehr-Champlin    Pharmacy      Retail    Świecie              Diaxolol   

         product_class sale

Unnamed: 0,distributor,channel,sub_channel,city,product_name,product_class,sales_team,year,month,total_quantity,total_sales,avg_price,total_sales_clean,lag_1m_sales,lag_3m_sales,lag_6m_sales,rolling_avg_3m,rolling_avg_6m,sales_growth_pct,month_sin,month_cos
0,Carter-Conn,Pharmacy,Retail,Biskupiec,Symbiroban,Antibiotics,Delta,2018,1,80.0,9760.0,122.0,9760.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
1,Carter-Conn,Hospital,Private,Brwinów,Morphizolid Tianalin,Mood Stabilizers,Alfa,2018,1,50.0,37100.0,742.0,37100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
2,Carter-Conn,Hospital,Private,Brzeszcze,Pulmodiol Adalaxime,Analgesics,Delta,2018,1,10.0,6050.0,605.0,6050.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
3,Carter-Conn,Pharmacy,Retail,Brzozów,Victomine,Antimalarial,Delta,2018,1,70.0,49350.0,705.0,49350.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
4,Carter-Conn,Hospital,Private,Bytów,Exotropin Empizine,Mood Stabilizers,Alfa,2018,1,8.0,6280.0,785.0,6280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2991,Smith Inc,Pharmacy,Retail,Żary,Dexacilin Triline,Analgesics,Bravo,2018,1,2.0,258.0,129.0,258.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
2992,Smith Inc,Pharmacy,Retail,Żary,Docstryl Rivacin,Antiseptics,Delta,2018,1,4.0,3156.0,789.0,3156.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
2993,Smith Inc,Pharmacy,Retail,Żary,Penitrana,Analgesics,Alfa,2018,1,36.0,6444.0,179.0,6444.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025
2994,Smith Inc,Hospital,Government,Żychlin,Cyclovital,Antimalarial,Charlie,2018,1,10.0,6770.0,677.0,6770.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.866025


In [9]:
df.describe()

Unnamed: 0,year,month,total_quantity,total_sales,avg_price,total_sales_clean,lag_1m_sales,lag_3m_sales,lag_6m_sales,rolling_avg_3m,rolling_avg_6m,sales_growth_pct,month_sin,month_cos
count,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0,129659.0
mean,2017.846428,5.846459,51.842993,21092.66,411.078568,12757.907206,98.297991,0.0,0.0,98.289114,98.289114,0.006294,0.123278,0.06181919
std,0.360539,3.55334,216.864986,99713.65,225.018148,25781.468471,2529.713947,0.0,0.0,2523.015843,2523.015843,1.649807,0.722545,0.677434
min,2017.0,1.0,0.0,0.0,22.0,50.0,0.0,0.0,0.0,0.0,0.0,-93.896104,-1.0,-1.0
25%,2018.0,3.0,3.0,1002.0,195.0,1002.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.5,-0.5
50%,2018.0,5.0,10.0,3068.0,430.0,3068.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,6.123234000000001e-17
75%,2018.0,9.0,30.0,11250.0,605.0,11250.0,0.0,0.0,0.0,0.0,0.0,0.0,0.866025,0.8660254
max,2018.0,12.0,20000.0,9216000.0,794.0,211260.0,211260.0,0.0,0.0,211260.0,211260.0,400.0,1.0,1.0


In [10]:
df.isnull().sum()

distributor          0
channel              0
sub_channel          0
city                 0
product_name         0
product_class        0
sales_team           0
year                 0
month                0
total_quantity       0
total_sales          0
avg_price            0
total_sales_clean    0
lag_1m_sales         0
lag_3m_sales         0
lag_6m_sales         0
rolling_avg_3m       0
rolling_avg_6m       0
sales_growth_pct     0
month_sin            0
month_cos            0
dtype: int64

# Feature Engineering