In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv
import warnings
warnings.filterwarnings("ignore")

# Load data SQL

In [2]:
def load_sql(query: str) -> pd.DataFrame:
    load_dotenv()
    db_user= os.getenv("DB_USER")
    db_password= os.getenv("DB_PASSWORD")
    db_host= os.getenv("DB_HOST")
    db_name= os.getenv("DB_NAME")
    engine = create_engine(f'postgresql://{db_user}:{db_password}@{db_host}/{db_name}')
    with engine.connect() as conn:
        df = pd.read_sql_query(text(query), conn)
        return df

df = load_sql("SELECT * FROM raw.pharmacy_sales;")
df = df.sort_values(by=["distributor", "product_name", "year", "city", "month"])
df.head()

Unnamed: 0,distributor,customer_name,city,country,latitude,longitude,channel,sub_channel,product_name,product_class,quantity,price,sales,month,year,sales_rep_name,manager,sales_team
207687,Bashirian-Kassulke,Rogahn-Klein Pharma Plc,Leinfelden-Echterdingen,Germany,48.6928,9.1428,Pharmacy,Institution,Abatatriptan,Antibiotics,2.0,742.0,1484.0,February,2020,Stella Given,Alisha Cordwell,Charlie
187350,Bashirian-Kassulke,Runolfsson-Halvorson Pharm,Rheinberg,Germany,51.5467,6.6006,Pharmacy,Retail,Abranatal Lysoprosate,Antiseptics,15826.0,681.0,10777506.0,August,2019,Mary Gerrard,Britanny Bold,Delta
254078,Bashirian-Kassulke,Hane Ltd Pharmaceutical Ltd,Aichach,Germany,48.45,11.1333,Hospital,Private,Abranatal Lysoprosate,Antiseptics,432.0,681.0,294192.0,December,2020,Anne Wu,Britanny Bold,Delta
175417,Bashirian-Kassulke,Doyle-Tillman Pharmaceutical Limited,Zirndorf,Germany,49.45,10.95,Pharmacy,Institution,Acantaine,Antibiotics,50.0,66.0,3300.0,June,2019,Thompson Crawford,James Goodwill,Alfa
246485,Bashirian-Kassulke,"Langworth, Olson and Satterfield Pharmacy",Meschede,Germany,51.3503,8.2836,Hospital,Government,Aciprex,Antipiretics,150.0,421.0,63150.0,November,2020,Thompson Crawford,James Goodwill,Alfa


In [3]:
df[['quantity', 'sales']].head(10)

Unnamed: 0,quantity,sales
207687,2.0,1484.0
187350,15826.0,10777506.0
254078,432.0,294192.0
175417,50.0,3300.0
246485,150.0,63150.0
232401,20.0,8420.0
53000,2500.0,1695000.0
254079,320.0,216960.0
158400,60.0,1440.0
188559,2000.0,48000.0


# Feature Engineering

In [4]:
features = (df.groupby(["distributor",
                    "channel",
                    "sub_channel",
                    "city",
                    "product_name",
                    "product_class",
                    "sales_team",
                    "year",
                    "month",]).agg(
                total_quantity=("quantity", "sum"),
                total_sales=("sales", "sum"),
                avg_price=("price", "mean"),
                    ).reset_index())

features = features.sort_values(by=["distributor", "product_name", "year", "city", "month"])
features.head()

Unnamed: 0,distributor,channel,sub_channel,city,product_name,product_class,sales_team,year,month,total_quantity,total_sales,avg_price
101,Bashirian-Kassulke,Pharmacy,Institution,Leinfelden-Echterdingen,Abatatriptan,Antibiotics,Charlie,2020,February,2.0,1484.0,742.0
160,Bashirian-Kassulke,Pharmacy,Retail,Rheinberg,Abranatal Lysoprosate,Antiseptics,Delta,2019,August,15826.0,10777506.0,681.0
36,Bashirian-Kassulke,Hospital,Private,Aichach,Abranatal Lysoprosate,Antiseptics,Delta,2020,December,432.0,294192.0,681.0
127,Bashirian-Kassulke,Pharmacy,Institution,Zirndorf,Acantaine,Antibiotics,Alfa,2019,June,50.0,3300.0,66.0
23,Bashirian-Kassulke,Hospital,Government,Meschede,Aciprex,Antipiretics,Alfa,2020,November,150.0,63150.0,421.0


In [5]:
# add rolling features
grp = features.groupby(["distributor", "product_name", "city"])
features["rolling_avg_sales_3m"] = grp["total_sales"].transform(
    lambda x: x.rolling(window=3, min_periods=1).mean()
)
features["sales_growth_pct"] = grp["total_sales"].transform(
    lambda x: x.pct_change() * 100
)

In [6]:
# Clean Nan and Inf values
features_cleaned = features.replace([np.inf, -np.inf], np.nan).fillna(0)
features_cleaned.head()

Unnamed: 0,distributor,channel,sub_channel,city,product_name,product_class,sales_team,year,month,total_quantity,total_sales,avg_price,rolling_avg_sales_3m,sales_growth_pct
101,Bashirian-Kassulke,Pharmacy,Institution,Leinfelden-Echterdingen,Abatatriptan,Antibiotics,Charlie,2020,February,2.0,1484.0,742.0,1484.0,0.0
160,Bashirian-Kassulke,Pharmacy,Retail,Rheinberg,Abranatal Lysoprosate,Antiseptics,Delta,2019,August,15826.0,10777506.0,681.0,10777506.0,0.0
36,Bashirian-Kassulke,Hospital,Private,Aichach,Abranatal Lysoprosate,Antiseptics,Delta,2020,December,432.0,294192.0,681.0,294192.0,0.0
127,Bashirian-Kassulke,Pharmacy,Institution,Zirndorf,Acantaine,Antibiotics,Alfa,2019,June,50.0,3300.0,66.0,3300.0,0.0
23,Bashirian-Kassulke,Hospital,Government,Meschede,Aciprex,Antipiretics,Alfa,2020,November,150.0,63150.0,421.0,63150.0,0.0


In [7]:
df[['quantity', 'sales']].head(20)

Unnamed: 0,quantity,sales
207687,2.0,1484.0
187350,15826.0,10777506.0
254078,432.0,294192.0
175417,50.0,3300.0
246485,150.0,63150.0
232401,20.0,8420.0
53000,2500.0,1695000.0
254079,320.0,216960.0
158400,60.0,1440.0
188559,2000.0,48000.0


In [8]:
features_cleaned[['total_quantity', 'total_sales']].head(20)

Unnamed: 0,total_quantity,total_sales
101,2.0,1484.0
160,15826.0,10777506.0
36,432.0,294192.0
127,50.0,3300.0
23,150.0,63150.0
71,20.0,8420.0
97,2500.0,1695000.0
174,320.0,216960.0
72,60.0,1440.0
107,2000.0,48000.0


In [9]:
features_cleaned['sales_growth_pct'].describe()

count    253642.000000
mean        231.156031
std        3759.918022
min     -120100.000000
25%           0.000000
50%           0.000000
75%           0.000000
max      479900.000000
Name: sales_growth_pct, dtype: float64

In [10]:
valid_entity = (
    features_cleaned.groupby(["distributor", "product_name", "city"]).filter(lambda x: x["total_sales"].gt(0).sum() >= 2)
)
valid_entity.reset_index(drop=True, inplace=True)
print(valid_entity.shape)
valid_entity.head()

(93267, 14)


Unnamed: 0,distributor,channel,sub_channel,city,product_name,product_class,sales_team,year,month,total_quantity,total_sales,avg_price,rolling_avg_sales_3m,sales_growth_pct
0,Beier,Pharmacy,Retail,Łuków,Abilovir Aprotasol,Antipiretics,Charlie,2018,June,20.0,5380.0,269.0,5380.0,0.0
1,Beier,Pharmacy,Retail,Łuków,Abilovir Aprotasol,Antipiretics,Alfa,2018,November,30.0,8070.0,269.0,6725.0,50.0
2,Beier,Pharmacy,Retail,Ciechocinek,Acelimus,Analgesics,Charlie,2018,October,1200.0,895200.0,746.0,895200.0,0.0
3,Beier,Pharmacy,Retail,Ciechocinek,Acelimus,Analgesics,Delta,2018,September,50.0,37300.0,746.0,466250.0,-95.833333
4,Beier,Pharmacy,Retail,Siedlce,Acelimus,Analgesics,Alfa,2018,February,30.0,22380.0,746.0,22380.0,0.0


In [11]:
valid_entity

Unnamed: 0,distributor,channel,sub_channel,city,product_name,product_class,sales_team,year,month,total_quantity,total_sales,avg_price,rolling_avg_sales_3m,sales_growth_pct
0,Beier,Pharmacy,Retail,Łuków,Abilovir Aprotasol,Antipiretics,Charlie,2018,June,20.0,5380.0,269.0,5380.0,0.000000
1,Beier,Pharmacy,Retail,Łuków,Abilovir Aprotasol,Antipiretics,Alfa,2018,November,30.0,8070.0,269.0,6725.0,50.000000
2,Beier,Pharmacy,Retail,Ciechocinek,Acelimus,Analgesics,Charlie,2018,October,1200.0,895200.0,746.0,895200.0,0.000000
3,Beier,Pharmacy,Retail,Ciechocinek,Acelimus,Analgesics,Delta,2018,September,50.0,37300.0,746.0,466250.0,-95.833333
4,Beier,Pharmacy,Retail,Siedlce,Acelimus,Analgesics,Alfa,2018,February,30.0,22380.0,746.0,22380.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93262,Welch-Langworth,Pharmacy,Institution,Cottbus,Topipizole,Mood Stabilizers,Bravo,2018,July,3.0,1104.0,368.0,44712.0,-98.750000
93263,Welch-Langworth,Hospital,Government,Potsdam,Xenaminphen,Antimalarial,Alfa,2017,June,10.0,1950.0,195.0,1950.0,0.000000
93264,Welch-Langworth,Hospital,Government,Potsdam,Xenaminphen,Antimalarial,Charlie,2018,September,10.0,1950.0,195.0,1950.0,0.000000
93265,Welch-Langworth,Pharmacy,Retail,Lüneburg,Zyvance,Analgesics,Delta,2018,August,5.0,1615.0,323.0,1615.0,0.000000


In [12]:
# Using IQR for outlier detection
Q1 = features_cleaned['total_sales'].quantile(0.25)
Q3 = features_cleaned['total_sales'].quantile(0.75)
IQR = Q3 - Q1
outlier = features_cleaned[(features_cleaned['total_sales'] < (Q1 - 1.5 * IQR)) | (features_cleaned['total_sales'] > Q3 + 1.5 * IQR)]
print(f"Number of outliers detected: {outlier.shape[0]}")

Number of outliers detected: 36097


In [14]:
# Count minus features_cleaned with outliers

df_minus = features_cleaned.drop(outlier.index)
print(f"Data shape after removing outliers: {df_minus.shape}")

Data shape after removing outliers: (217545, 14)


In [16]:
df_minus.describe()

Unnamed: 0,year,total_quantity,total_sales,avg_price,rolling_avg_sales_3m,sales_growth_pct
count,217545.0,217545.0,217545.0,217545.0,217545.0,217545.0
mean,2018.376768,30.087088,8814.604025,394.388467,11615.06,59.552169
std,1.030964,62.839179,11104.55496,226.203505,44641.48,677.541244
min,2017.0,-900.0,-28160.0,22.0,-489048.0,-23700.0
25%,2018.0,5.0,1410.0,180.0,1595.0,0.0
50%,2018.0,10.0,4120.0,408.0,4780.0,0.0
75%,2019.0,30.0,12120.0,585.0,13324.5,0.0
max,2020.0,2200.0,51576.0,794.0,7472750.0,89900.0
