In [6]:
import numpy as np
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv
import pandas as pd
from pandas.api.types import is_integer_dtype, is_float_dtype

# Populate the process environment via python-dotenv before reading BASE_PATH.
load_dotenv()

# Every relative path used later (e.g. 'data/raw/...') is resolved against
# BASE_PATH. os.getenv returns None when the variable is missing, and
# os.chdir(None) fails with an opaque TypeError, so validate it explicitly
# and fail with an actionable message instead.
base_path = os.getenv('BASE_PATH')
if not base_path:
    raise RuntimeError(
        "BASE_PATH is not set; define it in the environment or in a .env file "
        "so that relative data paths can be resolved."
    )
os.chdir(base_path)


In [7]:
# Read the raw inventory CSV and discard the 'Date', 'Store ID' and
# 'Product ID' columns in one expression, so re-running this cell always
# rebuilds df from the file.
df = (
    pd.read_csv('data/raw/retail_store_inventory.csv')
    .drop(columns=['Date', 'Store ID', 'Product ID'])
)

In [8]:
# Replace the verbose source headers with the shorter names used from here on.
df = df.rename(
    columns={
        'Inventory Level': 'Inventory',
        'Units Sold': 'Sales',
        'Units Ordered': 'Orders',
        'Demand Forecast': 'Demand',
        'Weather Condition': 'Weather',
        'Holiday/Promotion': 'Promotion',
        'Competitor Pricing': 'Competitor Price',
    }
)

In [9]:
# Baseline view of column dtypes, non-null counts and memory use before any downcasting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Category          73100 non-null  object 
 1   Region            73100 non-null  object 
 2   Inventory         73100 non-null  int64  
 3   Sales             73100 non-null  int64  
 4   Orders            73100 non-null  int64  
 5   Demand            73100 non-null  float64
 6   Price             73100 non-null  float64
 7   Discount          73100 non-null  int64  
 8   Weather           73100 non-null  object 
 9   Promotion         73100 non-null  int64  
 10  Competitor Price  73100 non-null  float64
 11  Seasonality       73100 non-null  object 
dtypes: float64(3), int64(5), object(4)
memory usage: 6.7+ MB


In [None]:
def downcast_ints(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with every integer column downcast.

    Columns whose minimum value is non-negative are converted with
    ``downcast="unsigned"``; all other integer columns are converted with
    ``downcast="integer"``. Non-integer columns are left unchanged, and the
    input frame itself is not modified (the work is done on ``df.copy()``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose integer columns should be shrunk.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns and values.
    """
    df = df.copy()
    int_cols = [c for c in df.columns if is_integer_dtype(df[c])]
    for c in int_cols:
        s = df[c]
        lowest = s.min()
        # min() yields NaN for an empty column and pd.NA for an all-missing
        # nullable-integer column. `pd.NA >= 0` is itself pd.NA, which raises
        # TypeError when used in a boolean context, so rule out a missing
        # minimum before comparing.
        non_negative = bool(pd.notna(lowest) and lowest >= 0)
        if non_negative:
            df[c] = pd.to_numeric(s, downcast="unsigned")  # -> uint8/16/32/64
        else:
            df[c] = pd.to_numeric(s, downcast="integer")   # -> int8/16/32/64
    return df

def optimize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a numerically downcast copy of ``df`` and print the memory saved.

    Integer columns are shrunk via ``downcast_ints`` and float columns via
    ``pd.to_numeric(..., downcast="float")``. Columns of any other dtype are
    left unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to optimize.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns and downcast numeric dtypes.

    Notes
    -----
    The printed figures use ``memory_usage(deep=True)``, so they include the
    contents of object columns, unlike the shallow "memory usage" line that
    ``DataFrame.info()`` prints by default.
    """
    start = df.memory_usage(deep=True).sum()

    # 1) Downcast integers. downcast_ints works on its own df.copy(), so the
    #    result is already independent of the caller's frame; copying here as
    #    well would only duplicate the data a second time.
    out = downcast_ints(df)

    # 2) Downcast floats
    float_cols = [c for c in out.columns if is_float_dtype(out[c])]
    if float_cols:  # nothing to assign when the frame has no float columns
        out[float_cols] = out[float_cols].apply(pd.to_numeric, downcast="float")  # -> float32

    end = out.memory_usage(deep=True).sum()
    print(f"Memory: {start/1024**2:.2f} MB → {end/1024**2:.2f} MB "
          f"({(1 - end/start)*100:.1f}% reduction)")
    return out

# Build the downcast copy (optimize_df works on a copy, leaving df as-is)
# and inspect the resulting dtypes and memory footprint.
df_opt = optimize_df(df)
df_opt.info()


Memory: 22.03 MB → 18.89 MB (14.2% reduction)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Category          73100 non-null  object 
 1   Region            73100 non-null  object 
 2   Inventory         73100 non-null  uint16 
 3   Sales             73100 non-null  uint16 
 4   Orders            73100 non-null  uint8  
 5   Demand            73100 non-null  float32
 6   Price             73100 non-null  float32
 7   Discount          73100 non-null  uint8  
 8   Weather           73100 non-null  object 
 9   Promotion         73100 non-null  uint8  
 10  Competitor Price  73100 non-null  float32
 11  Seasonality       73100 non-null  object 
dtypes: float32(3), object(4), uint16(2), uint8(3)
memory usage: 3.6+ MB


In [None]:
# Re-inspect the original frame after optimize_df has been run.
# NOTE(review): optimize_df and downcast_ints both work on df.copy(), so on a
# fresh "Restart & Run All" df should still report float64 columns here. The
# saved output below shows float32, which suggests it is stale (left over from
# an earlier version of the code or out-of-order execution) — re-run to confirm.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73100 entries, 0 to 73099
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Category          73100 non-null  object 
 1   Region            73100 non-null  object 
 2   Inventory         73100 non-null  int64  
 3   Sales             73100 non-null  int64  
 4   Orders            73100 non-null  int64  
 5   Demand            73100 non-null  float32
 6   Price             73100 non-null  float32
 7   Discount          73100 non-null  int64  
 8   Weather           73100 non-null  object 
 9   Promotion         73100 non-null  int64  
 10  Competitor Price  73100 non-null  float32
 11  Seasonality       73100 non-null  object 
dtypes: float32(3), int64(5), object(4)
memory usage: 5.9+ MB
