In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from feature_engine.encoding import OrdinalEncoder

In [3]:
# Load the v1 sales table into `data`.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
data = pd.read_parquet('d:/demand-forecast-SQGroup/data/sales_bya_v1.parquet')

In [4]:
# Preview the first five rows, transposed so each column of the wide frame is one output line.
data.head().T

Unnamed: 0,0,1,2,3,4
cid,10001,10001,10001,10001,10001
item_type_id,186,186,186,186,186
category,Domestic,Domestic,Domestic,Domestic,Domestic
date,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00
outlet_id,1003,1003,1003,1003,1003
wire,1,1,1,1,1
rm,1.0,1.0,1.0,1.0,1.0
fy,17,17,17,17,17
base_size,1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w)
client_id,3DHEHE9HHE9H5MAK,3DHEHE9HHE9H9H1L,3DHEHE9HHEHE3DHE,3DHEHE9HHEHE4N7O,3DHEHE9HHEHEHEAK


In [5]:
# Preview the last five rows, transposed the same way as the head preview.
data.tail().T

Unnamed: 0,353591,353592,353593,353594,353595
cid,10149,10149,10149,10149,10149
item_type_id,180,180,180,180,180
category,Domestic,Domestic,Domestic,Domestic,Domestic
date,2020-07-09 00:00:00,2022-08-11 00:00:00,2019-09-10 00:00:00,2021-09-11 00:00:00,2022-09-11 00:00:00
outlet_id,1015,1015,1015,1015,1015
wire,0,0,0,0,0
rm,4.0,4.0,4.0,4.0,4.0
fy,18,20,17,19,20
base_size,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm
client_id,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P


In [6]:
# Structural summary: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353596 entries, 0 to 353595
Data columns (total 35 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   cid                 353596 non-null  int32         
 1   item_type_id        353596 non-null  int32         
 2   category            353596 non-null  object        
 3   date                353596 non-null  datetime64[ns]
 4   outlet_id           353596 non-null  int32         
 5   wire                353596 non-null  int32         
 6   rm                  353596 non-null  float32       
 7   fy                  353596 non-null  int32         
 8   base_size           353596 non-null  object        
 9   client_id           353596 non-null  object        
 10  qtym                353596 non-null  float32       
 11  net_price           353596 non-null  float32       
 12  grade               353596 non-null  object        
 13  uses                353596 no

In [7]:
# Count missing values per column (every column reports 0 in the output below).
data.isnull().sum()

cid                   0
item_type_id          0
category              0
date                  0
outlet_id             0
wire                  0
rm                    0
fy                    0
base_size             0
client_id             0
qtym                  0
net_price             0
grade                 0
uses                  0
application_group     0
noc                   0
dfc                   0
area_km2              0
population            0
literacy_rate_perc    0
pcx                   0
excnts                0
exach                 0
trc                   0
tlcolt                0
tmtm                  0
ecoind                0
division              0
sf                    0
sop                   0
pminx                 0
tms_cr                0
mas                   0
kpi                   0
mkt                   0
dtype: int64

In [8]:
# Working copy for the exploration cells below, so `data` stays untouched
# until encode(data) is called later in the notebook.
df = data.copy()

<center><h1>Categorical Encoding</h1></center>

In [9]:
# Fixed label -> integer code tables, built once rather than on every row.
_CATEGORY_CODES = {'Domestic': 1, 'Power': 0}

_GRADE_CODES = {'Grade1': 1, 'Grade2': 2, 'Grade3': 3, 'Grade4': 4}

_USES_CODES = {
    'House Wiring': 1,
    'Fan & Lighting Connection': 2,
    'Air Condition & Washing Machine, Heavy Item': 3,
    'Lift & Heavy Item': 4,
    'Earthing': 5,
    'Industry, Machineries': 6,
}

_APPLICATION_GROUP_CODES = {'General': 1, 'Moderate': 2, 'Rich': 3, 'Industry': 4}

# Poor < Low < Medium < High. 'Medium' used to share code 2 with 'Low',
# which merged two distinct levels and left code 3 unused.
_ECOIND_CODES = {'Poor': 1, 'Low': 2, 'Medium': 3, 'High': 4}

_DIVISION_CODES = {
    'Dhaka': 1,
    'Chittagong': 2,
    'Khulna': 3,
    'Rajshahi': 4,
    'Mymensingh': 5,
    'Sylhet': 6,
    'Rangpur': 7,
    'Barishal': 8,
}

_MKT_CODES = {'Urban': 4, 'Semi Urban': 3, 'Rural': 2, 'Others': 1}


def _apply_codes(labels: pd.Series, codes: dict) -> pd.Series:
    """Translate labels to int32 codes; raise KeyError if any label is not in `codes`."""
    mapped = labels.map(codes)
    unmapped = mapped.isna()
    if unmapped.any():
        # Keep the fail-fast behaviour of a plain dict lookup instead of
        # letting unknown (or missing) labels silently become NaN.
        unknown = labels[unmapped].unique().tolist()
        raise KeyError(f"unmapped labels in column {labels.name!r}: {unknown}")
    return mapped.astype('int32')


def encode(df: pd.DataFrame) -> None:
    """Replace the categorical columns of `df` with integer codes, in place.

    Columns `category`, `grade`, `uses`, `application_group`, `ecoind`,
    `division` and `mkt` are mapped through fixed label -> code tables
    (`mkt` is stripped of surrounding spaces first). `base_size` is encoded
    with a feature_engine `OrdinalEncoder` fitted on this same frame with
    `net_price` as the target. Finally every int64 column is downcast to int32.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the columns listed above plus `net_price`. It is
        modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a mapped column contains a label that is absent from its table.
        This also happens when the function is run twice on the same frame,
        because the labels have already been replaced by integers.
    """
    df['category'] = _apply_codes(df['category'], _CATEGORY_CODES)
    df['grade'] = _apply_codes(df['grade'], _GRADE_CODES)
    df['uses'] = _apply_codes(df['uses'], _USES_CODES)
    df['application_group'] = _apply_codes(
        df['application_group'], _APPLICATION_GROUP_CODES
    )
    df['ecoind'] = _apply_codes(df['ecoind'], _ECOIND_CODES)
    df['division'] = _apply_codes(df['division'], _DIVISION_CODES)
    # Some raw `mkt` labels carry surrounding spaces; strip them before the lookup.
    df['mkt'] = _apply_codes(df['mkt'].str.strip(' '), _MKT_CODES)

    ### base_size ordinal encoding
    # With encoding_method='ordered', feature_engine numbers the labels by the
    # ascending mean of the target (net_price here) per label.
    # NOTE(review): raw labels that differ only in spacing or letter case
    # (e.g. '1x 2.5 re (1-w)' vs '1 x 2.5 re (1-w)') are encoded as separate
    # categories; consider normalising them upstream.
    encoder = OrdinalEncoder(
        encoding_method='ordered',
        variables=None, missing_values='raise',
        ignore_format=False, unseen='ignore'
    )
    encoder.fit(df[['base_size']], y=df['net_price'])
    df['base_size'] = encoder.transform(df[['base_size']])['base_size']

    ### optimize for memory
    for col in df.select_dtypes('int64').columns:
        df[col] = df[col].astype('int32')

In [10]:
# Exploration: pull the object-dtype columns of the working copy into `cat`
# so the labels of each one can be inspected before they are encoded.
cat = df.select_dtypes('object')

In [11]:
# Label frequencies of `category` before encoding.
cat.category.value_counts()

category
Domestic    351161
Power         2435
Name: count, dtype: int64

In [12]:
# Show the raw `category` column (still string labels at this point).
cat.category

0         Domestic
1         Domestic
2         Domestic
3         Domestic
4         Domestic
            ...   
353591    Domestic
353592    Domestic
353593    Domestic
353594    Domestic
353595    Domestic
Name: category, Length: 353596, dtype: object

In [13]:
# Trial run of the binary category encoding on the exploration frame:
# 'Domestic' -> 1, 'Power' -> 0. Any other label raises KeyError.
cat['category'] = cat['category'].apply(
    lambda label: {'Domestic': 1, 'Power': 0}[label]
)

In [14]:
# Check the encoded counts: they should match the label counts shown before encoding.
cat.category.value_counts()

category
1    351161
0      2435
Name: count, dtype: int64

In [15]:
# Preview only the first and last ten rows, transposed, rather than
# transposing and dumping the entire frame.
pd.concat([cat.head(10), cat.tail(10)]).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,353586,353587,353588,353589,353590,353591,353592,353593,353594,353595
category,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
base_size,1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),1x 1.0 re (1-w),...,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm,1 x 4.0 rm
client_id,3DHEHE9HHE9H5MAK,3DHEHE9HHE9H9H1L,3DHEHE9HHEHE3DHE,3DHEHE9HHEHE4N7O,3DHEHE9HHEHEHEAK,3DHEHE1LHE9H5M1L,3DHEHE5MHE3D9H2P,3DHEHE5MHESOHE4N,3DHEHE4NHE3D2P1L,3DHEHE7OHE9HSO3D,...,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P,3DHE3D5MHE3D1L2P
grade,Grade1,Grade1,Grade1,Grade1,Grade1,Grade1,Grade1,Grade1,Grade1,Grade1,...,Grade2,Grade2,Grade2,Grade2,Grade2,Grade2,Grade2,Grade2,Grade2,Grade2
uses,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,Fan & Lighting Connection,...,"Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item","Air Condition & Washing Machine, Heavy Item"
application_group,General,General,General,General,General,General,General,General,General,General,...,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate,Moderate
ecoind,High,High,High,High,High,High,High,High,Medium,High,...,High,High,High,High,High,High,High,High,High,High
division,Chittagong,Chittagong,Chittagong,Chittagong,Chittagong,Chittagong,Dhaka,Dhaka,Khulna,Sylhet,...,Khulna,Khulna,Khulna,Khulna,Khulna,Khulna,Khulna,Khulna,Khulna,Khulna
mkt,Urban,Rural,Urban,Urban,Urban,Others,Urban,Urban,Rural,Semi Urban,...,Others,Others,Others,Others,Others,Others,Others,Others,Others,Others


In [16]:
# Label frequencies of `grade` as a plain dict (label -> row count).
cat.grade.value_counts().to_dict()

{'Grade1': 290132, 'Grade2': 54256, 'Grade3': 8036, 'Grade4': 1172}

In [17]:
# Label frequencies of `uses` as a plain dict (label -> row count).
cat.uses.value_counts().to_dict()

{'House Wiring': 158646,
 'Fan & Lighting Connection': 125925,
 'Air Condition & Washing Machine, Heavy Item': 54256,
 'Lift & Heavy Item': 6773,
 'Earthing': 5561,
 'Industry, Machineries': 2435}

In [18]:
# Label frequencies of `application_group` as a plain dict (label -> row count).
cat.application_group.value_counts().to_dict()

{'General': 290132, 'Moderate': 54256, 'Rich': 8036, 'Industry': 1172}

In [19]:
# Label frequencies of `ecoind` as a plain dict (label -> row count).
cat.ecoind.value_counts().to_dict()

{'Medium': 161523, 'High': 144250, 'Low': 43998, 'Poor': 3825}

In [20]:
# Label frequencies of `division` as a plain dict (label -> row count).
cat.division.value_counts().to_dict()

{'Dhaka': 83777,
 'Chittagong': 82999,
 'Khulna': 47060,
 'Rajshahi': 39035,
 'Mymensingh': 35859,
 'Sylhet': 33205,
 'Rangpur': 24860,
 'Barishal': 6801}

In [21]:
# Label frequencies of `mkt` after stripping surrounding spaces, so padded
# variants of the same label are counted together.
cat.mkt.str.strip(' ').value_counts().to_dict()

{'Urban': 218061, 'Semi Urban': 55521, 'Rural': 40328, 'Others': 39686}

In [23]:
# Label frequencies of `base_size`. The output contains near-duplicate
# spellings ('1x' vs '1 x', '(1-W)' vs '(1-w)', 'Rm' vs 'rm') that are
# counted as separate labels.
cat.base_size.value_counts()

base_size
1x 2.0 rm (3-w)      59601
1x 1.3 rm (3-w)      58566
1x 1.0 rm (3-w)      45759
1x 2.5 rm (7-w)      27389
1x 3 rm (7-w)        27293
1x 1.5 rm (7-w)      26464
1x 1.0 re (1-w)      21538
1x 4 rm (7-w)        19709
1x 4.5 rm (7-w)      16962
1x 1.5 re (1-w)      13472
1x 7 rm (7-w)        10653
1x 6 rm (7-w)         6833
1x 10 rm (7-w)        6773
1x 0.75 rm (3-w)      5444
1x 2.5 re (1-W)       2542
1x 1.5 rm (3-w)       1705
1x 14.5 rm (7-w)      1216
1 x 16 rm (19-w)       708
1 x 16 rm (7-w)        461
1 x 0.75 re (1-w)      117
1 x 2.5 rm (7-w)        48
1 x 14.5 rm (7-w)       47
1x 2.5 re (1-w)         46
1 x 2.0 Rm (3-w)        44
1 x 1.0 re (1-w)        44
1 x 6 rm (7-w)          35
1x 4.0 re (1-w)         32
1 x 4.0 rm              32
1 x 1.5 rm (7-w)        26
1 x 1.0 rm              18
1 x 2.5 re (1-w)        16
1x 16 rm (19-w)          3
Name: count, dtype: int64

In [24]:
# Show the raw `base_size` column of the working copy (still string labels).
df.base_size

0         1x 1.0 re (1-w)
1         1x 1.0 re (1-w)
2         1x 1.0 re (1-w)
3         1x 1.0 re (1-w)
4         1x 1.0 re (1-w)
               ...       
353591         1 x 4.0 rm
353592         1 x 4.0 rm
353593         1 x 4.0 rm
353594         1 x 4.0 rm
353595         1 x 4.0 rm
Name: base_size, Length: 353596, dtype: object

In [25]:
# Stand-alone encoder, configured with the same arguments as the one inside
# encode(), used below to inspect the codes it assigns to `base_size`.
encoder = OrdinalEncoder(
    encoding_method='ordered',
    variables=None,
    missing_values='raise',
    ignore_format=False,
    unseen='ignore',
)

In [26]:
# Fit on the working copy's `base_size`, passing `net_price` as the target
# that the 'ordered' method uses to rank the labels.
encoder.fit(df[['base_size']], y=df.net_price)

In [27]:
# Preview the encoded `base_size` column; the result is displayed, not stored.
encoder.transform(df[['base_size']])

Unnamed: 0,base_size
0,6
1,6
2,6
3,6
4,6
...,...
353591,22
353592,22
353593,22
353594,22


In [28]:
# Inspect the learned label -> code mapping for `base_size`.
encoder.encoder_dict_

{'base_size': {'1 x 1.0 re (1-w)': 0,
  '1x 2.5 re (1-w)': 1,
  '1 x 2.5 re (1-w)': 2,
  '1 x 2.0 Rm (3-w)': 3,
  '1 x 0.75 re (1-w)': 4,
  '1x 4.0 re (1-w)': 5,
  '1x 1.0 re (1-w)': 6,
  '1x 1.5 rm (3-w)': 7,
  '1x 1.5 re (1-w)': 8,
  '1x 0.75 rm (3-w)': 9,
  '1 x 6 rm (7-w)': 10,
  '1x 2.5 re (1-W)': 11,
  '1x 16 rm (19-w)': 12,
  '1 x 1.0 rm': 13,
  '1x 1.0 rm (3-w)': 14,
  '1x 3 rm (7-w)': 15,
  '1x 4 rm (7-w)': 16,
  '1 x 14.5 rm (7-w)': 17,
  '1x 1.5 rm (7-w)': 18,
  '1x 4.5 rm (7-w)': 19,
  '1x 6 rm (7-w)': 20,
  '1 x 1.5 rm (7-w)': 21,
  '1 x 4.0 rm': 22,
  '1x 1.3 rm (3-w)': 23,
  '1x 2.0 rm (3-w)': 24,
  '1x 2.5 rm (7-w)': 25,
  '1x 7 rm (7-w)': 26,
  '1x 14.5 rm (7-w)': 27,
  '1 x 16 rm (19-w)': 28,
  '1x 10 rm (7-w)': 29,
  '1 x 2.5 rm (7-w)': 30,
  '1 x 16 rm (7-w)': 31}}

In [29]:
# Encode the loaded frame in place (encode returns None).
# Not idempotent: running this cell a second time raises KeyError, because
# the string labels have already been replaced by integers.
encode(data)

In [30]:
# Check the first rows after encoding: `category` and `base_size` now hold integers.
data.head().T

Unnamed: 0,0,1,2,3,4
cid,10001,10001,10001,10001,10001
item_type_id,186,186,186,186,186
category,1,1,1,1,1
date,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00
outlet_id,1003,1003,1003,1003,1003
wire,1,1,1,1,1
rm,1.0,1.0,1.0,1.0,1.0
fy,17,17,17,17,17
base_size,6,6,6,6,6
client_id,3DHEHE9HHE9H5MAK,3DHEHE9HHE9H9H1L,3DHEHE9HHEHE3DHE,3DHEHE9HHEHE4N7O,3DHEHE9HHEHEHEAK


In [31]:
# Confirm the dtypes after encoding (the encoded columns show as int32 in the output).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353596 entries, 0 to 353595
Data columns (total 35 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   cid                 353596 non-null  int32         
 1   item_type_id        353596 non-null  int32         
 2   category            353596 non-null  int32         
 3   date                353596 non-null  datetime64[ns]
 4   outlet_id           353596 non-null  int32         
 5   wire                353596 non-null  int32         
 6   rm                  353596 non-null  float32       
 7   fy                  353596 non-null  int32         
 8   base_size           353596 non-null  int32         
 9   client_id           353596 non-null  object        
 10  qtym                353596 non-null  float32       
 11  net_price           353596 non-null  float32       
 12  grade               353596 non-null  int32         
 13  uses                353596 no

In [32]:
# Save the encoded frame as the v2 dataset, without writing the index.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
data.to_parquet('d:/demand-forecast-SQGroup/data/sales_bya_v2.parquet', index=False)