# Data Preprocessing

In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn import set_config
import joblib

In [2]:
# Read the raw transaction export and print a schema summary
# (column names, non-null counts, dtypes).
path = '../../data/raw/coffee_shop_sales.csv'
df = pd.read_csv(filepath_or_buffer=path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149116 entries, 0 to 149115
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   transaction_id    149116 non-null  int64  
 1   transaction_date  149116 non-null  object 
 2   transaction_time  149116 non-null  object 
 3   store_location    149116 non-null  object 
 4   product_category  149116 non-null  object 
 5   product_type      149116 non-null  object 
 6   product_detail    149116 non-null  object 
 7   product_size      149116 non-null  object 
 8   unit_price        149116 non-null  float64
 9   quantity          149116 non-null  int64  
 10  total_bill        149116 non-null  float64
dtypes: float64(2), int64(2), object(7)
memory usage: 12.5+ MB


In [3]:
# Data Format
# Parse the day/month/year strings in `transaction_date` into datetimes.
df['transaction_date'] = pd.to_datetime(
    arg=df['transaction_date'],
    format='%d/%m/%Y',
)

In [4]:
# Group Data
# Total the bills per (date, store), then spread the stores into columns so
# each row is a single day.
df_ = (
    df.groupby(['transaction_date','store_location'])['total_bill']
    .sum()
    .reset_index()
    .pipe(
        pd.pivot_table,
        values='total_bill',
        index='transaction_date',
        columns='store_location',
    )
    .reset_index()
)

# Daily total across the three stores.
df_['total_bill'] = (
    df_["Astoria"]
    .add(df_["Hell's Kitchen"])
    .add(df_["Lower Manhattan"])
)
df_

store_location,transaction_date,Astoria,Hell's Kitchen,Lower Manhattan,total_bill
0,2023-01-01,868.40,851.45,788.35,2508.20
1,2023-01-02,925.50,828.80,649.05,2403.35
2,2023-01-03,902.75,906.25,756.00,2565.00
3,2023-01-04,808.25,781.65,630.20,2220.10
4,2023-01-05,903.05,714.90,800.90,2418.85
...,...,...,...,...,...
176,2023-06-26,1975.10,1746.10,2154.70,5875.90
177,2023-06-27,1861.55,1676.70,2437.40,5975.65
178,2023-06-28,1758.10,1445.85,1524.95,4728.90
179,2023-06-29,1852.75,1298.55,1299.45,4450.75


In [5]:
# Rename Columns
# Positional rename: the new labels are applied in order to the existing
# columns (transaction_date, Astoria, Hell's Kitchen, Lower Manhattan,
# total_bill), so the list must match that order and length.
df_.columns = ['trn_date', 'store_ast', 'store_hkt', 'store_lmn', 'total_sales']

In [6]:
# Check the schema of the aggregated frame after the rename.
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   trn_date     181 non-null    datetime64[ns]
 1   store_ast    181 non-null    float64       
 2   store_hkt    181 non-null    float64       
 3   store_lmn    181 non-null    float64       
 4   total_sales  181 non-null    float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 7.2 KB


# Data Pipeline

In [16]:
# File locations used by the pipeline cells below.
# data_raw: input CSV read with pd.read_csv.
data_raw = '../../data/raw/coffee_shop_sales.csv'
# data_clean: destination CSV for the transformed frame.
data_clean = '../../data/clean/coffee_store_sales.csv'
# data_line: joblib file holding the fitted transformation pipeline.
data_line = '../../models/coffee_transform.joblib'

# Columns that ObjectToDate parses into datetimes.
date_cols = ['transaction_date']
# Output column labels that RenameColumns assigns positionally.
rename_cols = ['trn_date', 'store_ast', 'store_hkt', 'store_lmn', 'total_sales']

## Create Pipeline

In [8]:
class ObjectToDate(BaseEstimator, TransformerMixin):
    """Parse string date columns into pandas datetimes.

    Parameters
    ----------
    columns : list
        Labels of the columns to convert. Each column is parsed with the
        day/month/year format ``'%d/%m/%Y'``.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns as datetimes.

        The caller's frame is left untouched. Previously the parsed columns
        were written back into ``X`` itself, so running the pipeline silently
        changed the "raw" frame that later cells reuse.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column listed in ``self.columns``.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            A new frame with the same columns as ``X``.
        """
        # Work on a copy so the column assignment below cannot leak into the
        # caller's DataFrame.
        X = X.copy()
        for col in self.columns:
            X[col] = pd.to_datetime(X[col], format='%d/%m/%Y')
        return X

In [9]:
class GroupData(BaseEstimator, TransformerMixin):
    """Collapse transaction rows into one row of sales per day.

    The output has a ``transaction_date`` column, one column per store
    location holding that store's summed ``total_bill`` for the day, and a
    ``total_bill`` column adding up the "Astoria", "Hell's Kitchen" and
    "Lower Manhattan" columns.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    def fit(self, X, y=None):
        """Stateless transformer: fitting is a no-op that returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Aggregate ``X`` by date and store and pivot the stores into columns.

        ``X`` must provide ``transaction_date``, ``store_location`` and
        ``total_bill`` columns, and the three store names listed in the class
        docstring must all occur in ``store_location``.
        """
        group_keys = ['transaction_date','store_location']

        # Long form: one row per (date, store) with the summed bill.
        per_store = X.groupby(group_keys)['total_bill'].sum().reset_index()

        # Wide form: one row per date, one column per store.
        per_day = pd.pivot_table(
            per_store,
            values='total_bill',
            index='transaction_date',
            columns='store_location',
        ).reset_index()

        # Daily total across the three known stores.
        per_day['total_bill'] = (
            per_day["Astoria"]
            .add(per_day["Hell's Kitchen"])
            .add(per_day["Lower Manhattan"])
        )
        return per_day

In [10]:
class RenameColumns(BaseEstimator, TransformerMixin):
    """Replace all column labels of a frame with a fixed list of names.

    Parameters
    ----------
    columns : list
        New labels, applied positionally. The list must have exactly one
        entry per column of the frame being transformed.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a relabelled copy of ``X``.

        The caller's frame keeps its original labels. Previously ``X.columns``
        was reassigned on the input object itself, so using this transformer
        outside the pipeline renamed the caller's DataFrame as a side effect.

        Raises
        ------
        ValueError
            Raised by pandas when ``len(self.columns)`` differs from the
            number of columns in ``X``.
        """
        # Relabel a copy so the input frame is not modified.
        X = X.copy()
        X.columns = self.columns
        return X

In [11]:
# Chain the three transformers: parse dates -> aggregate per day -> rename.
data_transform = Pipeline(
    steps=[
        ('date_format', ObjectToDate(columns=date_cols)),
        ('group_data', GroupData()),
        ('rename_cols', RenameColumns(columns=rename_cols)),
    ]
)

In [12]:
# Load the raw CSV, fit the pipeline on it, then apply the fitted pipeline
# to the same frame to obtain the daily sales table.
df_raw = pd.read_csv(filepath_or_buffer=data_raw)

data_transform_fit = data_transform.fit(X=df_raw)
df_sales = data_transform_fit.transform(X=df_raw)

In [13]:
# Check the schema of the pipeline output.
df_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   trn_date     181 non-null    datetime64[ns]
 1   store_ast    181 non-null    float64       
 2   store_hkt    181 non-null    float64       
 3   store_lmn    181 non-null    float64       
 4   total_sales  181 non-null    float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 7.2 KB


## Save and Test

In [14]:
# Persist the fitted pipeline, then load it back to confirm the saved
# artifact is usable.
joblib.dump(data_transform_fit, data_line)

# NOTE: joblib files are pickle-based; only load artifacts from a trusted
# source.
try:
    data_pipeline = joblib.load(data_line)

except Exception as e:
    # Report the failure, then re-raise. Swallowing it would let the notebook
    # continue with `data_pipeline` undefined, or left over from an earlier
    # run, which hides the real problem from the cells that follow.
    print("Error:", str(e))
    raise

else:
    print('Data pipeline is loaded...')

Data pipeline is loaded...


In [18]:
# Apply the reloaded pipeline to the raw frame and write the result to disk
# without the index column.
df_clean = data_pipeline.transform(df_raw)
df_clean.to_csv(path_or_buf=data_clean, index=False)

# Preview the first two and the last two rows.
df_clean.iloc[[0, 1, -2, -1], :]

Unnamed: 0,trn_date,store_ast,store_hkt,store_lmn,total_sales
0,2023-01-01,868.4,851.45,788.35,2508.2
1,2023-01-02,925.5,828.8,649.05,2403.35
179,2023-06-29,1852.75,1298.55,1299.45,4450.75
180,2023-06-30,1807.65,1904.93,1768.74,5481.32
