In [9]:
import pandas as pd
import numpy as np

In [19]:
# Load the bakery and coffee-shop sales CSV files (paths are relative to the
# notebook's working directory).
bakery_sales = pd.read_csv('Bakery_sales.csv')
Coffee_sales = pd.read_csv('Coffee_Shop_sales.csv')

In [26]:
# Preview the first rows of the coffee-shop data; its column layout is the
# target schema the bakery data is reshaped to below.
Coffee_sales.head()

Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,product_id,unit_price,product_category,product_type,product_detail
0,1,2023-01-01,07:06:11,2,5,Lower Manhattan,32,3.0,Coffee,Gourmet brewed coffee,Ethiopia Rg
1,2,2023-01-01,07:08:56,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg
2,3,2023-01-01,07:14:04,2,5,Lower Manhattan,59,4.5,Drinking Chocolate,Hot chocolate,Dark chocolate Lg
3,4,2023-01-01,07:20:24,1,5,Lower Manhattan,22,2.0,Coffee,Drip coffee,Our Old Time Diner Blend Sm
4,5,2023-01-01,07:22:41,2,5,Lower Manhattan,57,3.1,Tea,Brewed Chai tea,Spicy Eye Opener Chai Lg


In [20]:
# Reshape the bakery data to the coffee-shop column layout.
# Every step below is a no-op once it has been applied, so this cell can be
# re-run without failing or discarding values filled in by later cells.

# Clean unit_price: the column is expected to arrive as text, so strip the
# € sign, spaces and quotes, and use "." as the decimal separator before
# converting to float. Skip the conversion when the column is already numeric:
# the .str accessor raises AttributeError on non-string data.
if not pd.api.types.is_numeric_dtype(bakery_sales["unit_price"]):
    bakery_sales["unit_price"] = (
        bakery_sales["unit_price"]
        .str.replace("€", "", regex=False)
        .str.replace(",", ".", regex=False)
        .str.replace(" ", "", regex=False)
        .str.replace('"', "", regex=False)
        .str.strip()
        .astype(float)
    )

# Rename the source columns to the target schema; names that are already
# renamed are simply not found in the mapping and are left alone.
bakery_sales = bakery_sales.rename(
    columns={
        "ticket_number": "transaction_id",
        "date": "transaction_date",
        "time": "transaction_time",
        "Quantity": "transaction_qty",
        "article": "product_detail",
    }
)

# Add the target columns the bakery data lacks. Placeholders are created only
# when absent, so store assignments made afterwards survive a re-run.
for placeholder_column in ("store_id", "store_location", "product_id", "product_type"):
    if placeholder_column not in bakery_sales.columns:
        bakery_sales[placeholder_column] = None
bakery_sales["product_category"] = "Bakery"

# Keep only the target columns, in the same order as the coffee-shop data.
bakery_sales = bakery_sales[
    [
        "transaction_id",
        "transaction_date",
        "transaction_time",
        "transaction_qty",
        "store_id",
        "store_location",
        "product_id",
        "unit_price",
        "product_category",
        "product_type",
        "product_detail",
    ]
]


In [21]:
# A bare last expression uses the notebook's rich table display; print() falls
# back to plain text that wraps wide frames across several column chunks.
bakery_sales.head()

   transaction_id transaction_date transaction_time  transaction_qty store_id  \
0        150040.0       2021-01-02            08:38              1.0     None   
1        150040.0       2021-01-02            08:38              3.0     None   
2        150041.0       2021-01-02            09:14              2.0     None   
3        150041.0       2021-01-02            09:14              1.0     None   
4        150042.0       2021-01-02            09:25              5.0     None   

  store_location product_id  unit_price product_category product_type  \
0           None       None        0.90           Bakery         None   
1           None       None        1.20           Bakery         None   
2           None       None        1.20           Bakery         None   
3           None       None        1.15           Bakery         None   
4           None       None        1.20           Bakery         None   

         product_detail  
0              BAGUETTE  
1      PAIN AU CHOCOLA

In [22]:
# Build the store lookup: one row per distinct (store_id, store_location) pair
# found in the coffee-shop data, keeping the index of its first occurrence.
unique_stores = Coffee_sales.drop_duplicates(subset=['store_id', 'store_location'])[
    ['store_id', 'store_location']
]

# Show the lookup table
print(unique_stores)

     store_id   store_location
0           5  Lower Manhattan
17          8   Hell's Kitchen
105         3          Astoria


In [23]:
# List the distinct (transaction_id, store_id) pairs: a transaction_id that
# appears only once in this table belongs to a single store.
Coffee_sales.loc[:, ['transaction_id', 'store_id']].drop_duplicates()

Unnamed: 0,transaction_id,store_id
0,1,5
1,2,5
2,3,5
3,4,5
4,5,5
...,...,...
149111,149452,8
149112,149453,8
149113,149454,8
149114,149455,8


In [39]:
# Attribute one randomly chosen store to each bakery ticket. The mapping is
# keyed by transaction_id, so every line item of a ticket gets the same store.
unique_tickets = bakery_sales['transaction_id'].unique()

# Fixed seed so the assignment is reproducible from run to run.
np.random.seed(42)
# Draw the stores for all tickets in one vectorized call instead of calling
# np.random.choice once per ticket inside a Python loop.
ticket_store_mapping = dict(
    zip(
        unique_tickets,
        np.random.choice(unique_stores['store_id'].to_numpy(), size=len(unique_tickets)),
    )
)

bakery_sales['store_id'] = bakery_sales['transaction_id'].map(ticket_store_mapping)

# Look up the location name that belongs to each assigned store_id.
bakery_sales['store_location'] = bakery_sales['store_id'].map(unique_stores.set_index('store_id')['store_location'])

In [40]:
# Preview the first rows to inspect the store_id / store_location assignment.
bakery_sales.head()

Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,product_id,unit_price,product_category,product_type,product_detail
0,150040.0,2021-01-02,08:38,1.0,3,Astoria,,0.9,Bakery,,BAGUETTE
1,150040.0,2021-01-02,08:38,3.0,3,Astoria,,1.2,Bakery,,PAIN AU CHOCOLAT
2,150041.0,2021-01-02,09:14,2.0,5,Lower Manhattan,,1.2,Bakery,,PAIN AU CHOCOLAT
3,150041.0,2021-01-02,09:14,1.0,5,Lower Manhattan,,1.15,Bakery,,PAIN
4,150042.0,2021-01-02,09:25,5.0,3,Astoria,,1.2,Bakery,,TRADITIONAL BAGUETTE


In [13]:
# Write the reshaped bakery data to CSV; index=False leaves the row index out
# of the file.
bakery_sales.to_csv("bakery_sales_new.csv", index=False)