In [1]:
import pandas as pd
import os

In [2]:
# Locate the inventory CSV under ./data of the current working directory,
# load it, and preview the first rows.
csv_path = os.path.join(os.getcwd(), "data", "retail_store_inventory.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Date,Store ID,Product ID,Category,Region,Inventory Level,Units Sold,Units Ordered,Demand Forecast,Price,Discount,Weather Condition,Holiday/Promotion,Competitor Pricing,Seasonality
0,2022-01-01,S001,P0001,Groceries,North,231,127,55,135.47,33.5,20,Rainy,0,29.69,Autumn
1,2022-01-01,S001,P0002,Toys,South,204,150,66,144.04,63.01,20,Sunny,0,66.16,Autumn
2,2022-01-01,S001,P0003,Toys,West,102,65,51,74.02,27.99,10,Sunny,1,31.32,Summer
3,2022-01-01,S001,P0004,Toys,North,469,61,164,62.18,32.72,10,Cloudy,1,34.74,Autumn
4,2022-01-01,S001,P0005,Electronics,East,166,14,135,9.26,73.64,0,Sunny,0,68.95,Summer


In [28]:
# Columns kept by each of the two derived tables (names after renaming).
df1_columns = ["s_id", "p_id", "category", "region", "seasonality"]
df2_columns = ["s_id", "p_id", "amount", "units_sold", "units_ordered", "price", "discount"]

# Source CSV header -> short snake_case column name.
columns_dict = {
  "Store ID": "s_id",
  "Product ID": "p_id",
  "Category": "category",
  "Region": "region",
  "Seasonality": "seasonality",
  "Inventory Level": "amount",
  "Units Sold": "units_sold",
  "Units Ordered": "units_ordered",
  "Price": "price",
  "Discount": "discount",
}

# Source columns that are removed before renaming.
drop_column = [
  "Date",
  "Demand Forecast",
  "Weather Condition",
  "Holiday/Promotion",
  "Competitor Pricing",
]

# Drop the unused columns, then apply the short names in one chained step.
df_dropped = df.drop(columns=drop_column).rename(columns=columns_dict)

def filter_columns (df: pd.DataFrame, preserved_cols: list) -> pd.DataFrame:
  """
  Return a new frame containing only the columns named in ``preserved_cols``.

  The kept columns stay in the order they have in ``df`` (not the order of
  ``preserved_cols``). ``df`` itself is not modified.

  Parameters
  ----------
  df : frame to filter.
  preserved_cols : column names to keep; repeated names are tolerated.

  Returns
  -------
  A new DataFrame with every column not listed in ``preserved_cols`` dropped.

  Raises
  ------
  ValueError
    If any name in ``preserved_cols`` is not a column of ``df``.
  """
  # Validate up front so a typo yields a readable message instead of the
  # bare "list.remove(x): x not in list" error.
  missing = [col for col in preserved_cols if col not in df.columns]
  if missing:
    raise ValueError(f"Columns not found in DataFrame: {missing}")

  keep = set(preserved_cols)
  to_drop = [col for col in df.columns if col not in keep]
  # drop() returns a fresh frame, so callers can add columns to the result
  # without touching ``df``.
  return df.drop(columns=to_drop)

# Split the cleaned frame into the two table-specific frames:
# df1 keeps df1_columns, df2 keeps df2_columns.
df1, df2 = (
  filter_columns(df_dropped, wanted_cols)
  for wanted_cols in (df1_columns, df2_columns)
)

### Table 1 
Inventory Details

In [29]:
# Preview of the inventory-details frame (df1).
# Intended table name: inventory_details
df1.head()


Unnamed: 0,s_id,p_id,category,region,seasonality
0,S001,P0001,Groceries,North,Autumn
1,S001,P0002,Toys,South,Autumn
2,S001,P0003,Toys,West,Summer
3,S001,P0004,Toys,North,Autumn
4,S001,P0005,Electronics,East,Summer


In [30]:
# Add four empty placeholder columns to df1, all initialised to None.
# NOTE(review): every column name ends with a trailing space ('vt_start ');
# kept as-is because other cells may index by these exact names -- confirm
# whether the space is intentional.
# Presumably vt_* / tt_* are valid-time / transaction-time bounds -- TODO confirm.
for temporal_col in ('vt_start ', 'vt_end ', 'tt_start ', 'tt_end '):
  df1[temporal_col] = None
df1.head()

Unnamed: 0,s_id,p_id,category,region,seasonality,vt_start,vt_end,tt_start,tt_end
0,S001,P0001,Groceries,North,Autumn,,,,
1,S001,P0002,Toys,South,Autumn,,,,
2,S001,P0003,Toys,West,Summer,,,,
3,S001,P0004,Toys,North,Autumn,,,,
4,S001,P0005,Electronics,East,Summer,,,,


In [56]:
from random import randrange
from datetime import timedelta

def random_date(start, end):
    """
    Return a random datetime in the half-open interval [start, end).

    The offset from ``start`` is a whole number of seconds; any sub-second
    part of ``end - start`` is ignored.

    Parameters
    ----------
    start, end : datetime objects bounding the interval.

    Returns
    -------
    ``start`` plus a random whole-second offset smaller than ``end - start``.
    If the interval is shorter than one second, ``start`` is returned.

    Raises
    ------
    ValueError
        If ``end`` is earlier than ``start``.
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    if int_delta < 0:
        raise ValueError("end must not be earlier than start")
    if int_delta == 0:
        # randrange(0) would raise on an empty range; the only valid
        # whole-second offset here is zero.
        return start
    random_second = randrange(int_delta)
    return start + timedelta(seconds=random_second)

In [55]:
import random

def populate_df_details(df : pd.DataFrame, num_duplicate: int = 3, max_rows: int = 20):
  """
  Generate randomly altered variants of the leading rows of ``df``.

  For each of the first ``max_rows`` rows, between 0 and ``num_duplicate``
  variants are produced. Each variant changes one randomly chosen column to a
  different value drawn from that column's unique values in ``df``. Changes
  accumulate: every variant of a row builds on the previous variant.

  The column to change is picked by position; the first two and the last four
  columns are never selected, so ``df`` needs at least seven columns.
  NOTE(review): this presumably skips the id columns and the vt_/tt_
  placeholder columns -- confirm against the caller's column layout.

  The original row, each change, and each resulting variant are printed.

  Parameters
  ----------
  df : source frame; it is not modified.
  num_duplicate : inclusive upper bound on the number of variants per row.
  max_rows : number of leading rows to process (None processes every row).

  Returns
  -------
  The input ``df`` itself, unchanged.
  TODO: the generated variants are collected locally but are not yet merged
  into the returned frame.
  """
  duplicated_data = []
  cols = df.columns.tolist()
  # Unique values per column do not change during the run; compute them once.
  unique_by_col = {}

  for _, row in df[:max_rows].iterrows():
    num_duplicate_data = random.randint(0, num_duplicate)
    row_dict = row.to_dict()
    print(row_dict)

    for _ in range(num_duplicate_data):
      # Positional pick that excludes the first 2 and the last 4 columns.
      cols_to_change = random.randint(2, len(row_dict)-4-1)
      col_name = cols[cols_to_change]
      if col_name not in unique_by_col:
        unique_by_col[col_name] = df[col_name].unique().tolist()

      # Only values that differ from the current one are candidates.
      alternatives = [val for val in unique_by_col[col_name] if val != row_dict[col_name]]
      if not alternatives:
        # The column has no other value; retrying would loop forever.
        continue
      choice = random.choice(alternatives)

      print(f"[{col_name}] : {row_dict[col_name]} => {choice}")

      # Change Value
      row_dict[col_name] = choice
      print(row_dict)
      # Store a snapshot; appending row_dict itself would make every stored
      # variant alias the same, later-mutated dict.
      duplicated_data.append(dict(row_dict))

  return df


# Run the variant generator on the inventory-details frame; it prints each
# processed row and every change it makes.
df1_populated = populate_df_details(df1)
# Preview left disabled:
# df1_populated.head()

{'s_id': 'S001', 'p_id': 'P0001', 'category': 'Groceries', 'region': 'North', 'seasonality': 'Autumn', 'vt_start ': None, 'vt_end ': None, 'tt_start ': None, 'tt_end ': None}
[seasonality] : Autumn => Summer
{'s_id': 'S001', 'p_id': 'P0001', 'category': 'Groceries', 'region': 'North', 'seasonality': 'Summer', 'vt_start ': None, 'vt_end ': None, 'tt_start ': None, 'tt_end ': None}
[category] : Groceries => Furniture
{'s_id': 'S001', 'p_id': 'P0001', 'category': 'Furniture', 'region': 'North', 'seasonality': 'Summer', 'vt_start ': None, 'vt_end ': None, 'tt_start ': None, 'tt_end ': None}
[region] : North => South
{'s_id': 'S001', 'p_id': 'P0001', 'category': 'Furniture', 'region': 'South', 'seasonality': 'Summer', 'vt_start ': None, 'vt_end ': None, 'tt_start ': None, 'tt_end ': None}
{'s_id': 'S001', 'p_id': 'P0002', 'category': 'Toys', 'region': 'South', 'seasonality': 'Autumn', 'vt_start ': None, 'vt_end ': None, 'tt_start ': None, 'tt_end ': None}
[seasonality] : Autumn => Winter
{'

### Table 2
Inventory Sales

In [31]:
# Preview of the inventory-sales frame (df2).
# Intended table name: inventory_sales
df2.head()

Unnamed: 0,s_id,p_id,amount,units_sold,units_ordered,price,discount
0,S001,P0001,231,127,55,33.5,20
1,S001,P0002,204,150,66,63.01,20
2,S001,P0003,102,65,51,27.99,10
3,S001,P0004,469,61,164,32.72,10
4,S001,P0005,166,14,135,73.64,0


In [32]:
# Add four empty placeholder columns to df2, all initialised to None.
# NOTE(review): every column name ends with a trailing space ('vt_start ');
# kept as-is because other cells may index by these exact names -- confirm
# whether the space is intentional.
# Presumably vt_* / tt_* are valid-time / transaction-time bounds -- TODO confirm.
for temporal_col in ('vt_start ', 'vt_end ', 'tt_start ', 'tt_end '):
  df2[temporal_col] = None
df2.head()

Unnamed: 0,s_id,p_id,amount,units_sold,units_ordered,price,discount,vt_start,vt_end,tt_start,tt_end
0,S001,P0001,231,127,55,33.5,20,,,,
1,S001,P0002,204,150,66,63.01,20,,,,
2,S001,P0003,102,65,51,27.99,10,,,,
3,S001,P0004,469,61,164,32.72,10,,,,
4,S001,P0005,166,14,135,73.64,0,,,,
