In [27]:
import pandas as pd
import os

In [43]:
# Dataset folder: the `data` directory next to the current working directory.
DATA_DIR = os.path.join(os.getcwd(), os.pardir, 'data')

In [44]:
# Only the first MAX_ROWS records are used. `nrows` stops the parser early instead
# of reading the whole CSV and slicing it afterwards.
MAX_ROWS = 10000
df = pd.read_csv(os.path.join(DATA_DIR, "retail_store_inventory.csv"), nrows=MAX_ROWS)

# Preview as the cell's last expression so the notebook actually renders it.
df.head()


In [37]:
# Columns of the raw dataset that neither output table uses.
drop_column = [
  "Date",
  "Demand Forecast",
  "Weather Condition",
  "Holiday/Promotion",
  "Competitor Pricing",
]

# Raw CSV header -> short snake_case name used from here on.
columns_dict = {
  "Store ID": "s_id",
  "Product ID": "p_id",
  "Category": "category",
  "Region": "region",
  "Seasonality": "seasonality",
  "Inventory Level": "amount",
  "Units Sold": "units_sold",
  "Units Ordered": "units_ordered",
  "Price": "price",
  "Discount": "discount",
}

# Discard the unused columns, then apply the short names, in one chained expression.
df_dropped = df.drop(columns=drop_column).rename(columns=columns_dict)

# Column subsets for the two tables; both carry the s_id / p_id pair.
df1_columns = ["s_id", "p_id", "category", "region", "seasonality"]
df2_columns = ["s_id", "p_id", "amount", "units_sold", "units_ordered", "price", "discount"]

def filter_columns(df: pd.DataFrame, preserved_cols: list) -> pd.DataFrame:
  """Return a new frame holding only the columns of ``df`` named in ``preserved_cols``.

  The surviving columns keep the order they have in ``df``; ``df`` itself is
  left untouched. ``list.remove`` raises ``ValueError`` when a name in
  ``preserved_cols`` is not a column of ``df``.
  """
  # Start from every column and strike out the ones to keep; what is left
  # is the set of columns to drop.
  unwanted = df.columns.to_list()
  for keep in preserved_cols:
    unwanted.remove(keep)

  return df.drop(columns=unwanted)

# Split the renamed frame into the two per-table frames:
# df1 takes df1_columns, df2 takes df2_columns.
df1, df2 = (
  filter_columns(df_dropped, wanted)
  for wanted in (df1_columns, df2_columns)
)

### Table 1 
Inventory Details

In [38]:
# Inventory details: the s_id / p_id keys plus category, region and seasonality.
# Target table name: inventory_details
df1.head()


Unnamed: 0,s_id,p_id,category,region,seasonality
0,S001,P0001,Groceries,North,Autumn
1,S001,P0002,Toys,South,Autumn
2,S001,P0003,Toys,West,Summer
3,S001,P0004,Toys,North,Autumn
4,S001,P0005,Electronics,East,Summer


In [39]:
# Reserve the vt/tt interval columns (presumably valid time / transaction time),
# left empty here.
for temporal_col in ('vt_start', 'vt_end', 'tt_start', 'tt_end'):
  df1[temporal_col] = None
df1.head()

Unnamed: 0,s_id,p_id,category,region,seasonality,vt_start,vt_end,tt_start,tt_end
0,S001,P0001,Groceries,North,Autumn,,,,
1,S001,P0002,Toys,South,Autumn,,,,
2,S001,P0003,Toys,West,Summer,,,,
3,S001,P0004,Toys,North,Autumn,,,,
4,S001,P0005,Electronics,East,Summer,,,,


In [40]:
from random import randrange
from datetime import timedelta
from datetime import datetime

def random_date(start, end):
    """
    Return a random datetime that is at or after ``start`` and before ``end``.

    The offset added to ``start`` is a whole number of seconds, so any
    sub-second part of ``end - start`` is ignored. When the two datetimes are
    less than one second apart there is nothing to draw from, and ``start``
    itself is returned instead of letting ``randrange(0)`` fail with an
    opaque "empty range" error.

    Raises:
        ValueError: if ``end`` is earlier than ``start``.
    """
    # Floor-dividing by one second gives the same integer as
    # days * 24 * 60 * 60 + seconds.
    whole_seconds = (end - start) // timedelta(seconds=1)
    if whole_seconds < 0:
        raise ValueError(f"end ({end}) must not be earlier than start ({start})")
    if whole_seconds == 0:
        return start
    return start + timedelta(seconds=randrange(whole_seconds))

# Example usage (the date strings must follow the month/day/year format string):
# d1 = datetime.strptime('1/1/2008 1:30 PM', '%m/%d/%Y %I:%M %p')
# d2 = datetime.strptime('1/31/2009 4:50 AM', '%m/%d/%Y %I:%M %p')

# print(random_date(d1, d2))

In [54]:
import random

def populate_df_details(df : pd.DataFrame, num_duplicate: int = 3) -> pd.DataFrame:
  """
  Expand every row of ``df`` into a short, randomly generated version history.

  Each input row yields one base record plus between 0 and ``num_duplicate``
  follow-up records. A follow-up record copies its predecessor, changes one
  attribute column to a different value that occurs in that column of ``df``,
  and starts its tt interval where the predecessor's tt interval ended. With a
  20% chance it also gets a fresh vt interval. The last record of each history
  has ``vt_end`` and ``tt_end`` set to the string "infinity".

  Attribute columns are all columns after the first two (which are treated as
  identifying keys and never changed), excluding the four vt/tt columns.
  Columns with only one distinct value are never picked, because there is no
  different value to switch to; if no attribute column can be changed, only
  the base record is produced for each row.

  Args:
    df: frame to expand; it is not modified.
    num_duplicate: inclusive upper bound on follow-up records per row.

  Returns:
    A new DataFrame with one row per generated record. The vt/tt columns hold
    '%Y-%m-%d %H:%M:%S' strings, or "infinity" for open-ended intervals.
  """
  temporal_cols = ('vt_start', 'vt_end', 'tt_start', 'tt_end')
  date_format = '%Y-%m-%d %H:%M:%S'
  infinity = "infinity"

  # Random dates are drawn between this fixed lower bound and the current time.
  min_date = datetime.strptime('1/1/2015 01:00 AM', '%m/%d/%Y %I:%M %p')
  now_date = datetime.now()

  def random_interval():
    # Draw a start first, then an end that is not earlier than that start.
    start = random_date(min_date, now_date)
    return start, random_date(start, now_date)

  def as_text(value):
    # The "infinity" marker passes through; datetimes become formatted strings.
    return value if value == infinity else value.strftime(date_format)

  cols = df.columns.tolist()
  attribute_cols = [col for col in cols[2:] if col not in temporal_cols]
  # Distinct values per attribute column, computed once up front rather than
  # once per generated record.
  unique_vals = {col: df[col].unique().tolist() for col in attribute_cols}
  # A column with a single distinct value offers nothing different to pick;
  # retrying a random choice on it would never terminate, so it is left out.
  changeable_cols = [col for col in attribute_cols if len(unique_vals[col]) > 1]

  result_data = []
  for _, row in df.iterrows():
    row_dict = row.to_dict()

    # Base record: independent random vt and tt intervals.
    row_dict['vt_start'], row_dict['vt_end'] = random_interval()
    row_dict['tt_start'], row_dict['tt_end'] = random_interval()
    current_data = [row_dict]

    num_duplicate_data = random.randint(0, num_duplicate) if changeable_cols else 0
    for _ in range(num_duplicate_data):
      col_name = random.choice(changeable_cols)
      alternatives = [val for val in unique_vals[col_name] if val != row_dict[col_name]]

      new_row_dict = row_dict.copy()
      new_row_dict[col_name] = random.choice(alternatives)

      # The new version's tt interval begins where the previous one ended.
      new_row_dict['tt_start'] = row_dict['tt_end']
      new_row_dict['tt_end'] = random_date(row_dict['tt_end'], now_date)

      # Optionally give the new version its own vt interval (2 in 10 chance).
      if random.randint(1, 10) <= 2:
        new_row_dict['vt_start'], new_row_dict['vt_end'] = random_interval()

      current_data.append(new_row_dict)

      # The next version, if any, derives from this one.
      row_dict = new_row_dict

    # The latest version of every history is open-ended in both intervals.
    current_data[-1]['tt_end'] = infinity
    current_data[-1]['vt_end'] = infinity

    result_data.extend(current_data)

  # Convert the datetimes to text only now, after all date arithmetic is done.
  for data in result_data:
    for col in temporal_cols:
      data[col] = as_text(data[col])
  return pd.DataFrame(result_data)


# Generate the randomized records (with filled vt/tt columns) from the inventory-details rows.
df1_populated = populate_df_details(df1)

In [55]:
# Write the generated records to DATA_DIR; index=False keeps the pandas row index out of the CSV.
df1_populated.to_csv(os.path.join(DATA_DIR, 'inventory_details.csv'), index=False)

### Table 2
Inventory Sales

In [31]:
# Inventory sales: the s_id / p_id keys plus amount, units_sold, units_ordered, price and discount.
# Target table name: inventory_sales
df2.head()

Unnamed: 0,s_id,p_id,amount,units_sold,units_ordered,price,discount
0,S001,P0001,231,127,55,33.5,20
1,S001,P0002,204,150,66,63.01,20
2,S001,P0003,102,65,51,27.99,10
3,S001,P0004,469,61,164,32.72,10
4,S001,P0005,166,14,135,73.64,0


In [32]:
# Add the empty vt/tt interval columns. The names must not carry trailing
# whitespace: 'vt_start ' would be a different column from the 'vt_start'
# used for df1 and written by populate_df_details.
df2['vt_start'] = None
df2['vt_end'] = None
df2['tt_start'] = None
df2['tt_end'] = None
df2.head()

Unnamed: 0,s_id,p_id,amount,units_sold,units_ordered,price,discount,vt_start,vt_end,tt_start,tt_end
0,S001,P0001,231,127,55,33.5,20,,,,
1,S001,P0002,204,150,66,63.01,20,,,,
2,S001,P0003,102,65,51,27.99,10,,,,
3,S001,P0004,469,61,164,32.72,10,,,,
4,S001,P0005,166,14,135,73.64,0,,,,
