# Miscellaneous Notebook


Contains Code for:
* changing dates that will match week of the year
* adding States column to divide 45 stores into 5 States

# Start

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di 
import warnings
warnings.filterwarnings('ignore')


In [2]:
data_path = "./Datasets/Walmart - original.csv"
new_data_path = "./Datasets/Walmart.csv"
date_col_name = 'Date'
target_col_name = 'Weekly_Sales'

## Below field identifies the plot names
target_plot_label = "Walmart's Weekly Sales Analysis"

## group_cols - is grouping required (true or False)
## grp_by_cols - level at which the data is to be grouped to identify anomalies
## group - This filters the data based on selected values for the columns.

group_cols = True
grp_by_cols = ['State']
state = 'Florida'
group = (state)

## Read csv change the dates, add states and save back
saving original date into a new column

In [3]:
import datetime

def find_corresponding_date(original_date):
    # Parse the original date
    original_year, original_month, original_day = map(int, original_date.split('-'))
    
    # Get the day of the week for the original date
    original_weekday = datetime.datetime(original_year, original_month, original_day).weekday()
    target_year = original_year + 11
       
    # Calculate the ISO week number and day of the week
    date = datetime.datetime.strptime(original_date, '%Y-%m-%d')

    week_number, weekday = date.isocalendar()[1], date.weekday()
    
    # Create a new date in the target year with the same ISO week number and day of the week
    new_date = datetime.datetime.strptime(f'{target_year}-W{week_number}-{weekday+1}', '%Y-W%W-%w')
    # new_dates.append(new_date.strftime('%Y-%m-%d'))

    # print(original_date, new_date, target_date)
    
    return new_date.strftime('%Y-%m-%d')



In [4]:
def read(data_path, sheet_name = ''):
    df = pd.DataFrame()
    if data_path.split('.')[-1] == 'xlsx':
        if sheet_name:
            df = pd.read_excel(data_path, sheet_name=sheet_name)
        else:
            df = pd.read_excel(data_path)
        print("Shape of the data in file {} is {}".format(data_path, df.shape))
    else:
        try:
            df = pd.read_csv(data_path)
            print("Shape of the data in file {} is {}".format(data_path, df.shape))
            if df.shape[0] == 0:
                print("No data in file {}".format(data_path))
        except Exception as e:
            print("Issue while reading data at {} \n{}".format(data_path, e))
    return df


def standardize_date_col(dataframe, date_col):
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], format='%d-%m-%Y', errors='coerce').fillna(pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce'))
    # Convert all dates to 'mm-dd-yyyy' format
    dataframe[date_col] = dataframe[date_col].dt.strftime('%Y-%m-%d')
    return dataframe

## Add reagion column consisting 5 random states, with each of the regions containing 9 stores.
def add_state():
    # Define a function to map values based on the specified ranges
    def map_value(val):
        if 1 <= val <= 9:
            return 'Florida'
        elif 10 <= val <= 18:
            return 'California'
        elif 19 <= val <= 27:
            return 'Ohio'
        elif 28 <= val <= 36:
            return 'Texas'
        else:
            return 'Virginia'
    
    # Add a new column 'r' to the DataFrame based on the mapping function
    df['State'] = df['Store'].apply(map_value)
    return
     

In [5]:
# Read data from csv or excel, sheet_name is the sheet in excel that contians data 
df = read(data_path, sheet_name= 'RAW')
df.head(3)

Shape of the data in file ./Datasets/Walmart - original.csv is (6435, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,5/2/10,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12/2/10,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106


In [6]:
df = standardize_date_col(df, date_col_name)
df.head(3)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,2010-02-12,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,2010-02-19,1611968.17,0,39.93,2.514,211.289143,8.106


In [7]:
df['original_Date'] = df['Date']

In [8]:
# Updating dates:
original_dates = list(df['Date'])

corresponding_dates = [find_corresponding_date(date) for date in original_dates]
corresponding_dates

df['Date'] = corresponding_dates
df.head(3)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,original_Date
0,1,2021-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,2010-02-05
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,2010-02-12
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,2010-02-19


In [9]:
(df['Date'].min(), df['Date'].max()), (df['original_Date'].min(), df['original_Date'].max())

(('2021-02-05', '2023-10-27'), ('2010-02-05', '2012-10-26'))

In [10]:
# Remove original date column:
df.drop(columns=['original_Date'], inplace=True)
df.tail(3)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
6432,45,2023-10-13,734464.36,0,54.47,4.0,192.327265,8.667
6433,45,2023-10-20,718125.53,0,56.47,3.969,192.330854,8.667
6434,45,2023-10-27,760281.43,0,58.85,3.882,192.308899,8.667


In [13]:
add_state()
print(df['State'].value_counts(), df.shape)
df.head(3)

State
Florida       1287
California    1287
Ohio          1287
Texas         1287
Virginia      1287
Name: count, dtype: int64 (6435, 9)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,State
0,1,2021-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,Florida
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Florida
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,Florida


In [14]:
new_data_path

'./Datasets/Walmart.csv'

In [15]:
#writing back 
df.to_csv(new_data_path, index=False)
pd.read_csv(new_data_path)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,State
0,1,2021-02-05,1643690.90,0,42.31,2.572,211.096358,8.106,Florida
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.242170,8.106,Florida
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,Florida
3,1,2021-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,Florida
4,1,2021-03-05,1554806.68,0,46.50,2.625,211.350143,8.106,Florida
...,...,...,...,...,...,...,...,...,...
6430,45,2023-09-29,713173.95,0,64.88,3.997,192.013558,8.684,Virginia
6431,45,2023-10-06,733455.07,0,64.89,3.985,192.170412,8.667,Virginia
6432,45,2023-10-13,734464.36,0,54.47,4.000,192.327265,8.667,Virginia
6433,45,2023-10-20,718125.53,0,56.47,3.969,192.330854,8.667,Virginia


## Add Daily data

In [5]:
df = pd.read_csv(new_data_path)
df.head(3)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,State
0,1,2021-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,Florida
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Florida
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,Florida


In [6]:
def aggregate_data(df, state):
    # Aggregating data by date
    data_aggregated = df[df['State']==state].groupby('Date').agg(
        Weekly_Sales=('Weekly_Sales', 'sum'),
        Temperature=('Temperature', 'mean'),
        Fuel_Price=('Fuel_Price', 'mean'),
        CPI=('CPI', 'mean'),
        Unemployment=('Unemployment', 'mean'),
        Holiday_Flag=('Holiday_Flag', 'max') # Assuming if any store has a holiday, it's a holiday across the region
    ).reset_index()
    return data_aggregated

In [17]:
def impute_daily_data(df):
    # Initialize a new DataFrame for the imputed data
    imputed_data = []
    
    # Convert the 'Date' column to datetime format
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Function to distribute weekly sales based on the specified percentages
    def distribute_sales(weekly_sales):
        return {
            'Monday': weekly_sales * 0.05,
            'Tuesday': weekly_sales * 0.10,
            'Wednesday': weekly_sales * 0.20,
            'Thursday': weekly_sales * 0.10,
            'Friday': weekly_sales * 0.20,
            'Saturday': weekly_sales * 0.15, 
            'Sunday': weekly_sales * 0.20,
        }
    
    for index, row in df.iterrows():
        week_start = row['Date'] - timedelta(days=row['Date'].weekday())  # Calculate the start of the week (Monday)
        sales_distribution = distribute_sales(row['Weekly_Sales'])
        for i in range(7):  # For each day in the week
            day_name = (week_start + timedelta(days=i)).strftime('%A')
            new_row = row.copy()
            new_row['Date'] = week_start + timedelta(days=i)
            new_row['Daily_Sales'] = sales_distribution[day_name]
            new_row['Temperature'] += np.random.uniform(0, 2)  # Add random value to Temperature
            new_row['Holiday_Flag'] = 0  # Set all Holiday_Flag to 0
            imputed_data.append(new_row)
    
    # Create a DataFrame from the imputed data
    imputed_df = pd.DataFrame(imputed_data)

    # clean up the data:
    # Identify numeric columns
    numeric_columns = imputed_df.select_dtypes(include=[int, float]).columns
    
    # Round the numeric values in these columns to 2 decimal places
    imputed_df[numeric_columns] = imputed_df[numeric_columns].round(2)

    # Sort by 'Date' to ensure chronological order
    imputed_df.sort_values(by='Date', inplace=True)
    imputed_df.reset_index(drop=True, inplace=True)
    
    return imputed_df
    

In [18]:
df = pd.read_csv(new_data_path)

daily_sales = pd.DataFrame()
for state in df['State'].unique():
    print(state)
    # Imputed data and add state column back 
    imputed_data = impute_daily_data(aggregate_data(df, 'Florida'))
    imputed_data['State'] = state
    
    daily_sales = pd.concat([daily_sales, imputed_data])
    # daily_sales = impute_daily_data(data_agg/regated)


    # Displaying the first few rows of the aggregated data
    print(daily_sales.shape)#, data_aggregated.head()

Florida
(1001, 8)
California
(2002, 8)
Ohio
(3003, 8)
Texas
(4004, 8)
Virginia
(5005, 8)


In [19]:
daily_sales.Date.value_counts()

Date
2021-02-01    5
2022-12-05    5
2022-11-22    5
2022-11-23    5
2022-11-24    5
             ..
2022-01-06    5
2022-01-07    5
2022-01-08    5
2022-01-09    5
2023-10-29    5
Name: count, Length: 1001, dtype: int64

In [20]:
# daily_sales = impute_daily_data(data_aggregated)
# Showing the first 10 rows to verify the transformation
daily_sales#.head(10)

Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida
1,2021-02-02,1039762.27,39.03,2.58,200.61,7.55,0,Florida
2,2021-02-03,2079524.55,38.14,2.58,200.61,7.55,0,Florida
3,2021-02-04,1039762.27,37.56,2.58,200.61,7.55,0,Florida
4,2021-02-05,2079524.55,38.26,2.58,200.61,7.55,0,Florida
...,...,...,...,...,...,...,...,...
996,2023-10-25,1914213.71,67.84,3.53,211.92,5.67,0,Virginia
997,2023-10-26,957106.86,67.13,3.53,211.92,5.67,0,Virginia
998,2023-10-27,1914213.71,67.75,3.53,211.92,5.67,0,Virginia
999,2023-10-28,1435660.29,66.93,3.53,211.92,5.67,0,Virginia


In [21]:
((daily_sales.Date.max() - daily_sales.Date.min()).days + 1) * 5

5005

In [23]:
# Save the file locally
daily_data_path = './Datasets/Walmart_daily.csv'
daily_sales.to_csv(daily_data_path,index=False)

## Correct the data (Holiday_Flag) Introduce randomness in the data

In [1]:
from datetime import datetime, timedelta
import pandas as pd

df = pd.read_csv('./Datasets/Walmart_daily.csv')
df.head(5)

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State
0,2/1/21,519881.14,37.69,2.58,200.61,7.55,0,Florida
1,2/2/21,1039762.27,39.03,2.58,200.61,7.55,0,Florida
2,2/3/21,2079524.55,38.14,2.58,200.61,7.55,0,Florida
3,2/4/21,1039762.27,37.56,2.58,200.61,7.55,0,Florida
4,2/5/21,2079524.55,38.26,2.58,200.61,7.55,0,Florida


In [2]:
df_original = pd.read_csv('./Datasets/Walmart.csv')
df_original.head(5)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,State
0,1,2021-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,Florida
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Florida
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,Florida
3,1,2021-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,Florida
4,1,2021-03-05,1554806.68,0,46.5,2.625,211.350143,8.106,Florida


In [10]:
# Convert the 'Date' column to datetime format
df_original['Date'] = pd.to_datetime(df_original['Date'])#, format='%m/%d/%y')

# Add a new column for the week number. The week starts on Saturday and ends on Friday.
# Pandas week starts on Monday by default, so we adjust to have it start on Saturday
df_original['Week_Number'] = (df_original['Date'] - pd.Timedelta(days=5)).dt.isocalendar().week
df_original['Year'] = df_original['Date'].dt.year
df_original.head(15)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,State,Week_Number,Year,Year_Week
0,1,2021-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,Florida,4,2021,2021-4
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Florida,5,2021,2021-5
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,Florida,6,2021,2021-6
3,1,2021-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,Florida,7,2021,2021-7
4,1,2021-03-05,1554806.68,0,46.5,2.625,211.350143,8.106,Florida,8,2021,2021-8
5,1,2021-03-12,1439541.59,0,57.79,2.667,211.380643,8.106,Florida,9,2021,2021-9
6,1,2021-03-19,1472515.79,0,54.58,2.72,211.215635,8.106,Florida,10,2021,2021-10
7,1,2021-03-26,1404429.92,0,51.45,2.732,211.018042,8.106,Florida,11,2021,2021-11
8,1,2021-04-02,1594968.28,0,62.27,2.719,210.82045,7.808,Florida,12,2021,2021-12
9,1,2021-04-09,1545418.53,0,65.86,2.77,210.622857,7.808,Florida,13,2021,2021-13


In [7]:
# Do the same for Daily data:
# Convert the 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%y')

# Add a new column for the week number. The week starts on Saturday and ends on Friday.
# Pandas week starts on Monday by default, so we adjust to have it start on Saturday
df['Week_Number'] = (df['Date'] - pd.Timedelta(days=5)).dt.isocalendar().week
df['Year'] = df['Date'].dt.year
df.head(15)

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4
1,2021-02-02,1039762.27,39.03,2.58,200.61,7.55,0,Florida,4,2021,2021-4
2,2021-02-03,2079524.55,38.14,2.58,200.61,7.55,0,Florida,4,2021,2021-4
3,2021-02-04,1039762.27,37.56,2.58,200.61,7.55,0,Florida,4,2021,2021-4
4,2021-02-05,2079524.55,38.26,2.58,200.61,7.55,0,Florida,4,2021,2021-4
5,2021-02-06,1559643.41,37.58,2.58,200.61,7.55,0,Florida,5,2021,2021-5
6,2021-02-07,2079524.55,37.45,2.58,200.61,7.55,0,Florida,5,2021,2021-5
7,2021-02-08,518924.83,38.06,2.55,200.74,7.55,0,Florida,5,2021,2021-5
8,2021-02-09,1037849.66,38.26,2.55,200.74,7.55,0,Florida,5,2021,2021-5
9,2021-02-10,2075699.33,37.48,2.55,200.74,7.55,0,Florida,5,2021,2021-5


In [12]:
# For simplicity and to ensure we handle year-end weeks correctly, combine Year and Week_Number into a unique identifier
df['Year_Week'] = df['Year'].astype(str) + '-' + df['Week_Number'].astype(str)
df_original['Year_Week'] = df_original['Year'].astype(str) + '-' + df_original['Week_Number'].astype(str)

# Now, we'll group by this new column and distribute the weekly sales randomly across the days
# First, let's calculate the total weekly sales
weekly_sales = df_original.groupby('Year_Week')['Weekly_Sales'].sum().reset_index()
weekly_sales

Unnamed: 0,Year_Week,Weekly_Sales
0,2021-10,44988974.64
1,2021-11,44133961.05
2,2021-12,50423831.26
3,2021-13,47365290.44
4,2021-14,45183667.08
...,...,...
138,2023-52,44955421.95
139,2023-6,50197056.96
140,2023-7,45771506.57
141,2023-8,46861034.97


In [14]:
# Join back on the original dataframe to have the weekly total alongside each day
# df = df.merge(weekly_sales, on='Year_Week', suffixes=('', '_Weekly_Total'))
df = df.merge(weekly_sales, on='Year_Week', suffixes=('', '_Weekly_Total'))
df.head(15)

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week,Weekly_Sales
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5
1,2021-02-02,1039762.27,39.03,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5
2,2021-02-03,2079524.55,38.14,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5
3,2021-02-04,1039762.27,37.56,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5
4,2021-02-05,2079524.55,38.26,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5
5,2021-02-06,1559643.41,37.58,2.58,200.61,7.55,0,Florida,5,2021,2021-5,48336677.63
6,2021-02-07,2079524.55,37.45,2.58,200.61,7.55,0,Florida,5,2021,2021-5,48336677.63
7,2021-02-08,518924.83,38.06,2.55,200.74,7.55,0,Florida,5,2021,2021-5,48336677.63
8,2021-02-09,1037849.66,38.26,2.55,200.74,7.55,0,Florida,5,2021,2021-5,48336677.63
9,2021-02-10,2075699.33,37.48,2.55,200.74,7.55,0,Florida,5,2021,2021-5,48336677.63


In [17]:
weekly_sales[weekly_sales['Year_Week']=='2021-6']

Unnamed: 0,Year_Week,Weekly_Sales
44,2021-6,48276993.78


In [19]:
# Now, let's randomly distribute the weekly total across the days within each week
import numpy as np

# Seed for reproducibility
np.random.seed(42)

def redistribute_sales(row):
    # For each group (each unique Year_Week), redistribute sales
    # The sum of 'Daily_Sales' across the week should equal 'Daily_Sales_Weekly_Total'
    # To ensure sums match exactly due to floating point arithmetic, handle the last day explicitly
    days_in_week = row.shape[0]
    weekly_total = row['Weekly_Sales'].iloc[0]
    random_sales = np.random.random(days_in_week - 1)
    random_sales /= random_sales.sum()  # Normalize to sum to 1
    random_sales *= weekly_total  # Scale to sum to the weekly total
    random_sales = np.append(random_sales, weekly_total - random_sales.sum())  # Ensure the sum matches exactly
    return random_sales

# Apply the redistribution function to each group
df['Daily_Sales_New'] = df.groupby('Year_Week').apply(lambda x: redistribute_sales(x)).explode().values

# Check the results to ensure the transformation looks correct
df[['Date', 'Daily_Sales', 'Daily_Sales_New', 'Year_Week']].head()

  df['Daily_Sales_New'] = df.groupby('Year_Week').apply(lambda x: redistribute_sales(x)).explode().values


Unnamed: 0,Date,Daily_Sales,Daily_Sales_New,Year_Week
0,2021-02-01,519881.14,1127107.776974,2021-4
1,2021-02-02,1039762.27,2860995.216569,2021-4
2,2021-02-03,2079524.55,2202797.572268,2021-4
3,2021-02-04,1039762.27,1801549.685429,2021-4
4,2021-02-05,2079524.55,469508.643124,2021-4


In [51]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,1,0,0
1,1,1,0,0
1,2,1,0,0
1,3,1,0,0
2,4,2,1,1
2,5,2,1,1
2,6,2,0,1
2,7,2,0,1
3,8,3,0,0
3,9,3,0,0


In [26]:
df[df['Year_Week']=='2023-42'].groupby('Year_Week')['Daily_Sales'].sum(),df[df['Year_Week']=='2021-4'].groupby('Year_Week')['Daily_Sales_New'].sum(),

(Year_Week
 2023-42    47882917.1
 Name: Daily_Sales, dtype: float64,
 Year_Week
 2021-4    40761643.451478
 Name: Daily_Sales_New, dtype: object)

In [30]:
df['Daily_Sales_New'] = df['Daily_Sales_New'].apply(lambda x: np.round(x, decimals=2))
df

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week,Weekly_Sales,Daily_Sales_New
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1127107.78
1,2021-02-02,1039762.27,39.03,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,2860995.22
2,2021-02-03,2079524.55,38.14,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,2202797.57
3,2021-02-04,1039762.27,37.56,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1801549.69
4,2021-02-05,2079524.55,38.26,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,469508.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,2023-10-23,478553.43,66.78,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,2403298.40
4991,2023-10-24,957106.86,68.08,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,342097.12
4992,2023-10-25,1914213.71,67.84,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,3088678.23
4993,2023-10-26,957106.86,67.13,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,1201204.05


In [39]:
year_week = '2023-42'
len(df[df['Year_Week']==year_week]), df[df['Year_Week']==year_week].groupby('Year_Week')['Daily_Sales'].sum(),df[df['Year_Week']==year_week].groupby('Year_Week')['Daily_Sales_New'].sum(), \
df[df['Year_Week']==year_week]['Weekly_Sales'].head(1)

(35,
 Year_Week
 2023-42    47882917.1
 Name: Daily_Sales, dtype: float64,
 Year_Week
 2023-42    44251518.34
 Name: Daily_Sales_New, dtype: float64,
 992    45544116.29
 Name: Weekly_Sales, dtype: float64)

### Include Holidays:

Data got messed up, will work on it later.

In [52]:
df = pd.read_csv('./Datasets/Walmart_daily.csv')
df

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week,Weekly_Sales,Daily_Sales_New
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.127108e+06
1,2021-02-02,1039762.27,39.03,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,2.860995e+06
2,2021-02-03,2079524.55,38.14,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,2.202798e+06
3,2021-02-04,1039762.27,37.56,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.801550e+06
4,2021-02-05,2079524.55,38.26,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,4.695086e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,2023-10-23,478553.43,66.78,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,2.403298e+06
4991,2023-10-24,957106.86,68.08,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,3.420971e+05
4992,2023-10-25,1914213.71,67.84,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,3.088678e+06
4993,2023-10-26,957106.86,67.13,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,1.201204e+06


In [53]:
df_original

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,State,Week_Number,Year,Year_Week
0,1,2021-02-05,1643690.90,0,42.31,2.572,211.096358,8.106,Florida,4,2021,2021-4
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.242170,8.106,Florida,5,2021,2021-5
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,Florida,6,2021,2021-6
3,1,2021-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,Florida,7,2021,2021-7
4,1,2021-03-05,1554806.68,0,46.50,2.625,211.350143,8.106,Florida,8,2021,2021-8
...,...,...,...,...,...,...,...,...,...,...,...,...
6430,45,2023-09-29,713173.95,0,64.88,3.997,192.013558,8.684,Virginia,38,2023,2023-38
6431,45,2023-10-06,733455.07,0,64.89,3.985,192.170412,8.667,Virginia,39,2023,2023-39
6432,45,2023-10-13,734464.36,0,54.47,4.000,192.327265,8.667,Virginia,40,2023,2023-40
6433,45,2023-10-20,718125.53,0,56.47,3.969,192.330854,8.667,Virginia,41,2023,2023-41


In [54]:
df

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week,Weekly_Sales,Daily_Sales_New
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.127108e+06
1,2021-02-02,1039762.27,39.03,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,2.860995e+06
2,2021-02-03,2079524.55,38.14,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,2.202798e+06
3,2021-02-04,1039762.27,37.56,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.801550e+06
4,2021-02-05,2079524.55,38.26,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,4.695086e+05
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4990,2023-10-23,478553.43,66.78,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,2.403298e+06
4991,2023-10-24,957106.86,68.08,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,3.420971e+05
4992,2023-10-25,1914213.71,67.84,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,3.088678e+06
4993,2023-10-26,957106.86,67.13,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,1.201204e+06


In [55]:
# Join back on the original dataframe to have the weekly total alongside each day
# df = df.merge(weekly_sales, on='Year_Week', suffixes=('', '_Weekly_Total'))
df = df.merge(df_original[['Holiday_Flag', 'Year_Week']], on='Year_Week', suffixes=('', '_Original'))
df.head(15)

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week,Weekly_Sales,Daily_Sales_New,Holiday_Flag_Original
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
1,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
2,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
3,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
4,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
5,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
6,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
7,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
8,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0
9,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.5,1127108.0,0


In [56]:
df[df['Holiday_Flag_Original']==1]

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week,Weekly_Sales,Daily_Sales_New,Holiday_Flag_Original
225,2021-02-06,1559643.41,37.58,2.58,200.61,7.55,0,Florida,5,2021,2021-5,48336677.63,4.694361e+05,1
226,2021-02-06,1559643.41,37.58,2.58,200.61,7.55,0,Florida,5,2021,2021-5,48336677.63,4.694361e+05,1
227,2021-02-06,1559643.41,37.58,2.58,200.61,7.55,0,Florida,5,2021,2021-5,48336677.63,4.694361e+05,1
228,2021-02-06,1559643.41,37.58,2.58,200.61,7.55,0,Florida,5,2021,2021-5,48336677.63,4.694361e+05,1
229,2021-02-06,1559643.41,37.58,2.58,200.61,7.55,0,Florida,5,2021,2021-5,48336677.63,4.694361e+05,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222565,2023-09-08,2029690.80,82.24,3.71,210.98,5.97,0,Virginia,35,2023,2023-35,48330059.31,2.274433e+06,1
222566,2023-09-08,2029690.80,82.24,3.71,210.98,5.97,0,Virginia,35,2023,2023-35,48330059.31,2.274433e+06,1
222567,2023-09-08,2029690.80,82.24,3.71,210.98,5.97,0,Virginia,35,2023,2023-35,48330059.31,2.274433e+06,1
222568,2023-09-08,2029690.80,82.24,3.71,210.98,5.97,0,Virginia,35,2023,2023-35,48330059.31,2.274433e+06,1


1

In [99]:
# Function to randomly set Holiday_flag to 1 for every week that had a holiday.
def randomly_set_ones(group):
    num_ones_added = 0
    for index, row in group.iterrows():
        if row['Holiday_Flag_Original'] == 1 and num_ones_added <= np.random.randint(1,2):
            group.at[index, 'Holiday_Flag'] = 1
            num_ones_added += 1
    return group


In [100]:
# Apply the function to each group of 'a' values
df1 = df.groupby('Year_Week').apply(randomly_set_ones)

df1.reset_index(drop=True, inplace=True)

  df1 = df.groupby('Year_Week').apply(randomly_set_ones)


In [103]:
df#1[df1['Holiday_Flag']==1]

Unnamed: 0,Date,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,State,Week_Number,Year,Year_Week,Weekly_Sales,Daily_Sales_New,Holiday_Flag_Original
0,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.127108e+06,0
1,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.127108e+06,0
2,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.127108e+06,0
3,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.127108e+06,0
4,2021-02-01,519881.14,37.69,2.58,200.61,7.55,0,Florida,4,2021,2021-4,49750740.50,1.127108e+06,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224770,2023-10-27,1914213.71,67.75,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,7.450581e-09,0
224771,2023-10-27,1914213.71,67.75,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,7.450581e-09,0
224772,2023-10-27,1914213.71,67.75,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,7.450581e-09,0
224773,2023-10-27,1914213.71,67.75,3.53,211.92,5.67,0,Virginia,42,2023,2023-42,45544116.29,7.450581e-09,0


In [102]:
df.to_csv('./Datasets/Walmart_daily.csv',index=False)

PermissionError: [Errno 13] Permission denied: './Datasets/Walmart_daily.csv'

## Redo Daily

## Split dataset into train and inference

In [1]:
import pandas as pd

In [9]:
base_folder = 'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline'
dataset_name = 'Walmart_Weekly'
original_file = 'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Walmart_Weekly_labelled.csv'
saved_train_file = base_folder + '/train/' + dataset_name +'_train.csv'
saved_inference_file = base_folder + '/inference/' + dataset_name +'_inference.csv'

In [10]:
def read(data_path, sheet_name = ''):
    df = pd.DataFrame()
    if data_path.split('.')[-1] == 'xlsx':
        if sheet_name:
            df = pd.read_excel(data_path, sheet_name=sheet_name)
        else:
            df = pd.read_excel(data_path)
        print("Shape of the data in file {} is {}".format(data_path, df.shape))
    else:
        try:
            df = pd.read_csv(data_path)
            print("Shape of the data in file {} is {}".format(data_path, df.shape))
            if df.shape[0] == 0:
                print("No data in file {}".format(data_path))
        except Exception as e:
            print("Issue while reading data at {} \n{}".format(data_path, e))
    return df


def standardize_date_col(dataframe, date_col):
    dataframe[date_col] = pd.to_datetime(dataframe[date_col], format='%d-%m-%Y', errors='coerce').fillna(pd.to_datetime(df['Date'], format='%d/%m/%y', errors='coerce'))
    # Convert all dates to 'mm-dd-yyyy' format
    dataframe[date_col] = dataframe[date_col].dt.strftime('%Y-%m-%d')
    return dataframe

     

In [11]:
# Read data from csv or excel, sheet_name is the sheet in excel that contians data 
data = read(original_file, sheet_name= 'RAW')
data.head(3)

Shape of the data in file C:\Geeta\learning\projects\AnomalyDetectionSXM\Notebooks\Datasets\Walmart_Weekly_labelled.csv is (715, 11)


Unnamed: 0,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag,Anomaly,Sales_Amount_Upper,Sales_Amount_Lower,State
0,2021-02-05,10397622.73,37.2,2.58,200.61,7.55,0,0,11573691.83,9513064.59,Florida
1,2021-02-12,10378496.65,36.72,2.55,200.74,7.55,1,0,11032180.28,8971553.04,Florida
2,2021-02-19,10060556.61,39.7,2.52,200.79,7.55,0,0,10763335.98,8702708.74,Florida


In [12]:
data.Date.max()

'2023-10-27'

In [13]:
inference_data = data[data['Date']==data.Date.max()]
train_data = data[data['Date']!=data.Date.max()]
data.shape, train_data.shape, inference_data.shape

((715, 11), (710, 11), (5, 11))

In [14]:
inference_data.columns

Index(['Date', 'Weekly_Sales', 'Temperature', 'Fuel_Price', 'CPI',
       'Unemployment', 'Holiday_Flag', 'Anomaly', 'Sales_Amount_Upper',
       'Sales_Amount_Lower', 'State'],
      dtype='object')

In [15]:
saved_train_file, saved_inference_file

('C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline/train/Walmart_Weekly_train.csv',
 'C:\\Geeta\\learning\\projects\\AnomalyDetectionSXM\\Notebooks\\Datasets\\Pipeline/inference/Walmart_Weekly_inference.csv')

In [17]:
train_data.to_csv(saved_train_file, index=False)
inference_data.to_csv(saved_inference_file, index=False)

## Next functionality

In [1]:
import pandas as pd

file_path = './Datasets/Walmart.csv'
data = pd.read_csv(file_path)

# Standardize date format and display the first few rows to understand its structure
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')

# Display the first few rows of the dataframe
data.head()



Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,State
0,1,2021-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,Florida
1,1,2021-02-12,1641957.44,1,38.51,2.548,211.24217,8.106,Florida
2,1,2021-02-19,1611968.17,0,39.93,2.514,211.289143,8.106,Florida
3,1,2021-02-26,1409727.59,0,46.63,2.561,211.319643,8.106,Florida
4,1,2021-03-05,1554806.68,0,46.5,2.625,211.350143,8.106,Florida


In [3]:
from datetime import datetime, timedelta
import numpy as np

# Define the date range
start_date = datetime.strptime("2021-02-01", "%Y-%m-%d")
end_date = datetime.strptime("2023-10-29", "%Y-%m-%d")

# Generate all dates within the range
all_dates = pd.date_range(start=start_date, end=end_date, freq='D')

# Define a function to map weekly sales to daily, with random distribution
def distribute_weekly_sales(row, all_dates):
    start_week_date = row['Date'] - timedelta(days=row['Date'].weekday() + 1)  # Adjusting to start from Saturday
    end_week_date = start_week_date + timedelta(days=6)
    
    # Filter dates within the week
    week_dates = all_dates[(all_dates >= start_week_date) & (all_dates <= end_week_date)]
    
    # Distribute sales randomly across the week
    sales_distribution = np.random.dirichlet(np.ones(len(week_dates)), size=1) * row['Weekly_Sales']
    
    # Create a DataFrame for the week
    weekly_data = pd.DataFrame({
        'Date': week_dates,
        'Store': row['Store'],
        'Daily_Sales': sales_distribution.flatten(),
        'Temperature': row['Temperature'] * (1 + np.random.uniform(-0.0001, 0.0001, len(week_dates))),
        'Fuel_Price': row['Fuel_Price'] * (1 + np.random.uniform(-0.0001, 0.0001, len(week_dates))),
        'CPI': row['CPI'] * (1 + np.random.uniform(-0.0001, 0.0001, len(week_dates))),
        'Unemployment': row['Unemployment'] * (1 + np.random.uniform(-0.0001, 0.0001, len(week_dates))),
        'State': row['State'],
        'Holiday_Flag': row['Holiday_Flag']  # This will be adjusted later for daily holiday flag
    })
    
    return weekly_data

# Convert 'Date' column to datetime
data['Date'] = pd.to_datetime(data['Date'])

# Apply the function to distribute sales across the days of the week
weekly_mapped_data = pd.concat([distribute_weekly_sales(row, all_dates) for index, row in data.iterrows()])

weekly_mapped_data.head()


Unnamed: 0,Date,Store,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,State,Holiday_Flag
0,2021-02-01,1,81989.301025,42.308687,2.57212,211.082677,8.106305,Florida,0
1,2021-02-02,1,529538.775093,42.309271,2.572137,211.080533,8.105218,Florida,0
2,2021-02-03,1,217136.828395,42.311895,2.571906,211.081689,8.105332,Florida,0
3,2021-02-04,1,137259.479387,42.307588,2.571832,211.082694,8.105521,Florida,0
4,2021-02-05,1,9768.223723,42.313824,2.572037,211.09247,8.105589,Florida,0


In [4]:
import holidays

# For the United States
us_holidays = holidays.UnitedStates(years=[2021, 2022, 2023])

# To print or use the list of holidays
for date, name in sorted(us_holidays.items()):
    print(date, name)


2021-01-01 New Year's Day
2021-01-18 Martin Luther King Jr. Day
2021-02-15 Washington's Birthday
2021-05-31 Memorial Day
2021-06-18 Juneteenth National Independence Day (observed)
2021-06-19 Juneteenth National Independence Day
2021-07-04 Independence Day
2021-07-05 Independence Day (observed)
2021-09-06 Labor Day
2021-10-11 Columbus Day
2021-11-11 Veterans Day
2021-11-25 Thanksgiving
2021-12-24 Christmas Day (observed)
2021-12-25 Christmas Day
2021-12-31 New Year's Day (observed)
2022-01-01 New Year's Day
2022-01-17 Martin Luther King Jr. Day
2022-02-21 Washington's Birthday
2022-05-30 Memorial Day
2022-06-19 Juneteenth National Independence Day
2022-06-20 Juneteenth National Independence Day (observed)
2022-07-04 Independence Day
2022-09-05 Labor Day
2022-10-10 Columbus Day
2022-11-11 Veterans Day
2022-11-24 Thanksgiving
2022-12-25 Christmas Day
2022-12-26 Christmas Day (observed)
2023-01-01 New Year's Day
2023-01-02 New Year's Day (observed)
2023-01-16 Martin Luther King Jr. Day
202

In [5]:

# Mark 'Holiday_Flag' as 1 for US holidays and increase 'Daily_Sales' by up to 10% randomly for these holidays
weekly_mapped_data['Holiday_Flag'] = weekly_mapped_data['Date'].isin(us_holidays).astype(int)
weekly_mapped_data.loc[weekly_mapped_data['Holiday_Flag'] == 1, 'Daily_Sales'] *= np.random.uniform(1, 1.1, size=sum(weekly_mapped_data['Holiday_Flag'] == 1))

weekly_mapped_data.head()

  weekly_mapped_data['Holiday_Flag'] = weekly_mapped_data['Date'].isin(us_holidays).astype(int)


Unnamed: 0,Date,Store,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,State,Holiday_Flag
0,2021-02-01,1,81989.301025,42.308687,2.57212,211.082677,8.106305,Florida,0
1,2021-02-02,1,529538.775093,42.309271,2.572137,211.080533,8.105218,Florida,0
2,2021-02-03,1,217136.828395,42.311895,2.571906,211.081689,8.105332,Florida,0
3,2021-02-04,1,137259.479387,42.307588,2.571832,211.082694,8.105521,Florida,0
4,2021-02-05,1,9768.223723,42.313824,2.572037,211.09247,8.105589,Florida,0


In [40]:
weekly_mapped_data[weekly_mapped_data['Date']=='2021-02-15']

Unnamed: 0,Date,State,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag
70,2021-02-15,California,2108856.45,37.0,2.73,150.61,8.84,1
71,2021-02-15,Florida,2333725.84,39.7,2.52,200.79,7.55,1
72,2021-02-15,Ohio,1666585.41,29.19,2.78,157.48,8.03,1
73,2021-02-15,Texas,2118788.19,41.56,2.66,163.13,9.68,1
74,2021-02-15,Virginia,1760029.69,41.15,2.67,167.34,8.99,1


In [19]:
# Sum stores under state
weekly_mapped_data = weekly_mapped_data.groupby(['Date',"State"]).agg(
    Daily_Sales=('Daily_Sales', 'sum'),
    Temperature=('Temperature', 'mean'),
    Fuel_Price=('Fuel_Price', 'mean'),
    CPI=('CPI', 'mean'),
    Unemployment=('Unemployment', 'mean'),
    Holiday_Flag=('Holiday_Flag', 'max') # Assuming if any store has a holiday, it's a holiday across the region
).reset_index()


In [20]:
weekly_mapped_data

Unnamed: 0,Date,State,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag
0,2021-02-01,California,1610296.53,32.527778,2.770000,150.502222,8.840000,0
1,2021-02-01,Florida,1044395.77,37.197778,2.574444,200.607778,7.552222,0
2,2021-02-01,Ohio,1085410.27,22.867778,2.816667,157.338889,8.031111,0
3,2021-02-01,Texas,647356.29,39.262222,2.706667,163.000000,9.681111,0
4,2021-02-01,Virginia,645338.48,38.331111,2.716667,167.205556,8.988889,0
...,...,...,...,...,...,...,...,...
4995,2023-10-28,California,1096515.76,56.587778,3.910000,157.865556,7.230000,0
4996,2023-10-28,Florida,1012465.39,66.451111,3.530000,211.924444,5.670000,0
4997,2023-10-28,Ohio,1035764.10,56.995556,3.915556,166.206667,7.140000,0
4998,2023-10-28,Texas,588231.68,64.087778,3.790000,171.394444,7.763333,0


In [17]:
sum(weekly_mapped_data[(weekly_mapped_data['State']=='Ohio') & (weekly_mapped_data['Date']=='2021-02-01')]['Daily_Sales'])

1085410.2788387286

In [21]:
# clean up the data:
# Identify numeric columns
numeric_columns = weekly_mapped_data.select_dtypes(include=[int, float]).columns

# Round the numeric values in these columns to 2 decimal places
weekly_mapped_data[numeric_columns] = weekly_mapped_data[numeric_columns].round(2)
weekly_mapped_data

Unnamed: 0,Date,State,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag
0,2021-02-01,California,1610296.53,32.53,2.77,150.50,8.84,0
1,2021-02-01,Florida,1044395.77,37.20,2.57,200.61,7.55,0
2,2021-02-01,Ohio,1085410.27,22.87,2.82,157.34,8.03,0
3,2021-02-01,Texas,647356.29,39.26,2.71,163.00,9.68,0
4,2021-02-01,Virginia,645338.48,38.33,2.72,167.21,8.99,0
...,...,...,...,...,...,...,...,...
4995,2023-10-28,California,1096515.76,56.59,3.91,157.87,7.23,0
4996,2023-10-28,Florida,1012465.39,66.45,3.53,211.92,5.67,0
4997,2023-10-28,Ohio,1035764.10,57.00,3.92,166.21,7.14,0
4998,2023-10-28,Texas,588231.68,64.09,3.79,171.39,7.76,0


In [29]:
weekly_mapped_data = weekly_mapped_data[weekly_mapped_data['Date']!='2023-10-28']
weekly_mapped_data

Unnamed: 0,Date,State,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag
0,2021-02-01,California,1610296.53,32.53,2.77,150.50,8.84,0
1,2021-02-01,Florida,1044395.77,37.20,2.57,200.61,7.55,0
2,2021-02-01,Ohio,1085410.27,22.87,2.82,157.34,8.03,0
3,2021-02-01,Texas,647356.29,39.26,2.71,163.00,9.68,0
4,2021-02-01,Virginia,645338.48,38.33,2.72,167.21,8.99,0
...,...,...,...,...,...,...,...,...
4990,2023-10-27,California,2430803.18,56.59,3.91,157.86,7.23,0
4991,2023-10-27,Florida,1022090.78,66.46,3.53,211.92,5.67,0
4992,2023-10-27,Ohio,1215995.54,57.00,3.92,166.20,7.14,0
4993,2023-10-27,Texas,932135.59,64.09,3.79,171.40,7.76,0


In [39]:
# weekly_mapped_data = weekly_mapped_data.rename(columns={'Weekly_Sales':'Daily_Sales'})
weekly_mapped_data

Unnamed: 0,Date,State,Daily_Sales,Temperature,Fuel_Price,CPI,Unemployment,Holiday_Flag
0,2021-02-01,California,1610296.53,32.53,2.77,150.50,8.84,0
1,2021-02-01,Florida,1044395.77,37.20,2.57,200.61,7.55,0
2,2021-02-01,Ohio,1085410.27,22.87,2.82,157.34,8.03,0
3,2021-02-01,Texas,647356.29,39.26,2.71,163.00,9.68,0
4,2021-02-01,Virginia,645338.48,38.33,2.72,167.21,8.99,0
...,...,...,...,...,...,...,...,...
4990,2023-10-27,California,2430803.18,56.59,3.91,157.86,7.23,0
4991,2023-10-27,Florida,1022090.78,66.46,3.53,211.92,5.67,0
4992,2023-10-27,Ohio,1215995.54,57.00,3.92,166.20,7.14,0
4993,2023-10-27,Texas,932135.59,64.09,3.79,171.40,7.76,0


In [38]:
weekly_mapped_data.loc[weekly_mapped_data['Holiday_Flag'] == 1, 'Daily_Sales'] *= np.random.uniform(1, 1.25, size=sum(weekly_mapped_data['Holiday_Flag'] == 1))
weekly_mapped_data['Daily_Sales'] = weekly_mapped_data['Daily_Sales'].round(2)


In [41]:
weekly_mapped_data.to_csv("./Datasets/weekly_mapped_data.csv", index=False)