In [5]:
import pandas as pd
import os
import datetime as dt

# Extracting
def extract(store_data, extra_data) :
    extra_df = pd.read_parquet(extra_data)
    merged_df = store_data.merge(extra_df, on="index")
    return merged_df

merged_df = extract(grocery_sales, "extra_data.parquet")

# Transforming
def transform(raw_data):
    raw_data.fillna(
        value={
            "Weekly_Sales": raw_data["Weekly_Sales"].mean(),
            "CPI": raw_data["CPI"].mean(),
            "Unemployment": raw_data["Unemployment"].mean()
        }, inplace = True
    )
    
    raw_data["Date"] = pd.to_datetime(raw_data["Date"], format="%Y-%m-%d")

    raw_data = raw_data[["Store_ID", "Date", "Dept", "IsHoliday", "Weekly_Sales", "CPI", "Unemployment"]]

    raw_data = raw_data.loc[raw_data["Weekly_Sales"] > 10000.0, :]
    print("THIS IS RAW DATA WEEKLY_SALES", raw_data["Weekly_Sales"].dtypes)

    raw_data["Month"] = raw_data["Date"].dt.month
    raw_data = raw_data.drop("Date", axis = 1)
    
    return raw_data

clean_data = transform(merged_df)
print(clean_data)

def avg_weekly_sales_per_month(clean_data):
    agg_data = round(clean_data.groupby(by=["Month"])["Weekly_Sales"].mean(),2)
    agg_data = agg_data.reset_index()
    agg_data = agg_data.rename(columns={"Weekly_Sales": "Avg_Sales"})
    return agg_data

agg_data = avg_monthly_sales(clean_data)

# Loading
def load(cleaned_data, cleaned_path, agg_data, agg_path):
    try:
        cleaned_data.to_csv(cleaned_path, index= False)
        agg_data.to_csv(agg_path, index = False)
    except: 
        print("Failed to load the data")

load(clean_data, "clean_data.csv", agg_data, "agg_data.csv")

# Validation
def validation(file_path):
    file_exists = os.path.exists(file_path)
    print(file_exists)
    if not file_exists:
        raise Exception(f"File does not Exists at path {file_path}")

validation("clean_data.csv")
validation("agg_data.csv")


THIS IS RAW DATA WEEKLY_SALES float64
        Store_ID  Dept  IsHoliday  ...         CPI  Unemployment  Month
0              1     1          0  ...  211.096358      8.106000    2.0
1              1    26          0  ...  211.096358      8.106000    2.0
2              1    17          0  ...  211.096358      8.106000    2.0
5              1    79          0  ...  211.096358      7.500052    2.0
6              1    55          0  ...  211.096358      7.500052    2.0
...          ...   ...        ...  ...         ...           ...    ...
231513        24    40          0  ...  134.514367      8.212000    5.0
231515        24    93          0  ...  134.514367      8.212000    5.0
231516        24     9          0  ...  134.514367      8.212000    5.0
231517        24     8          0  ...  134.514367      8.212000    5.0
231519        24    87          0  ...  134.514367      8.212000    5.0

[106231 rows x 7 columns]
True
True
