In [1]:
import pandas as pd
import numpy as np
import openpyxl

# Lift pandas' display limits so rendered DataFrames are never truncated
# (None disables the row/column cap).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### Load Data

In [2]:
# Raw source tables. The trailing comments record the (rows, columns) noted by the author.
global_cs_2023 = pd.read_excel("../Data/Global Case Sales 2023.xlsx")  # (572,887  ,  20)
master_facility = pd.read_excel("../Data/Master Facility List.xlsx")  # (7,126  ,  92)
assurance_volume = pd.read_csv("../Data/QSET_ASSURANCE_PROD_VOL_EXPORT.csv")  # (29,177  ,  5)

# Water workbook: keep only the rows for the wastewater-discharge indicator.
water_wwd = pd.read_excel("../Data/Water & WWD Comments.xlsx").pipe(
    lambda frame: frame[frame['Indicator'] == 'Total Wastewater Discharged (kL) [kL]']
)  # (9,254  ,  9)

In [3]:
# Column labels applied to the monthly numeric indicator export.
column_names = [
    "Indicator_Name",
    "Code",
    "Entity_Name",
    "Facility_ID",
    "Reporting_Period",
    "Answer",
]

# Read as UTF-16, tab-separated text; header=0 together with names= swaps the
# file's own header row for `column_names`.
numeric_input_ind = pd.read_csv(
    "../Data/Monthly Numeric Indicator.csv",
    encoding='utf-16',
    sep='\t',
    header=0,
    names=column_names,
)  # (197,418  ,  6)

In [4]:
# Column labels applied to the monthly text-input indicator export
# (this rebinds `column_names` from the numeric load above).
column_names = [
    "Reporting_Period",
    "Entity_Name",
    "Code",
    "Indicator_Name",
    "Answer",
    "Unit",
    "Help_Text",
    "Comments",
    "Frequency",
    "FolderPath",
    "Ord",
]

# Read as ISO-8859-1; header=0 together with names= swaps the file's own
# header row for `column_names`.
text_input_ind = pd.read_csv(
    "../Data/Monthly Text Input Indicator.csv",
    encoding='ISO-8859-1',
    header=0,
    names=column_names,
)  # (70,917  ,  11)

### Task 1: Text Input
- Check whether the answer for the same indicator at the same facility changes from one month to the next; if it does, flag it as an outlier.
- If the answer is missing, flag it as an outlier.

- Columns to keep: Facility_ID, Facility_Name, Bottler, OU, Reporting_Period, Detection_Results

In [5]:
# Facility_ID is the last '>'-separated segment of FolderPath, with surrounding whitespace removed.
text_input_ind['Facility_ID'] = text_input_ind['FolderPath'].map(
    lambda folder_path: folder_path.rsplit('>', 1)[-1].strip()
)

# Parse the IDs as numbers and store them as int64.
# NOTE: errors='coerce' turns unparseable IDs into NaN, and astype('int64') then
# raises on any NaN, so a bad ID stops the notebook here rather than passing through.
text_input_ind['Facility_ID'] = pd.to_numeric(
    text_input_ind['Facility_ID'],
    errors='coerce',
).astype('int64')

In [6]:
# Baseline: how many rows have no recorded answer right after loading?
missing_count_answer = text_input_ind["Answer"].isna().sum()

print(f"Number of missing values in 'Answer': {missing_count_answer}")

Number of missing values in 'Answer': 26106


In [7]:
# Attach facility name, bottler and business unit from the master list.
# A left join keeps every indicator row, including those with no matching facility.
text_input_ind = pd.merge(
    text_input_ind,
    master_facility[['FACILITY_ID', 'FACILITY_NAME', 'BTLR_NAME_ISSCOM', 'BUNAME_BU']],
    how='left',
    left_on='Facility_ID',
    right_on='FACILITY_ID',
)

# # Drop the extra FACILITY_ID column after merge to avoid duplication
# text_input_ind = text_input_ind.drop(columns=['FACILITY_ID_x'])

# # Rename columns to match the desired names
# text_input_ind = text_input_ind.rename(columns={'FACILITY_ID_y': 'Facility_ID', 'FACILITY_NAME': 'Facility_Name'})


In [8]:
# Shape after the left merge; the row count should still match the 70,917 rows noted at load time.
text_input_ind.shape

(70917, 16)

In [9]:
# Re-check after the master-facility merge: the missing-answer count should be unchanged.
missing_count_answer = text_input_ind["Answer"].isna().sum()

print(f"Number of missing values in 'Answer': {missing_count_answer}")

Number of missing values in 'Answer': 26106


In [10]:
# Trim the frame to the columns the Task 1 detection and its report need.
cols_to_keep = [
    "BUNAME_BU",
    "Reporting_Period",
    "Facility_ID",
    "FACILITY_NAME",
    "BTLR_NAME_ISSCOM",
    "Indicator_Name",
    "Answer",
]
text_input_ind = text_input_ind[cols_to_keep]

In [11]:
# Re-check after trimming columns: the missing-answer count should be unchanged.
missing_count_answer = text_input_ind["Answer"].isna().sum()

print(f"Number of missing values in 'Answer': {missing_count_answer}")

Number of missing values in 'Answer': 26106


In [12]:
# Shape after trimming to the seven columns listed in cols_to_keep.
text_input_ind.shape

(70917, 7)

In [None]:
def detect_changes(df):
    """
    Flag missing or changed answers per facility and indicator over time.

    Rows are ordered by facility, indicator and reporting period. A row is
    flagged when its answer is missing, or when it is not the first row of its
    (Facility_ID, Indicator_Name) series and its answer differs from the
    immediately preceding row of that series (a preceding missing answer counts
    as different). The first row of a series is flagged only if it is missing.

    Parameters:
    ----------
    df : pandas.DataFrame
        Must contain the columns 'BUNAME_BU', 'Reporting_Period', 'Facility_ID',
        'FACILITY_NAME', 'BTLR_NAME_ISSCOM', 'Indicator_Name' and 'Answer'.
        'Reporting_Period' must be parseable with the format '%m/%d/%Y' (or
        already be datetime). The input frame is not modified.

    Returns:
    -------
    df : pandas.DataFrame
        A sorted copy of the input with a trailing "BU" suffix stripped from
        'BUNAME_BU', 'Reporting_Period' converted to datetime, and three new
        columns: 'Flag' (0/1), 'Detection_Result' ("Missing Answer",
        "Answer Changed" or NA) and 'Detection_Details'.

    flagged_df : pandas.DataFrame
        The rows with Flag == 1, restricted to 'BUNAME_BU', 'Reporting_Period',
        'Facility_ID', 'FACILITY_NAME', 'Indicator_Name', 'BTLR_NAME_ISSCOM',
        'Detection_Result' and 'Detection_Details'.

    datasets : dict of pandas.DataFrame
        For every distinct cleaned BUNAME value:
        - "{buname}_full_dataset": all rows of that BU.
        - "{buname}_flagged_dataset": the flagged rows of that BU.
        Rows whose BU is missing are collected under the key built from the
        missing marker itself (e.g. "nan_full_dataset").

    Notes:
    -----
    - No files are written here; the function only builds the in-memory frames.
    - The original row index labels are preserved (the frame is sorted, not re-indexed).

    Example:
    -------
    df, flagged_df, datasets = detect_changes(input_dataframe)
    """
    series_keys = ['Facility_ID', 'Indicator_Name']
    date_format = '%m/%d/%Y'

    # Work on a copy so the caller's frame is never mutated and re-running the
    # cell always starts from the same input.
    df = df.copy()

    # Remove suffix "BU" from BUNAME_BU values
    df['BUNAME_BU'] = df['BUNAME_BU'].str.replace(r'\s*BU$', '', regex=True)

    # Ensure 'Reporting_Period' is a datetime object for sorting
    df['Reporting_Period'] = pd.to_datetime(df['Reporting_Period'], format=date_format)

    # After this sort every (facility, indicator) series is one contiguous,
    # chronologically ordered run of rows.
    df = df.sort_values(by=series_keys + ['Reporting_Period'])

    # Because each series is contiguous, "previous row of the same series" is just
    # the previous row, except on the first row of a series, which we mask out.
    # This replaces groupby().apply(), which returns a wide DataFrame (and breaks
    # the column assignment) when the data holds a single group.
    is_first = ~df.duplicated(subset=series_keys, keep='first')
    answer = df['Answer']
    prev_answer = answer.shift()
    is_missing = answer.isna()
    is_changed = (
        ~is_first
        & ~is_missing
        & (prev_answer.isna() | (answer != prev_answer))
    )

    df['Flag'] = (is_missing | is_changed).astype(int)

    # Structured results; rows that are not flagged keep NA in both columns.
    df['Detection_Result'] = pd.NA
    df['Detection_Details'] = pd.NA

    period_str = df['Reporting_Period'].dt.strftime(date_format)
    prev_period_str = period_str.shift()

    if is_missing.any():
        df.loc[is_missing, 'Detection_Result'] = "Missing Answer"
        df.loc[is_missing, 'Detection_Details'] = (
            "Anomaly Detected On: " + period_str[is_missing]
        )

    if is_changed.any():
        df.loc[is_changed, 'Detection_Result'] = "Answer Changed"
        df.loc[is_changed, 'Detection_Details'] = [
            f"Answer Changed From: {old} to {new}\n"
            f"Between: {start} and {end}"
            for old, new, start, end in zip(
                prev_answer[is_changed],
                answer[is_changed],
                prev_period_str[is_changed],
                period_str[is_changed],
            )
        ]

    # Create the flagged dataset with the specified columns
    flagged_columns = [
        'BUNAME_BU', 'Reporting_Period', 'Facility_ID', 'FACILITY_NAME',
        'Indicator_Name', 'BTLR_NAME_ISSCOM', 'Detection_Result', 'Detection_Details'
    ]
    flagged_df = df.loc[df['Flag'] == 1, flagged_columns]

    # Create datasets for each BUNAME
    datasets = {}
    for buname in df['BUNAME_BU'].unique():
        if pd.isna(buname):
            # `== NaN` never matches, so rows without a BU need an explicit isna()
            # test; otherwise they would silently vanish from the per-BU output.
            in_bu_full = df['BUNAME_BU'].isna()
            in_bu_flagged = flagged_df['BUNAME_BU'].isna()
        else:
            in_bu_full = df['BUNAME_BU'] == buname
            in_bu_flagged = flagged_df['BUNAME_BU'] == buname

        datasets[f"{buname}_full_dataset"] = df[in_bu_full]
        datasets[f"{buname}_flagged_dataset"] = flagged_df[in_bu_flagged]

    return df, flagged_df, datasets

In [14]:
def save_datasets_to_csv(datasets, flagged_df):
    """
    Write the flagged Task 1 results to CSV files in the current working directory.

    Parameters:
    ----------
    datasets : dict of pandas.DataFrame
        Mapping as returned by `detect_changes`. Only entries whose key ends in
        "_flagged_dataset" are written, each to "{bu_name}_task1_flagged.csv";
        every other entry is ignored.

    flagged_df : pandas.DataFrame
        Flagged rows from all BUs, written to "all_OUs_task1_flagged.csv".

    Returns:
    -------
    None
    """
    flagged_suffix = "_flagged_dataset"

    # Save each flagged dataset to CSV files
    for key, bu_flagged_df in datasets.items():
        if not key.endswith(flagged_suffix):
            continue
        # Strip only the known suffix. Splitting on the first "_" would truncate
        # BU names that contain an underscore and could make two BUs share a file.
        bu_name = key[:-len(flagged_suffix)]
        bu_flagged_df.to_csv(f"{bu_name}_task1_flagged.csv", index=False)

    # Save complete flagged DataFrame containing entries from all BUs
    flagged_df.to_csv("all_OUs_task1_flagged.csv", index=False)

In [15]:
# Run the Task 1 detection: annotated full frame, flagged subset, and per-BU dict of both.
full_dataset, flagged_dataset, separate_datasets = detect_changes(text_input_ind)

In [16]:
# Write one flagged CSV per BU plus the combined all-OUs file (relative paths, so into the working directory).
save_datasets_to_csv(separate_datasets, flagged_dataset)

In [None]:
# ### This function processes a DataFrame to identify anomalies or changes in answers within 
# specific groups based on Facility_ID and Indicator_Name over time. It operates on 
# columns such as BUNAME_BU, Reporting_Period, and Answer, updating the DataFrame to 
# add flags for anomalies and detailed detection results.


# def detect_changes(df):
    
#     # Remove suffix "BU" from BUNAME_BU values
#     df['BUNAME_BU'] = df['BUNAME_BU'].str.replace(r'\s*BU$', '', regex=True)
    
#     # Ensure 'Reporting_Period' is a datetime object for sorting
#     df['Reporting_Period'] = pd.to_datetime(df['Reporting_Period'], format='%m/%d/%Y')

#     # Sort the DataFrame by 'Facility_ID', 'Indicator_Name', and 'Reporting_Period'
#     df.sort_values(by=['Facility_ID', 'Indicator_Name', 'Reporting_Period'], inplace=True)

#     # Initialize 'Flag' column with default value 0
#     df['Flag'] = 0
    
#     # Define a function to flag changes and missing answers
#     def flag_changes_with_initial_check(group):
#         shifted = group['Answer'].shift()
#         # Flag changes or missing answers
#         flags = (group['Answer'] != shifted) | group['Answer'].isna()
#         # Explicitly set the first entry to True if it is NaN
#         if group['Answer'].isna().iloc[0]:
#             flags.iloc[0] = True
#         else:
#             flags.iloc[0] = False
#         return flags.astype(int)

#     # Apply the flagging function to each group
#     df['Flag'] = df.groupby(['Facility_ID', 'Indicator_Name'], as_index=False, group_keys=False).apply(flag_changes_with_initial_check)

#     # Initialize new columns for structured results
#     df['Detection_Result'] = pd.NA
#     df['Detection_Details'] = pd.NA

#     # Update detection result and details based on flags
#     for name, group in df.groupby(['Facility_ID', 'Indicator_Name']):
#         changes = group.index[group['Flag'] == 1]

#         for idx in changes:
#             facility_id = df.at[idx, 'Facility_ID']
#             indicator_name = df.at[idx, 'Indicator_Name']
#             reporting_period = df.at[idx, 'Reporting_Period'].strftime('%m/%d/%Y')

#             # Correctly find the previous reporting period
#             prev_idx = group.index.get_loc(idx) - 1
            
#             if prev_idx >= 0:
#                 previous_reporting_period = group['Reporting_Period'].iloc[prev_idx].strftime('%m/%d/%Y')
#                 previous_answer = group['Answer'].iloc[prev_idx]
#             else:
#                 previous_reporting_period = "N/A"
#                 previous_answer = None

#             if pd.isna(df.at[idx, 'Answer']):
#                 df.at[idx, 'Detection_Result'] = "Missing Answer"
#                 df.at[idx, 'Detection_Details'] = f"Anomaly Detected On: {reporting_period}"
#             else:
#                 if prev_idx >= 0:
#                     new_answer = df.at[idx, 'Answer']
#                     df.at[idx, 'Detection_Result'] = "Answer Changed"
#                     df.at[idx, 'Detection_Details'] = (
#                         f"Answer Changed From: {previous_answer} to {new_answer}\n"
#                         f"Between: {previous_reporting_period} and {reporting_period}"
#                     )

#     # Create the flagged dataset with the specified columns
#     flagged_df = df[df['Flag'] == 1][[
#         'BUNAME_BU', 'Reporting_Period', 'Facility_ID', 'FACILITY_NAME', 
#         'Indicator_Name', 'BTLR_NAME_ISSCOM', 'Detection_Result', 'Detection_Details'
#     ]]

#     return df, flagged_df

# # Example usage
# full_dataset, flagged_dataset = detect_changes(text_input_ind)
# display(full_dataset.head())
# display(flagged_dataset.head())