In [1]:
import pandas as pd
import numpy as np
import openpyxl

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

### Load Data

In [2]:
global_cs_2023 = pd.read_excel("../Data/Global Case Sales 2023.xlsx") # (572,887  ,  20)
master_facility = pd.read_excel("../Data/Master Facility List.xlsx") # (7,126  ,  92)
assurance_volume = pd.read_csv("../Data/QSET_ASSURANCE_PROD_VOL_EXPORT.csv") # (29,177  ,  5)

water_wwd = pd.read_excel("../Data/Water & WWD Comments.xlsx")
water_wwd = water_wwd[water_wwd['Indicator'] == 'Total Wastewater Discharged (kL) [kL]']  # (9,254  ,  9)

In [3]:
column_names = [
    "Indicator_Name", "Code", "Entity_Name", 
    "Facility_ID", "Reporting_Period", "Answer"
]

numeric_input_ind = pd.read_csv("../Data/Monthly Numeric Indicator.csv", encoding='utf-16', sep='\t', header=0, names=column_names) # (197,418  ,  6)

In [4]:
column_names = [
    "Reporting_Period", "Entity_Name", "Code", "Indicator_Name", "Answer", "Unit", 
    "Help_Text", "Comments", "Frequency", "FolderPath", "Ord"
]
text_input_ind = pd.read_csv("../Data/Monthly Text Input Indicator.csv", encoding='ISO-8859-1',header=0, names=column_names) # (70,917  ,  11)

### Task 1: Text Input
- Check to see if answers for the same indicator switches one month to next, if yes outlier
- If answer is missing, outlier

- Columns to keep: Facility_ID, Facility_Name, Bottler, OU, Reporting_Period, Detection_Results

In [5]:
# Extract Facility_ID by splitting FolderPath at '>', taking the last part, and trimming whitespace.
text_input_ind['Facility_ID'] = text_input_ind['FolderPath'].apply(lambda x: x.split('>')[-1].strip())

# Convert Facility_ID to a numeric type, coercing errors to NaN, and then change to int64.
text_input_ind['Facility_ID'] = pd.to_numeric(text_input_ind['Facility_ID'], errors='coerce').astype('int64')

In [6]:
# Count the number of missing values in the "Answer" column
missing_count_answer = text_input_ind["Answer"].isnull().sum()

# Print the result
print(f"Number of missing values in 'Answer': {missing_count_answer}")

Number of missing values in 'Answer': 26106


In [7]:
# Perform a merge based on Facility_ID
text_input_ind = text_input_ind.merge(
    master_facility[['FACILITY_ID', 'FACILITY_NAME', 'BTLR_NAME_ISSCOM', 'BUNAME_BU']],
    left_on='Facility_ID',
    right_on='FACILITY_ID',
    how='left'
)

# # Drop the extra FACILITY_ID column after merge to avoid duplication
# text_input_ind = text_input_ind.drop(columns=['FACILITY_ID_x'])

# # Rename columns to match the desired names
# text_input_ind = text_input_ind.rename(columns={'FACILITY_ID_y': 'Facility_ID', 'FACILITY_NAME': 'Facility_Name'})


In [8]:
text_input_ind.shape

(70917, 16)

In [9]:
# Count the number of missing values in the "Answer" column
missing_count_answer = text_input_ind["Answer"].isnull().sum()

# Print the result
print(f"Number of missing values in 'Answer': {missing_count_answer}")

Number of missing values in 'Answer': 26106


In [10]:
# Keep only the relevant columns
cols_to_keep = ["BUNAME_BU", "Reporting_Period", "Facility_ID", "FACILITY_NAME", "BTLR_NAME_ISSCOM", "Indicator_Name", "Answer"]
text_input_ind = text_input_ind[cols_to_keep]

In [11]:
# Count the number of missing values in the "Answer" column
missing_count_answer = text_input_ind["Answer"].isnull().sum()

# Print the result
print(f"Number of missing values in 'Answer': {missing_count_answer}")

Number of missing values in 'Answer': 26106


In [12]:
text_input_ind.shape

(70917, 7)

In [None]:
#### Successfully Flags Where Indicator_Name Changes from One Month to the Next
def flag_changes(df):
    # Ensure 'Reporting_Period' is a datetime object for sorting
    df['Reporting_Period'] = pd.to_datetime(df['Reporting_Period'], format='%m/%d/%Y')

    # Sort the DataFrame by 'Facility_ID', 'Indicator_Name', and 'Reporting_Period'
    df.sort_values(by=['Facility_ID', 'Indicator_Name', 'Reporting_Period'], inplace=True)

    # Initialize 'Flag' column with default value 0
    df['Flag'] = 0

    # Identify where the 'Answer' field changes for each 'Facility_ID' and 'Indicator_Name' combination
    df['Flag'] = df.groupby(['Facility_ID', 'Indicator_Name'])['Answer'].transform(
        lambda x: x != x.shift())

    # Convert boolean to integer (0, 1) for the 'Flag' column
    df['Flag'] = df['Flag'].astype(int)

    # Return the modified DataFrame
    return df

# Example usage
# Assuming 'text_input_ind' is your DataFrame
text_flag = flag_changes(text_input_ind)


Unnamed: 0,BUNAME_BU,Reporting_Period,Facility_ID,FACILITY_NAME,BTLR_NAME_ISSCOM,Indicator_Name,Answer,Flag,Detection_Result
27277,AFRICA BU,2024-01-01,12,BUKAVU,HEINEKEN,Do you have secondary operations on-site? (ie....,No,1,"Facility ID: 12, switched Indicator from Elect..."
30217,AFRICA BU,2024-02-01,12,BUKAVU,HEINEKEN,Do you have secondary operations on-site? (ie....,No,0,
33145,AFRICA BU,2024-03-01,12,BUKAVU,HEINEKEN,Do you have secondary operations on-site? (ie....,No,0,
36064,AFRICA BU,2024-04-01,12,BUKAVU,HEINEKEN,Do you have secondary operations on-site? (ie....,No,0,
38992,AFRICA BU,2024-05-01,12,BUKAVU,HEINEKEN,Do you have secondary operations on-site? (ie....,No,0,


In [25]:
def flag_changes(df):
    # Ensure 'Reporting_Period' is a datetime object for sorting
    df['Reporting_Period'] = pd.to_datetime(df['Reporting_Period'], format='%m/%d/%Y')

    # Sort the DataFrame by 'Facility_ID', 'Indicator_Name', and 'Reporting_Period'
    df.sort_values(by=['Facility_ID', 'Indicator_Name', 'Reporting_Period'], inplace=True)

    # Initialize 'Flag' column with default value 0
    df['Flag'] = 0

    # Identify where the 'Answer' field changes for each 'Facility_ID' and 'Indicator_Name' combination
    def flag_changes_with_initial_check(group):
        # Shift the group to compare current values to the previous ones
        shifted = group['Answer'].shift()
        # Set True if current answer differs from the previous, except for the first entry
        flags = group['Answer'] != shifted
        # Explicitly set the first entry to False
        flags.iloc[0] = False
        return flags.astype(int)  # Convert boolean to integer

    # Apply the flagging function to each group
    df['Flag'] = df.groupby(['Facility_ID', 'Indicator_Name'], as_index=False, group_keys=False).apply(flag_changes_with_initial_check)

    # Return the modified DataFrame
    return df

text_flag = flag_changes(text_input_ind)

In [26]:
def update_detection_result(df):
    df['Reporting_Period'] = pd.to_datetime(df['Reporting_Period'], format='%m/%d/%Y')
    df.sort_values(by=['Facility_ID', 'Indicator_Name', 'Reporting_Period'], inplace=True)

    df['Detection_Result'] = pd.NA

    for name, group in df.groupby(['Facility_ID', 'Indicator_Name']):
        changes = group.index[group['Flag'] == 1]

        for idx in changes:
            prev_idx = idx - 1
            if prev_idx in df.index:
                facility_id = df.at[idx, 'Facility_ID']
                indicator_name = df.at[idx, 'Indicator_Name']
                new_answer = df.at[idx, 'Answer']
                prev_answer = df.at[prev_idx, 'Answer']
                detection_text = f"Facility ID: {facility_id}, Indicator: {indicator_name}, Answer changed from {prev_answer} to {new_answer}"
                df.at[idx, 'Detection_Result'] = detection_text

    return df

In [27]:
# Example usage
tmp = update_detection_result(text_flag)
tmp[(tmp['Facility_ID'] == 10013)]

Unnamed: 0,BUNAME_BU,Reporting_Period,Facility_ID,FACILITY_NAME,BTLR_NAME_ISSCOM,Indicator_Name,Answer,Flag,Detection_Result
27223,INSWA BU,2024-01-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
30163,INSWA BU,2024-02-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
33091,INSWA BU,2024-03-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
36010,INSWA BU,2024-04-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
38938,INSWA BU,2024-05-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
41851,INSWA BU,2024-06-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
44785,INSWA BU,2024-07-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
47719,INSWA BU,2024-08-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
50632,INSWA BU,2024-09-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,
53530,INSWA BU,2024-10-01,10013,HCCB SANAND,HINDUSTAN CC BVG,Do you have secondary operations on-site? (ie....,No,0,


In [None]:
tmp