In [78]:
import pandas as pd
import re

# Load the two CSV files compared below. Paths are relative to the
# notebook's working directory.
# NOTE(review): presumably Menu.csv is the raw export and CleanedMenu.csv is
# the output of the cleaning workflow -- confirm against the pipeline.
original_data = pd.read_csv('../../../Menu.csv')
cleaned_data = pd.read_csv('../CleanedMenu.csv')

# Define functions to check each IC violation
def check_special_characters(event):
    """Report whether a value contains any flagged punctuation character.

    The value is converted with ``str`` before testing, so non-string
    inputs are checked on their string form. Flagged characters are
    ``[ ] ; . , : ! ? ( ) " '``.

    Returns:
        True if at least one flagged character is present, else False.
    """
    text = str(event)
    found = re.search(r'[\[\];.,:!?\(\)"\']', text)
    return found is not None

def check_leading_trailing_whitespace(event):
    """Report whether a value has whitespace at its start or end.

    The value is converted with ``str`` and compared against its
    ``strip()``-ed form.

    Returns:
        True if stripping changes the text, else False.
    """
    text = str(event)
    trimmed = text.strip()
    return trimmed != text

def check_case_standardization(event):
    """Report whether a value is not fully upper-cased.

    Missing values (per ``pd.isna``) are never reported as violations.
    Other values are converted with ``str`` and compared against their
    ``upper()`` form.

    Returns:
        True if upper-casing would change the text, else False.
    """
    if not pd.isna(event):
        text = str(event)
        return text.upper() != text
    return False

def check_material_consistency(material):
    """Report whether a material value is outside the accepted vocabulary.

    Missing values (per ``pd.isna``) are never reported as violations.
    The comparison is exact and case-sensitive.

    Returns:
        True if the value is not one of the accepted material names.
    """
    accepted = frozenset(
        ('CARD', 'FOLDER', 'BROADSIDE', 'BOOKLET', 'BROADSHEET', 'PAPER', 'CARDS')
    )
    if pd.isna(material):
        return False
    is_accepted = material in accepted
    return not is_accepted

def check_dimensions_format(dimensions):
    """Report whether a dimensions value deviates from ``<number> x <number>``.

    The accepted form is two non-negative decimal numbers separated by
    ``x`` or ``X``, with optional whitespace around the separator.
    Missing values (per ``pd.isna``) are never reported as violations.

    Returns:
        True if the string form of the value does not match the pattern.
    """
    if pd.isna(dimensions):
        return False
    text = str(dimensions)
    matched = re.match(r'^\d+(\.\d+)?\s*[xX]\s*\d+(\.\d+)?$', text)
    return matched is None

def check_area_calculation(length, width, area, is_centimeters=False):
    """Report whether ``area`` disagrees with ``length * width``.

    Args:
        length: First side; anything ``float`` accepts.
        width: Second side; anything ``float`` accepts.
        area: Recorded area to compare against.
        is_centimeters: When true, the product is divided by 2.54 twice
            before the comparison.

    Returns:
        False if any of the three values is missing (per ``pd.isna``).
        True if a value cannot be converted (``ValueError``) or if the
        computed and recorded areas differ by more than 0.01.
    """
    if any(pd.isna(item) for item in (length, width, area)):
        return False
    try:
        computed = float(length) * float(width)
        if is_centimeters:
            computed = computed / 2.54 / 2.54
        difference = abs(computed - float(area))
    except ValueError:
        return True
    return difference > 0.01

def check_single_float_number(value):
    """Report whether a value cannot be read as a single floating-point number.

    Missing values (per ``pd.isna``) are never reported as violations.

    The previous implementation could never return True: ``float(value)``
    always yields a ``float``, and a failed conversion also returned
    False. A failed conversion is now reported as a violation, matching
    how ``check_area_calculation`` treats unparseable input.

    Args:
        value: The cell value to test.

    Returns:
        True if ``float(value)`` fails, False if it succeeds or the value
        is missing.
    """
    if pd.isna(value):
        return False
    try:
        float(value)
    except (TypeError, ValueError):
        # Not convertible, e.g. "4.5 x 6" or free text.
        return True
    return False
        
def check_special_characters_location(location):
    """Report whether a location value contains flagged punctuation.

    Flagged characters are ``[ ] : ; , ! ? ( ) "``. Unlike the event
    check, periods and apostrophes are not flagged here. Missing values
    (per ``pd.isna``) are never reported as violations.

    Returns:
        True if at least one flagged character is present, else False.
    """
    if not pd.isna(location):
        found = re.search(r'[\[\]:;,!?\(\)"]', str(location))
        return found is not None
    return False

# Baseline metrics for the original data: violation counts and distinct-value
# counts for the 'event' and 'location' columns.
_original_events = original_data['event']
_original_locations = original_data['location']

original_violations = {
    "special_characters": _original_events.apply(check_special_characters).sum(),
    "whitespace": _original_events.apply(check_leading_trailing_whitespace).sum(),
    "case_standardization": _original_events.apply(check_case_standardization).sum(),
    "unique_events": _original_events.nunique(),
    "location_special_characters": _original_locations.apply(check_special_characters_location).sum(),
    "location_whitespace": _original_locations.apply(check_leading_trailing_whitespace).sum(),
    "unique_locations": _original_locations.nunique(),
}

# Number of non-missing cells per cleaned column; used as the denominator
# when violation counts are turned into percentages.
total_non_empty_rows = {
    column: len(cleaned_data[column].dropna())
    for column in (
        "event_transform",
        "Material",
        "Dimensions",
        "Length",
        "Width",
        "Area",
        "location",
    )
}

# Metrics for the cleaned data. Event and location entries are absolute
# counts; material, dimensions, length, width and area entries are
# percentages of the non-empty cells in the respective column.
_cleaned_events = cleaned_data['event_transform']
_cleaned_locations = cleaned_data['location']


def _percent_of_non_empty(flags, column):
    """Return the flagged share of the non-empty cells of *column*, in percent."""
    return flags.sum() / total_non_empty_rows[column] * 100


def _row_area_violation(row):
    """Apply the area check to one row; rows lacking a needed field pass."""
    if 'Length' in row and 'Width' in row and 'Area' in row:
        return check_area_calculation(
            row['Length'],
            row['Width'],
            row['Area'],
            is_centimeters=row['isCentimeters'] == 1,
        )
    return False


cleaned_violations = {
    "special_characters": _cleaned_events.apply(check_special_characters).sum(),
    "whitespace": _cleaned_events.apply(check_leading_trailing_whitespace).sum(),
    "case_standardization": _cleaned_events.apply(check_case_standardization).sum(),
    "unique_events": cleaned_data['event_cluster'].nunique(),
    "material_consistency": _percent_of_non_empty(
        cleaned_data['Material'].apply(check_material_consistency), 'Material'
    ),
    "dimensions_format": _percent_of_non_empty(
        cleaned_data['Dimensions'].apply(check_dimensions_format), 'Dimensions'
    ),
    "length_single_float": _percent_of_non_empty(
        cleaned_data['Length'].apply(check_single_float_number), 'Length'
    ),
    "width_single_float": _percent_of_non_empty(
        cleaned_data['Width'].apply(check_single_float_number), 'Width'
    ),
    "area_calculation": _percent_of_non_empty(
        cleaned_data.apply(_row_area_violation, axis=1), 'Area'
    ),
    "location_special_characters": _cleaned_locations.apply(check_special_characters_location).sum(),
    "location_whitespace": _cleaned_locations.apply(check_leading_trailing_whitespace).sum(),
    "unique_locations": _cleaned_locations.nunique(),
}

# One row per dataset, one column per metric. Metrics present in only one of
# the two dictionaries show up as NaN in the other row.
violations_df = pd.DataFrame(
    [original_violations, cleaned_violations],
    index=["Original", "Cleaned"],
)
print(violations_df)


          special_characters  whitespace  case_standardization  unique_events  \
Original                 916           3                   729           1770   
Cleaned                    0           0                     0            285   

          location_special_characters  location_whitespace  unique_locations  \
Original                         1002                   14              6283   
Cleaned                           213                    0              5318   

          material_consistency  dimensions_format  length_single_float  \
Original                   NaN                NaN                  NaN   
Cleaned                 0.8289           1.143298                  0.0   

          width_single_float  area_calculation  
Original                 NaN               NaN  
Cleaned                  0.0               0.0  
