In [126]:
# Import and leverage the imported file for analysis
import pandas as pd
import openpyxl
import os
import tkinter as tk
from tkinter import Tk
from tkinter import filedialog

Tk().withdraw()
file_path = filedialog.askopenfilename()

In [127]:
# Extract the base name of the file
file_name = os.path.basename(file_path)

# Remove the extension from the file name
file_name_without_extension = os.path.splitext(file_name)[0]

In [128]:
# Read the Excel file into a DataFrame
df = pd.read_excel(file_path)

print(file_name)
print(df.columns)

[Hartford pilot data] Zesty 100K Sample - External 01122023.xlsx
Index(['Ref', 'Effective Date', 'Street', 'City', 'State', 'Zip', 'Lat',
       'Long'],
      dtype='object')


In [129]:
# Define the possible names of the address columns
address_columns = {
    'street_and_house_number': ['Addr1','Risk Address','Street'],
    'city': ['City','Risk City','city'],
    'state_abbreviation': ['State','StateProvCd','Risk State'],
    'postal_code': ['Risk Zip Code','PostalCd','Zip','Zip Code'],
    'latitude': ['Lat', 'Risk Lat', 'lat'],
    'longitude': ['Long','Risk Long','long']
    }
    
# Check if any of the address column names are in the DataFrame
address_column_mask = df.columns.isin(
    address_columns['street_and_house_number'] +
    address_columns['city'] +
    address_columns['state_abbreviation'] +
    address_columns['postal_code']
)

# Get the name of the column to use for each component
street_and_house_number_col = list(set(df.columns) & set(address_columns['street_and_house_number']))[0]
city_col = list(set(df.columns) & set(address_columns['city']))[0]
state_abbreviation_col = list(set(df.columns) & set(address_columns['state_abbreviation']))[0]
postal_code_col = list(set(df.columns) & set(address_columns['postal_code']))[0]

# Select the first column that exists in the DataFrame
for column, column_names in address_columns.items():
    match = df.columns[df.columns.isin(column_names)].tolist()
    if match:
        address_columns[column] = match[0]

if not any(address_column_mask):
    raise ValueError('None of the specified address columns were found in the DataFrame')

# Concatenate the address components into a single address string
df['address'] = (
    df[address_columns['street_and_house_number']].astype(str) + ', ' +
    df[address_columns['city']].astype(str) + ', ' +
    df[address_columns['state_abbreviation']].astype(str) + ' ' +
    df[address_columns['postal_code']].astype(str)
)

In [130]:
# Flag duplicates in the address column
df['address_duplicate'] = df['address'].duplicated()
    
# Count the number of duplicates
num_address_duplicates = df['address_duplicate'].sum()

# Calculate the percentage of duplicates
percent_address_duplicates = num_address_duplicates / df.shape[0] * 100
    
# Repeat the same steps for the latitude and longitude columns
df['lat_long'] = df[address_columns['latitude']].astype(str) + ', ' + df[address_columns['longitude']].astype(str)
df['lat_long_duplicate'] = df['lat_long'].duplicated()
num_lat_long_duplicates = df['lat_long_duplicate'].sum()
percent_lat_long_duplicates = num_lat_long_duplicates / df.shape[0] * 100

In [131]:
# Print the results
print('Number of duplicate addresses:', num_address_duplicates)
print('Percentage of duplicate addresses:', percent_address_duplicates)
print('Number of duplicate lat/long pairs:', num_lat_long_duplicates)
print('Percentage of duplicate lat/long pairs:', percent_lat_long_duplicates)

Number of duplicate addresses: 115
Percentage of duplicate addresses: 0.11499999999999999
Number of duplicate lat/long pairs: 1221
Percentage of duplicate lat/long pairs: 1.221


In [132]:
# Define the possible names of the effective date columns
effective_date_columns = ['Effective Date', 'EffDate', 'eff_date']

# Check if any of the effective date column names are in the DataFrame
effective_date_column_mask = df.columns.isin(effective_date_columns)

# Get the name of the column to use for the effective date
effective_date_col = list(set(df.columns) & set(effective_date_columns))[0]

# Select the first column that exists in the DataFrame
for column, column_names in {'effective_date': effective_date_columns}.items():
    match = df.columns[df.columns.isin(column_names)].tolist()
    if match:
        effective_date_col = match[0]

if not any(effective_date_column_mask):
    raise ValueError('None of the specified effective date columns were found in the DataFrame')

In [133]:
# Convert the effective date column to datetime format
df[effective_date_col] = pd.to_datetime(df[effective_date_col], errors='coerce')

# Find the index of the effective date column
col_index = df.columns.get_loc(effective_date_col)

# Create a new output flag column with values True if the effective date is prior to 2020, False otherwise
df.insert(loc=col_index + 1, column='effective_date_prior_to_2020', value=df[effective_date_col] < '2020-01-01')

# Calculate the percentage of records with the output flag set to True
percent_records = df['effective_date_prior_to_2020'].mean() * 1

print(f"Percentage of records with effective date prior to 2020: {percent_records}")

# Count the number of records with effective date prior to 2020
count_records = df[df['effective_date_prior_to_2020'] == True].shape[0]

print(f"Count of records with effective date prior to 2020: {count_records}")


Percentage of records with effective date prior to 2020: 0.83756
Count of records with effective date prior to 2020: 83756


In [134]:
# Create a dataframe with the results
duplicate_results = {'Duplicate Addresses': [num_address_duplicates],
                     'Percentage of Duplicate Addresses': percent_address_duplicates / 100,
                     'Duplicate Lat/Long Pairs': [num_lat_long_duplicates],
                     'Percentage of Duplicate Lat/Long Pairs': percent_lat_long_duplicates / 100,
                     'Percentage Before 2020': percent_records,
                     'Count Before 2020': count_records}

duplicate_results_df = pd.DataFrame(duplicate_results)

# Create a file dialog to allow the user to select the save location
root = tk.Tk()
root.withdraw()
default_file_name = file_name_without_extension + "_duplicate_QA_Check.xlsx"
file_path = filedialog.asksaveasfilename(defaultextension='.xlsx', initialfile=default_file_name)


# Write the original data to a new Excel file
df.to_excel(file_path, engine='openpyxl', index=False)

# Append the results to the same Excel file
book = openpyxl.load_workbook(file_path)
writer = pd.ExcelWriter(file_path, engine='openpyxl')
writer.book = book
duplicate_results_df.to_excel(writer, sheet_name='Duplicate Results Report', index=False)
writer.save()
writer.close()
print("Excel write task completed.")


Excel write task completed.
