In [1]:
import os, json
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Parse a date string in MM/DD/YY form; returns a datetime, or None when it cannot be parsed
def validate_and_convert_date(date_str):
    """Convert an 'MM/DD/YY' string to a ``datetime``.

    Parameters
    ----------
    date_str : object
        Value to parse. Anything that is not a ``str`` (None, NaN, numbers, ...)
        is treated as missing instead of raising.

    Returns
    -------
    datetime or None
        The parsed date, or None when the value is not a string, is empty or
        whitespace-only, or does not match the '%m/%d/%y' format.

    Notes
    -----
    '%y' is a two-digit year: Python maps 69-99 to 1969-1999 and 00-68 to
    2000-2068, which matters for dates of birth before 1969.
    """
    # Non-string input (e.g. None or float NaN from pandas) has no .strip(); treat it as missing
    if not isinstance(date_str, str):
        return None

    cleaned = date_str.strip()
    if not cleaned:
        return None

    try:
        # Parse the stripped text so surrounding whitespace does not invalidate a good date
        return datetime.strptime(cleaned, '%m/%d/%y')
    except ValueError:
        return None  # Text that is not a valid MM/DD/YY date

In [3]:
def join_unique(x):
    """Join the distinct, non-empty values of ``x`` with ', '.

    Each value is stripped of surrounding whitespace first; values that are
    empty after stripping are skipped. Duplicates are removed while keeping
    the order of first appearance, so the result is the same on every run
    (iterating a ``set`` of strings is not, because of hash randomization).

    Parameters
    ----------
    x : iterable of str
        Values to combine (e.g. the values of one column within a group).

    Returns
    -------
    str
        The joined text, or '' when nothing remains.
    """
    # dict.fromkeys de-duplicates like a set but preserves insertion order
    return ', '.join(dict.fromkeys(filter(None, map(str.strip, x))))

def join_unique_address(x):
    """Join the distinct, non-empty values of ``x`` with '/ '.

    Each value is stripped of surrounding whitespace first; values that are
    empty after stripping are skipped. Duplicates are removed while keeping
    the order of first appearance, so the result is the same on every run
    (iterating a ``set`` of strings is not, because of hash randomization).

    Parameters
    ----------
    x : iterable of str
        Values to combine (e.g. the address values within a group).

    Returns
    -------
    str
        The joined text, or '' when nothing remains.
    """
    # dict.fromkeys de-duplicates like a set but preserves insertion order
    return '/ '.join(dict.fromkeys(filter(None, map(str.strip, x))))

In [4]:
# File paths
# NOTE(review): these are absolute, machine-specific locations - adjust them when running elsewhere.
input_folder_path = 'C:/Files for SHB4 - Copy/Resources/'
final_output_file_path = 'C:/Files for SHB4 - Copy/final_df1.csv'
levels_json_file = 'C:/Files for SHB4 - Copy/levels_Revised.json'
# os.listdir returns entries in arbitrary order; sort them so the input files
# are always processed in the same order from one run to the next
files = sorted(os.listdir(input_folder_path))

In [5]:
# Columns that must be read as text so pandas does not infer a numeric type for them
# (which would drop leading zeros from identifiers such as ZIP codes).
dtype_dict = {
    'Associated Docs (People)': str,
    'People Tracker Control Number': str,
    'First Name or Initial': str,
    'Middle Name or Initial': str,
    'Last Name': str,
    'City': str,
    'State': str,
    'ZIP Code': str,
    'Date of Birth': str,
    'Financial Account Number': str,
    # Compared against '' and used as a sort key later on; reading it as text keeps
    # leading zeros and gives the column one consistent type across all input files
    'Social Security Number': str,
    # Cleaned with the .str accessor later on, which requires text values
    'Address': str,
}

In [6]:
# Accumulator for the per-file DataFrames (starts out empty)
dfs = list()


In [None]:
# Read every CSV file of the input folder into its own DataFrame and collect them in dfs.
# The list is reset here so that re-running this cell does not load each file a second time.
dfs = []
for file in files:
    if file.endswith('.csv'):  # Skip anything that is not a CSV file
        file_path = os.path.join(input_folder_path, file)  # Full path of the current file
        # dtype_dict forces the listed columns to be read as text; files are decoded as latin1
        df = pd.read_csv(file_path, dtype=dtype_dict, encoding='latin1')
        dfs.append(df)

In [8]:
# Stack the per-file DataFrames on top of each other and renumber the rows from 0
combined_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [9]:
# Discard rows that repeat an earlier row in every column (the first occurrence stays)
combined_df = combined_df.drop_duplicates(keep='first')

In [10]:
# Missing values in every non-numeric column become empty strings;
# numeric columns are left as they are
non_numeric_columns = combined_df.select_dtypes(exclude=['number']).columns
filled_non_numeric = combined_df[non_numeric_columns].fillna(value='')
combined_df[non_numeric_columns] = filled_non_numeric

In [11]:
# Keep only the rows that carry a 'Social Security Number' (neither NaN nor ''),
# then order the remaining rows by that column
ssn_values = combined_df['Social Security Number']
has_ssn = ssn_values.notna() & ssn_values.ne('')
combined_df = combined_df[has_ssn].sort_values(by='Social Security Number')

In [12]:
# Optional: to write 'Date of Birth' back as MM/DD/YYYY text, uncomment the line below.
# It relies on the .dt accessor, so the column must hold datetime values first (it is loaded as str).
# combined_df['Date of Birth'] = combined_df['Date of Birth'].dt.strftime('%m/%d/%Y')

In [None]:
# Validate the ZIP Code, State and City fields. A row is kept when each of the three
# is either empty or well-formed:
#   ZIP Code - exactly five digits
#   State    - exactly two upper-case letters
#   City     - letters and whitespace only
valid_zip_mask = combined_df['ZIP Code'].str.match(r'^\d{5}$') | (combined_df['ZIP Code'] == '')
valid_state_mask = combined_df['State'].str.match(r'^[A-Z]{2}$') | (combined_df['State'] == '')
valid_city_mask = combined_df['City'].str.match(r'^[a-zA-Z\s]+$') | (combined_df['City'] == '')
# Take an explicit copy: the boolean selection is otherwise a slice of combined_df, and
# assigning a column on it raises SettingWithCopyWarning (the write may not stick)
filtered_df = combined_df[valid_zip_mask & valid_state_mask & valid_city_mask].copy()
# Trim surrounding whitespace from the addresses
filtered_df['Address'] = filtered_df['Address'].str.strip()

In [14]:
# Replace every missing value (NaN/None/NaT) in the frame with an empty string.
# NOTE(review): the later aggregation cell groups filtered_df, not df_sorted - confirm
# which of the two frames is meant to be aggregated.
def _blank_if_missing(value):
    """Return '' for a missing value and the value itself otherwise."""
    return '' if pd.isna(value) else value


df_sorted = filtered_df.applymap(_blank_if_missing)

In [15]:
# Read the per-level grouping configuration from the JSON file
with open(levels_json_file, mode='r') as file:
    levels_data = json.loads(file.read())


In [16]:
# The JSON file can only name the custom aggregators as strings. Swap those names for
# the actual functions so they can be passed to .agg(); every other entry is left as is.
custom_aggregators = {
    'join_unique': join_unique,
    'join_unique_address': join_unique_address,
}
for level_data in levels_data.values():
    aggregation_functions = level_data['aggregation_functions']
    for col, func in aggregation_functions.items():
        # Only a plain string can name a custom aggregator
        replacement = custom_aggregators.get(func) if isinstance(func, str) else None
        if replacement is not None:
            aggregation_functions[col] = replacement

In [None]:
# Remove the 'Date of Birth' column from the aggregated frame.
# NOTE(review): df_grouped is first assigned by the aggregation loop in a later cell, so on a
# fresh kernel (Restart & Run All) this cell used to stop the run with a NameError. It now
# reports the problem instead; move this cell below the aggregation cell for the drop to
# take effect in a top-to-bottom run.
try:
    # errors='ignore' keeps the cell re-runnable once the column is already gone
    df_grouped = df_grouped.drop(columns=['Date of Birth'], errors='ignore')
except NameError:
    print("df_grouped is not defined yet - run the aggregation cell first, then re-run this cell.")


In [None]:
# Run the configured group-by/aggregation once per level.
# NOTE(review): df_grouped is rebound on every pass, so after the loop only the result of
# the last level in levels_data is left - confirm that earlier levels may be discarded.
for level_name, level_data in levels_data.items():
    columns, aggregation_functions = level_data['columns'], level_data['aggregation_functions']
    grouped = filtered_df.groupby(columns)
    # Group keys go back to ordinary columns via reset_index
    df_grouped = grouped.agg(aggregation_functions).reset_index()
    print(f"Grouping by {level_name} with columns - {columns} completed.")

In [None]:
# Display the aggregated frame. The aggregation loop rebinds df_grouped on every pass,
# so this shows the result for the last level in levels_data.
df_grouped

In [None]:
# Save the aggregated frame as a CSV file, leaving out the row index
df_grouped.to_csv(path_or_buf=final_output_file_path, index=False)