In [2]:
import pandas as pd


In [5]:
import os

# Project root = parent directory of the current working directory.
# (Presumes the kernel was started inside the notebooks folder — verify if run elsewhere.)
BASE_PATH = os.path.split(os.getcwd())[0]

def get_file_path(*paths):
    """Build a path below ``BASE_PATH`` from the given components.

    The components are joined onto ``BASE_PATH`` with ``os.path.join``; the
    joined path is printed and then returned. The function does not check
    whether the path exists.

    Args:
        *paths: Path components appended to ``BASE_PATH`` in the order given.

    Returns:
        The joined path.
    """
    components = (BASE_PATH,) + paths
    resolved = os.path.join(*components)
    print(f"Attempting to access: {resolved}")
    return resolved

# Show the resolved project root and what it contains, as a quick sanity check.
print(f"Project root directory (BASE_PATH): {BASE_PATH}")
root_entries = os.listdir(BASE_PATH)
print(f"Contents of project root: {root_entries}")

Project root directory (BASE_PATH): /Users/nathanstorey/Downloads/Agency-Name-Project-main
Contents of project root: ['.DS_Store', 'requirements.txt', 'docs', 'TODO.md', 'README.md', '.gitignore', '.venv', '.git', 'data', 'notebooks', 'src']


In [7]:
# Read the processed agency export, then print its column names, dtypes and first rows.
source_csv = get_file_path('data', 'processed', 'nyc_agencies_export.csv')
df = pd.read_csv(source_csv)

for label, value in (
    ("Existing fields:", df.columns.tolist()),
    ("Data types:\n", df.dtypes),
    ("Sample values:\n", df.head()),
):
    print(label, value)

Attempting to access: /Users/nathanstorey/Downloads/Agency-Name-Project-main/data/processed/nyc_agencies_export.csv
Existing fields: ['Name', 'NameAlphabetized', 'OperationalStatus', 'PreliminaryOrganizationType', 'Description', 'URL', 'ParentOrganization', 'NYCReportingLine', 'AuthorizingAuthority', 'LegalCitation', 'LegalCitationURL', 'LegalCitationText', 'LegalName', 'AlternateNames', 'Acronym', 'AlternateAcronyms', 'BudgetCode', 'PrincipalOfficerName', 'PrincipalOfficerTitle', 'OpenDatasetsURL', 'Notes', 'FoundingYear', 'SunsetYear', 'URISlug', 'DateCreated', 'DateModified', 'LastVerifiedDate', 'Name - NYC.gov Agency List', "Name - NYC.gov Mayor's Office", 'Name - NYC Open Data Portal', 'Name - ODA', 'Name - CPO', 'Name - WeGov', 'Name - Greenbook', 'Name - Checkbook', 'NameWithAcronym', 'NameAlphabetizedWithAcronym', 'PrincipalOfficerGivenName', 'PrincipalOfficerFamilyName']
Data types:
 Name                              object
NameAlphabetized                  object
OperationalS

In [8]:

# 1. Update Field Names (Terminology Transition)
# Every column whose label contains 'Agency' is renamed so that each occurrence
# of 'Agency' in the label reads 'Organization'. The frame is modified in place.
agency_columns = list(filter(lambda label: 'Agency' in label, df.columns))
rename_mapping = {}
for old_label in agency_columns:
    rename_mapping[old_label] = old_label.replace('Agency', 'Organization')
df.rename(columns=rename_mapping, inplace=True)
print("Updated column names:", df.columns.tolist())

# 2. Adjust Data Types
# Fields to normalise. The list is kept unchanged so any later cell that reads
# `date_fields` still sees the same four names.
date_fields = ['DateCreated', 'DateModified', 'FoundingYear', 'SunsetYear']

# Subset of date_fields treated as bare calendar years rather than full dates
# (assumed from the column names — TODO confirm against the data).
# pd.to_datetime interprets numeric input as an offset from the Unix epoch
# (nanoseconds by default), so a numeric year such as 1898 would come out as a
# timestamp on 1970-01-01. datetime64[ns] also cannot represent dates before
# 1677-09-21. Years are therefore stored as nullable integers instead.
year_fields = ['FoundingYear', 'SunsetYear']

# NOTE(review): 'LastVerifiedDate' is present in the export but is not converted
# here — confirm whether it should be parsed as a date as well.
for field in date_fields:
    if field not in df.columns:
        continue
    if field in year_fields:
        # Unparseable entries become NaN; non-integral values are blanked as
        # well so the cast to the nullable Int64 dtype cannot raise.
        years = pd.to_numeric(df[field], errors='coerce')
        years = years.where(years % 1 == 0)
        df[field] = years.astype('Int64')
    else:
        # Unparseable entries become NaT instead of raising.
        df[field] = pd.to_datetime(df[field], errors='coerce')
print("Data types after conversion:\n", df.dtypes)

# 3. Remove Obsolete or Redundant Fields
# Candidate columns to drop; these names are placeholders to be replaced with
# the actual obsolete field names as needed.
obsolete_fields = ['Description_y', 'Contact Name_y']

# Keep only the candidates that are actually present so the drop cannot fail.
existing_obsolete_fields = []
for candidate in obsolete_fields:
    if candidate in df.columns:
        existing_obsolete_fields.append(candidate)

if not existing_obsolete_fields:
    print("No obsolete fields found to remove.")
else:
    df.drop(columns=existing_obsolete_fields, inplace=True)
    print("Remaining columns after removal:", df.columns.tolist())

# 4. Ensure Consistency in Key Fields
# Standardize the 'Name' field: cast to str and strip surrounding whitespace.
if 'Name' in df.columns:
    raw_names = df['Name']
    # Only non-null entries are cast and stripped. Casting the whole column with
    # astype(str) would turn missing names into the literal text 'nan', which
    # would then be written to the output CSV as if it were a real name.
    df['Name'] = raw_names.where(raw_names.isna(), raw_names.astype(str).str.strip())
    # Optionally, apply additional normalization functions here
    print("Standardized 'Name' field.")

# 5. Update the Data Dictionary
# One row per column of df: Field Name, Data Type, Description, Example Values.

# Up to three distinct non-null values per column, in order of first appearance.
# Built with an explicit loop rather than df.apply: when every column yields a
# list of the same length, df.apply assembles the lists into a DataFrame instead
# of a Series of lists, which breaks the construction below.
example_values = [
    column_values.dropna().drop_duplicates().head(3).tolist()
    for _, column_values in df.items()
]

data_dictionary = pd.DataFrame({
    'Field Name': df.columns.tolist(),
    'Data Type': df.dtypes.values,
    'Description': '',  # To be filled in manually later
    'Example Values': example_values,
})

# Save the data dictionary to a CSV file, creating the target folder if needed
# so the write does not fail on a checkout that lacks it.
data_dictionary_path = get_file_path('docs', 'data_dictionary.csv')
os.makedirs(os.path.dirname(data_dictionary_path), exist_ok=True)
data_dictionary.to_csv(data_dictionary_path, index=False)
print("Data dictionary saved to 'docs/data_dictionary.csv'.")

# 6. Save the Updated DataFrame
# Write df, without its index, to a new CSV under data/processed and report the path.
output_path = get_file_path('data', 'processed', 'nyc_organizations_updated.csv')
df.to_csv(path_or_buf=output_path, index=False)
print(f"Updated DataFrame saved to '{output_path}'.")




Updated column names: ['Name', 'NameAlphabetized', 'OperationalStatus', 'PreliminaryOrganizationType', 'Description', 'URL', 'ParentOrganization', 'NYCReportingLine', 'AuthorizingAuthority', 'LegalCitation', 'LegalCitationURL', 'LegalCitationText', 'LegalName', 'AlternateNames', 'Acronym', 'AlternateAcronyms', 'BudgetCode', 'PrincipalOfficerName', 'PrincipalOfficerTitle', 'OpenDatasetsURL', 'Notes', 'FoundingYear', 'SunsetYear', 'URISlug', 'DateCreated', 'DateModified', 'LastVerifiedDate', 'Name - NYC.gov Organization List', "Name - NYC.gov Mayor's Office", 'Name - NYC Open Data Portal', 'Name - ODA', 'Name - CPO', 'Name - WeGov', 'Name - Greenbook', 'Name - Checkbook', 'NameWithAcronym', 'NameAlphabetizedWithAcronym', 'PrincipalOfficerGivenName', 'PrincipalOfficerFamilyName']
Data types after conversion:
 Name                                        object
NameAlphabetized                            object
OperationalStatus                           object
PreliminaryOrganizationType  