In [3]:
import os
import xml.etree.ElementTree as ET
import pandas as pd

In [4]:
DataDirectory = "../Opportunity-Hack-KSJ/Data"

In [5]:
# Initialize a list to store the paths of XML files with 990PF records
xmlPaths990PF = []

# Loop through the folders and XML files
for folderName in os.listdir(DataDirectory):
    folderPath = os.path.join(DataDirectory, folderName)
    if os.path.isdir(folderPath):
        for xmlName in os.listdir(folderPath):
            if xmlName.endswith(".xml"):
                namespace = {'ns': 'http://www.irs.gov/efile'}
                xmlPath = os.path.join(folderPath, xmlName)
                # print(xmlPath)
                tree = ET.parse(xmlPath)
                root = tree.getroot()

                # Find Form Type and include only 990PF records
                returnTypeElem = root.find(".//ns:ReturnTypeCd", namespaces=namespace)
                if returnTypeElem is not None:
                    formType = returnTypeElem.text
                    if formType.lower() == "990pf":
                        # Add the path to the list for 990PF records
                        xmlPaths990PF.append(xmlPath)

# Print the paths of XML files containing 990PF records
for path in xmlPaths990PF:
    print(path)


/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202331589349100523_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202301589349100215_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202311519349101256_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202321679349100612_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202331659349101228_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202311519349101406_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202311639349100711_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202301709349100100_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202341609349100404_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202321649349101112_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202321509349100552_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202321529349100512_public.xml
/home/dark/GitHub/Opportunity-Hack-KSJ/Data/6A/202331669349100913_public.xml

In [6]:
import xml.etree.ElementTree as ET
import pandas as pd

# Define lists to store data for each column
ein_values = []
form_and_info_and_materials = []
recipient_person_names = []
submission_deadlines = []
restrictions_on_awards = []
only_contri_to_preselected = []
current_year_grant = []  # New column

n = 5000  # Number of records to extract
records_processed = 0  # Counter for records processed

# Loop through the paths of XML files containing 990PF records
for xmlPath in xmlPaths990PF:
    namespace = {'ns': 'http://www.irs.gov/efile'}
    
    try:
        # Parse the XML file
        tree = ET.parse(xmlPath)
        root = tree.getroot()
        
        # Find the <ApplicationSubmissionInfoGrp> element
        app_submission_info_elem = root.find(".//ns:ApplicationSubmissionInfoGrp", namespaces=namespace)
        
        if app_submission_info_elem is not None:
            records_processed += 1
            
            # Extract the EIN value
            einElem = root.find(".//ns:EIN", namespaces=namespace)
            einVal = einElem.text.strip() if einElem is not None else 'N/A'
            ein_values.append(einVal)
            # print(ein_values)
            
            # Extract values for columns within <ApplicationSubmissionInfoGrp>
            form_info_materials_elem = app_submission_info_elem.find(".//ns:FormAndInfoAndMaterialsTxt", namespaces=namespace)
            recipient_person_nm_elem = app_submission_info_elem.find(".//ns:RecipientPersonNm", namespaces=namespace)
            submission_deadlines_elem = app_submission_info_elem.find(".//ns:SubmissionDeadlinesTxt", namespaces=namespace)
            restrictions_on_awards_elem = app_submission_info_elem.find(".//ns:RestrictionsOnAwardsTxt", namespaces=namespace)
            
            # Extract the <OnlyContriToPreselectedInd> element
            only_contri_elem = app_submission_info_elem.find(".//ns:OnlyContriToPreselectedInd", namespaces=namespace)
            only_contri_value = 'yes' if only_contri_elem is not None and only_contri_elem.text.strip() == 'X' else 'no'
            only_contri_to_preselected.append(only_contri_value)
            
            # Extract the <CYContributionsGrantsAmt> element
            cy_contributions_elem = app_submission_info_elem.find(".//ns:CYContributionsGrantsAmt", namespaces=namespace)
            cy_contributions_value = cy_contributions_elem.text.strip() if cy_contributions_elem is not None else 'N/A'
            current_year_grant.append(cy_contributions_value)
                
            # Append values to respective lists with error handling
            form_info_materials = form_info_materials_elem.text.strip() if form_info_materials_elem is not None else 'N/A'
            recipient_person_nm = recipient_person_nm_elem.text.strip() if recipient_person_nm_elem is not None else ''
            submission_deadlines_txt = submission_deadlines_elem.text.strip() if submission_deadlines_elem is not None else ''
            restrictions_on_awards_txt = restrictions_on_awards_elem.text.strip() if restrictions_on_awards_elem is not None else ''
            
            form_and_info_and_materials.append(form_info_materials)
            recipient_person_names.append(recipient_person_nm)
            submission_deadlines.append(submission_deadlines_txt)
            restrictions_on_awards.append(restrictions_on_awards_txt)
        
        # Break the loop if you have extracted n records
        if records_processed >= n:
            break
    
    except Exception as e:
        print(f"Error processing {xmlPath}: {str(e)}")

# Ensure all lists have the same length
if all(len(lst) == records_processed for lst in [ein_values, form_and_info_and_materials, recipient_person_names, submission_deadlines, restrictions_on_awards, only_contri_to_preselected, current_year_grant]):
    # Create a pandas DataFrame from the extracted data
    df = pd.DataFrame({
        'EIN_value': ein_values,
        'FormAndInfoAndMaterialsTxt': form_and_info_and_materials,
        'RecipientPersonNm': recipient_person_names,
        'SubmissionDeadlinesTxt': submission_deadlines,
        'RestrictionsOnAwardsTxt': restrictions_on_awards,
        'OnlyContriToPreselectedInd': only_contri_to_preselected,
        'CurrentYearGrantAmount': current_year_grant  # New column
    })

    # Define the path for your CSV file
    csv_file_path = 'extracted_data.csv'

    # Save the DataFrame to a CSV file
    df.to_csv(csv_file_path, index=False)

    print(f'Data extracted and saved to {csv_file_path}')
else:
    print("Lists have different lengths. Please check your data extraction logic.")


Data extracted and saved to extracted_data.csv


In [12]:
df3 = pd.DataFrame()
df3['EIN'] = df['EIN_value']  # Select the column you want to rename

In [13]:
df3.head()

Unnamed: 0,EIN
0,561735521
1,271586209
2,261175028
3,10793830
4,730768139
