In [2]:
import zipfile_deflate64
import zipfile
import os
import configparser
from bs4 import BeautifulSoup
import requests
from snowflake.snowpark import Session
import xmltodict
import json
import os
from tqdm import tqdm
from xml.parsers.expat import ExpatError

def irs_scrape (url):
    data_folder = '../data/irs_data'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    zip_links = soup.find_all('a', href=lambda href: href and href.endswith('.zip'))
    zip_urls = [link['href'] for link in zip_links if "2024" in link['href']]

    # Clear existing files in the data folder
    if os.path.exists(data_folder):
        for filename in os.listdir(data_folder):
            file_path = os.path.join(data_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
    else:
        os.makedirs(data_folder, exist_ok=True)
    
    print("Old files deleted.")

    # Download and unzip each file
    for file in zip_urls:
        response = requests.get(file)
        
        # Download the zip file
        zip_filename = os.path.join(data_folder, file.split("/")[-1])
            
        # Save the zip file to the "data" folder
        with open(zip_filename, "wb") as file:
            file.write(response.content)
        
        print(zip_filename,'Downloaded')
            
        # Extract the contents of the zip file to the "data" folder
        try:
            with zipfile.ZipFile(zip_filename, "r") as zip_ref:
                zip_ref.extractall(data_folder)
        except:
            with zipfile_deflate64.ZipFile(zip_filename, "r") as zip_ref:
                zip_ref.extractall(data_folder)

        print(zip_filename,'Unzipped')
            
        # Remove the zip file after extraction
        os.remove(zip_filename)

        print(zip_filename,'Deleted')

    print("Download and extraction complete.")

#irs_scrape('https://www.irs.gov/charities-non-profits/form-990-series-downloads')

In [None]:
path = '../data/irs_data'
files = os.listdir(path)
file_list = [f'{path}/{x}' for x in files]
json_dir = '../data/irs_data_json/final_output_2023.json'

# list of dictionaries containing the data from each XML file
data_list = []

# loop through the XML files
for xml_file in tqdm(file_list):
    try:
        with open(xml_file, "r", encoding="utf-8", errors="ignore") as f:
            # convert the XML to a dictionary
            data = xmltodict.parse(f.read())
            data_list.append(data)
    except ExpatError as e:
        # Handle the parsing error (invalid XML)
        print(f"Skipping {xml_file} due to parsing error: {e}")

# save the list of dictionaries to a JSON file
with open(json_dir, 'w') as json_file:
    json.dump(data_list, json_file)

In [3]:
config = configparser.ConfigParser()
config_path = os.path.join("..", "credentials.ini")
config.read(config_path)

# Create a Snowflake session
session = Session.builder.configs({
    'account':config["bdw_snowflake"]["ACCOUNT"],
    'user':config["bdw_snowflake"]["USER"],
    'password':config["bdw_snowflake"]["PASSWORD"],
    'role':config["bdw_snowflake"]["ROLE"],
    'warehouse':config["bdw_snowflake"]["WAREHOUSE"],
    'database': 'BDW_RAW',
    'schema': 'IRS_DATA'
}).create()

#session.file.put('../data/irs_data_json/final_output_2023.json', "@irs_data_staging/new_irs_filings", auto_compress=True)

In [None]:
existing_table = session.read.table("IRS_RETURNS_RAW")
full_2023_table = session.read.options({'STRIP_OUTER_ARRAY':True,'ON_ERROR':'CONTINUE'}).json("../data/irs_data_json/final_output_2023.json")

In [None]:
existing_table.show(5)

In [None]:
session.read.options({'STRIP_OUTER_ARRAY':True,'ON_ERROR':'CONTINUE'}).json("../data/irs_data_json/final_output_2023.json").show(5)

In [None]:
from snowflake.snowpark.functions import parse_json

existing_table.withColumn("parsed_json", parse_json('JSON_DATA')).show()

In [None]:
new_filings = session.read.options({'STRIP_OUTER_ARRAY':True,'ON_ERROR':'CONTINUE'}).json("@irs_data_staging/new_irs_filings")

new_filings.copy_into_table('IRS_RETURNS_RAW')

In [None]:
session.read.options({'STRIP_OUTER_ARRAY':True,'ON_ERROR':'CONTINUE'}).json("@irs_data_staging/new_irs_filings").count()

In [None]:
# Clear existing files in the data folder
data_folder = '../data/irs_data/'

if os.path.exists(data_folder):
        for filename in os.listdir(data_folder):
            file_path = os.path.join(data_folder, filename)
            if os.path.isfile(file_path):
                os.remove(file_path)
else:
        os.makedirs(data_folder, exist_ok=True)

print("Old files deleted.")

In [4]:
import pandas as pd

cols = [
    'EIN',
    'TAX_YEAR',
    'NAME',
    'TERMINATED',
    'GROSS_UNDER_50K',
    'YEAR_START',
    'YEAR_END',
    'WEBSITE',
    'PRINCIPLE_OFFICER',
    'POC_STREET_ADDRESS',
    'POC_STREET_ADDRESS_2',
    'POC_CITY',
    'POC_FOREIGN_CITY',
    'POC_STATE',
    'POC_ZIP_CODE',
    'POC_COUNTRY',
    'ORG_STREET_ADDRESS_2',
    'ORG_PO_BOX',
    'ORG_CITY',
    'ORG_FOREIGN_CITY',
    'ORG_STATE',
    'ORG_ZIP_CODE',
    'ORG_COUNTRY',
    'DBA_NAME'
    ]
n_df = pd.read_csv("/Users/jack.mccormick/Downloads/data-download-epostcard.txt", names=cols, usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23], sep='|', error_bad_lines=False)



  n_df = pd.read_csv("/Users/jack.mccormick/Downloads/data-download-epostcard.txt", names=cols, usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23], sep='|', error_bad_lines=False)
  n_df = pd.read_csv("/Users/jack.mccormick/Downloads/data-download-epostcard.txt", names=cols, usecols=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23], sep='|', error_bad_lines=False)


In [5]:
from snowflake.snowpark.types import StructType, StructField, StringType, IntegerType, ArrayType

schema = StructType([
StructField( 'EIN' , StringType()),
StructField( 'TAX_YEAR' , IntegerType()),
StructField( 'NAME' , StringType()),
StructField( 'TERMINATED' , StringType()),
StructField( 'GROSS_UNDER_50K' , StringType()),
StructField( 'YEAR_START' , StringType()),
StructField( 'YEAR_END' , StringType()),
StructField( 'WEBSITE' , StringType()),
StructField( 'PRINCIPLE_OFFICER' , StringType()),
StructField( 'POC_STREET_ADDRESS' , StringType()),
StructField( 'POC_STREET_ADDRESS_2' , StringType()),
StructField( 'POC_CITY' , StringType()),
StructField( 'POC_FOREIGN_CITY' , StringType()),
StructField( 'POC_STATE' , StringType()),
StructField( 'POC_ZIP_CODE' , StringType()),
StructField( 'POC_COUNTRY' , StringType()),
StructField( 'ORG_STREET_ADDRESS_2' , StringType()),
StructField( 'ORG_PO_BOX' , StringType()),
StructField( 'ORG_CITY' , StringType()),
StructField( 'ORG_FOREIGN_CITY' , StringType()),
StructField( 'ORG_STATE' , StringType()),
StructField( 'ORG_ZIP_CODE' , StringType()),
StructField( 'ORG_COUNTRY' , StringType()),
StructField( 'DBA_NAME' , StringType())
])

session.create_dataframe(n_df, schema=schema)

ArrowTypeError: ("Expected bytes, got a 'int' object", 'Conversion failed for column EIN with type object')