David Flowers II \
Z1942130 \
Assignment 7 \
Koop FA2024

# 1. Download & Extract Files

In [2]:
import requests
import os
import numpy as np
import pandas as pd

from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from zipfile import ZipFile

# Base URL that each archive name is appended to when downloading
base_url = "https://faculty.cs.niu.edu/~dakoop/cs503-2024fa/a7/"

# Decade labels "1970", "1980", ..., "2020" and the archive name for each one
decades = [str(year) for year in range(1970, 2021, 10)]
files_to_download = ["unemp-" + decade + ".zip" for decade in decades]

# Helper function to download files
def download_url(url, filename, timeout=30, chunk_size=1 << 16):
    """Download ``url`` to ``filename`` unless that file already exists.

    The body is streamed to ``<filename>.part`` and only moved to
    ``filename`` once the transfer has finished. An interrupted download
    therefore never leaves a truncated file behind that the "already exists"
    check would mistake for a complete archive on the next run.

    Parameters
    ----------
    url : str
        Address of the file to fetch.
    filename : str or path-like
        Local destination path.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its ``timeout``.
    chunk_size : int, optional
        Number of bytes requested per ``iter_content`` chunk.

    Raises
    ------
    Exception
        If the server answers with a non-OK status; the message contains
        the status code. Errors raised by ``requests`` or by the file
        system propagate unchanged.
    """
    # If the zip file already exists, don't get it again
    if os.path.isfile(filename):
        return

    partial_name = f"{filename}.part"

    # The context manager closes the response (and its connection) even when
    # an error is raised below.
    with requests.get(url, stream=True, timeout=timeout) as response:

        # If we don't get OK, raise exception that reports the status code
        if not response.ok:
            raise Exception(f"File could not be downloaded: {response.status_code}")

        try:
            with open(partial_name, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
            # Promote the finished download to its final name
            os.replace(partial_name, filename)
        except BaseException:
            # Also covers KeyboardInterrupt (kernel interrupt): drop the
            # incomplete file, then let the original error propagate.
            try:
                os.remove(partial_name)
            except FileNotFoundError:
                pass
            raise

# Extraction helper function
def extract_file(file_name, dest="."):
    """Extract every member of the zip archive ``file_name`` into ``dest``.

    Parameters
    ----------
    file_name : str or path-like
        Path of the zip archive to open.
    dest : str or path-like, optional
        Directory to extract into; defaults to the current directory.
    """
    # Open the archive named by the argument, not a module-level variable
    with ZipFile(file_name, "r") as z:
        z.extractall(dest)

# Loop through files to download: fetch each archive (download_url returns
# early when the file is already on disk) and then unpack it.
# NOTE(review): the loop variable `file` stays bound at module level after
# the loop; check that no other cell reads it before renaming it.
for file in files_to_download:
    download_url(base_url + file, file)
    extract_file(file)

# 2. Find Matching Files (10 pts)

In [3]:
searching_ext = {".npy", ".csv"}


def _in_override_dir(candidate):
    """Return True when any component of the path is named 'update' or 'mod'."""
    return not {'update', 'mod'}.isdisjoint(candidate.parts)


# Every file below data/ whose suffix is one we handle
data_files = [
    candidate
    for candidate in Path("data").rglob('*')
    if candidate.suffix in searching_ext
]

# Partition into files under an update/mod directory and everything else
update_files = list(filter(_in_override_dir, data_files))
regular_files = [candidate for candidate in data_files if not _in_override_dir(candidate)]

# A regular file is dropped when an update file carries the same file name
update_names = set(candidate.name for candidate in update_files)
regular_files = [candidate for candidate in regular_files if candidate.name not in update_names]

# Order by file name so runs are easy to compare while debugging
complete_file_set = sorted(regular_files + update_files, key=lambda candidate: candidate.name)

# 3. Structural Pattern Matching to Process a File (20 pts)

In [4]:
# Counties we need; each entry is the county's name followed by " COUNTY"
needed_counties = [
    f"{county_name} COUNTY"
    for county_name in (
        "DEKALB",
        "KANE",
        "BOONE",
        "MCHENRY",
        "WINNEBAGO",
        "OGLE",
        "LEE",
        "KENDALL",
    )
]

def process_data_file(path):
    # Temp holder for read in data
    temp_data = {}
    
    # Match filetype and update status
    match path.parts:
        case [_, _, ("update" | "mod"), *_, ext] if ext.endswith('.npy'):
            temp_data =  pd.DataFrame(np.load(path))
        case [_, _, ("update" | "mod"), *_, ext] if ext.endswith('.csv'):
            temp_data = pd.DataFrame(pd.read_csv(path))
        case [*_, ext] if ext.endswith('.npy'):
            temp_data = pd.DataFrame(np.load(path))
        case [*_, ext] if ext.endswith('.csv'):
            temp_data = pd.DataFrame(pd.read_csv(path))

    # Convert all COUNTY to upper
    temp_data["COUNTY"] = temp_data["COUNTY"].str.upper()
    
    # Filter the columns we need
    column_filter = temp_data[["COUNTY","YEAR","LABOR_FORCE","EMPLOYED", "UNEMPLOYED_NUMBER"]]

    # Filter the counties we need
    county_filter = column_filter[column_filter['COUNTY'].isin(needed_counties)]
    
    # Add the CALC_RATE column
    result = county_filter.copy()
    result["CALC_RATE"] = (result["UNEMPLOYED_NUMBER"] / result["LABOR_FORCE"]) * 100

    # Return result
    return result

# 4. Use Threads to Process Files (30 pts)

In [11]:
# Process all files on a thread pool; executor.map yields the results in the
# same order as complete_file_set.
with ThreadPoolExecutor() as executor:
    all_dataframes = list(executor.map(process_data_file, complete_file_set))

display(all_dataframes[0])
concatenated_data = pd.concat(all_dataframes)

# Write one gzip-compressed CSV per county, named after the county without
# its " COUNTY" suffix (e.g. DEKALB.csv.gz).
for county in needed_counties:
    county_data = concatenated_data[concatenated_data["COUNTY"] == county].copy()
    # Build the name outside the f-string: reusing the f-string's own quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    county_name = county.replace(" COUNTY", "").strip()
    output_file_name = f"{county_name}.csv.gz"
    county_data.to_csv(output_file_name, compression="gzip")

Unnamed: 0,COUNTY,YEAR,LABOR_FORCE,EMPLOYED,UNEMPLOYED_NUMBER,CALC_RATE
3,BOONE COUNTY,1974,12340,11864,476,3.857374
18,DEKALB COUNTY,1974,32217,30920,1297,4.025825
44,KANE COUNTY,1974,116543,112739,3804,3.264031
46,KENDALL COUNTY,1974,8802,8614,187,2.124517
51,LEE COUNTY,1974,11436,10855,581,5.080448
55,MCHENRY COUNTY,1974,50277,48557,1719,3.419058
70,OGLE COUNTY,1974,14047,13656,390,2.776394
100,WINNEBAGO COUNTY,1974,116416,110901,5514,4.736462


# Check Results

In [6]:
counties_no_suffix = ['DEKALB', 'KANE', 'BOONE', 'MCHENRY', 'WINNEBAGO', 'OGLE', 'LEE', 'KENDALL']

# Read each county's exported file back, in alphabetical order, and report
# the mean of its CALC_RATE column.
for c in sorted(counties_no_suffix):
    cdf = pd.read_csv(c + '.csv.gz')
    mean_rate = cdf["CALC_RATE"].mean()
    print(c, mean_rate)

BOONE 8.054294578004583
DEKALB 5.721435538644467
KANE 6.40548752171531
KENDALL 5.457945584277057
LEE 6.260036908826006
MCHENRY 5.87830091101972
OGLE 6.60481888833855
WINNEBAGO 7.695749477329302
