# Track 2: Sectoral & Geographical Trends

Main Question: Which industries and regions dominate? This track answers it with sector-based visualizations and county- or ZIP-level maps.


In [1]:
import pandas as pd
import os

## Data Cleaning

In [2]:
def merge_excel_by_year(input_folder, output_folder, file_suffix, min_non_null_ratio=0.4):
    """Merge the ``.xlsx`` files of a folder into one CSV file per year.

    The year is taken from the first four characters of each file name;
    files whose name does not start with four digits are skipped. Sparse
    columns are dropped from each file individually, a ``Year`` column is
    added, and all files sharing a year are concatenated and written to
    ``<output_folder>/<year><file_suffix>.csv``.

    Progress and problems are reported with ``print``; unreadable inputs and
    failed writes are skipped rather than raised.

    Args:
        input_folder: Directory containing the source ``.xlsx`` files.
        output_folder: Directory for the merged CSV files. It is created if
            missing, but only once at least one ``.xlsx`` file was found.
        file_suffix: Text placed between the year and ``.csv`` in each
            output file name.
        min_non_null_ratio: Minimum fraction of non-blank values a column
            needs (within its own file) to be kept. The default of 0.4
            drops columns that are more than 60% blank.

    Returns:
        None.
    """
    if not os.path.exists(input_folder):
        print(f"Input path does not exist: {input_folder}")
        return

    # os.listdir returns names in arbitrary order; sorting makes the row
    # order of the merged output reproducible from run to run.
    file_names = sorted(f for f in os.listdir(input_folder) if f.endswith('.xlsx'))

    if not file_names:
        print(f"No .xlsx files found in the input path: {input_folder}")
        return

    # Collect the frames per year and concatenate once at the end, instead
    # of re-copying the accumulated data for every additional file.
    frames_by_year = {}

    for file_name in file_names:
        year = file_name[:4]

        if not year.isdigit():
            print(f"Skipping file with invalid format: {file_name}")
            continue

        file_path = os.path.join(input_folder, file_name)
        try:
            df = pd.read_excel(file_path)
        except Exception as e:
            # Best effort: one unreadable workbook must not abort the batch.
            print(f"Failed to read file: {file_name}. Error: {e}")
            continue

        # `thresh` is the number of non-null values a column must have to
        # survive, so this keeps columns that are at least
        # `min_non_null_ratio` populated.
        df = df.dropna(axis=1, thresh=min_non_null_ratio * len(df))

        df['Year'] = year

        frames_by_year.setdefault(year, []).append(df)

    os.makedirs(output_folder, exist_ok=True)

    for year, frames in frames_by_year.items():
        if len(frames) == 1:
            data = frames[0]
        else:
            data = pd.concat(frames, ignore_index=True)

        output_file = os.path.join(output_folder, f"{year}{file_suffix}.csv")
        try:
            data.to_csv(output_file, index=False)
            print(f"File saved: {output_file}")
        except Exception as e:
            print(f"Failed to save file: {output_file}. Error: {e}")

# Example usage: merge the raw contribution workbooks into per-year CSVs.
input_folder, output_folder, file_suffix = (
    "./Datasets/Contributions/",
    "./Datasets_Cleaned/Contributions/",
    "_mi_cfr_contributions",
)
merge_excel_by_year(input_folder, output_folder, file_suffix)


File saved: ./Datasets_Cleaned/Contributions/2023_mi_cfr_contributions.csv
File saved: ./Datasets_Cleaned/Contributions/2020_mi_cfr_contributions.csv
File saved: ./Datasets_Cleaned/Contributions/2021_mi_cfr_contributions.csv
File saved: ./Datasets_Cleaned/Contributions/2024_mi_cfr_contributions.csv
File saved: ./Datasets_Cleaned/Contributions/2025_mi_cfr_contributions.csv
File saved: ./Datasets_Cleaned/Contributions/2022_mi_cfr_contributions.csv


In [3]:
# Run the same per-year merge for the expenditure and receipt workbooks.
# The loop variables are the same module-level names the cell set before,
# so after the loop they hold the Receipts values.
for input_folder, output_folder, file_suffix in (
    ("./Datasets/Expenditures/", "./Datasets_Cleaned/Expenditures/", "_mi_cfr_expenditures"),
    ("./Datasets/Receipts/", "./Datasets_Cleaned/Receipts/", "_mi_cfr_receipts"),
):
    merge_excel_by_year(input_folder, output_folder, file_suffix)


File saved: ./Datasets_Cleaned/Expenditures/2020_mi_cfr_expenditures.csv
File saved: ./Datasets_Cleaned/Expenditures/2025_mi_cfr_expenditures.csv
File saved: ./Datasets_Cleaned/Expenditures/2022_mi_cfr_expenditures.csv
File saved: ./Datasets_Cleaned/Expenditures/2023_mi_cfr_expenditures.csv
File saved: ./Datasets_Cleaned/Expenditures/2024_mi_cfr_expenditures.csv
File saved: ./Datasets_Cleaned/Expenditures/2021_mi_cfr_expenditures.csv
File saved: ./Datasets_Cleaned/Receipts/2021_mi_cfr_receipts.csv
File saved: ./Datasets_Cleaned/Receipts/2023_mi_cfr_receipts.csv
File saved: ./Datasets_Cleaned/Receipts/2025_mi_cfr_receipts.csv
File saved: ./Datasets_Cleaned/Receipts/2020_mi_cfr_receipts.csv
File saved: ./Datasets_Cleaned/Receipts/2022_mi_cfr_receipts.csv
File saved: ./Datasets_Cleaned/Receipts/2024_mi_cfr_receipts.csv
