In [162]:
import os
import pandas as pd
import numpy as np
import openpyxl
import re
import warnings
from collections import Counter

# Specify the path to your Excel file
file_path = os.getcwd()+'/data/FY25 Development Report Workbook.xlsx'

# Read the Excel file
df = pd.read_excel(file_path)

# Load the Excel file to inspect its sheet names
xls = pd.ExcelFile(file_path)

# Print the list of sheet names
print(xls.sheet_names)

['July Report', '07 Data Dump', '07 Five Reports', 'Aug Report', '08 Data Dump', '08 Five Reports', 'Sept Report', '09 Data Dump', '09 Five Reports', 'Oct Report', '10 Data Dump', '10 Five Reports', 'Nov Report', '11 Data Dump', '11 Five Reports', 'Dec Report', '12 Data Dump', '12 Five Reports', 'Jan Report ', '01 Data Dump', '01 Five Reports', 'Feb Report', 'Mar Report', 'Apr Report', 'May Report', 'June Report', 'QTR Stats']


In [21]:
sorted(xls.sheet_names)

['01 Data Dump',
 '01 Five Reports',
 '07 Data Dump',
 '07 Five Reports',
 '08 Data Dump',
 '08 Five Reports',
 '09 Data Dump',
 '09 Five Reports',
 '10 Data Dump',
 '10 Five Reports',
 '11 Data Dump',
 '11 Five Reports',
 '12 Data Dump',
 '12 Five Reports',
 'Apr Report',
 'Aug Report',
 'Dec Report',
 'Feb Report',
 'Jan Report ',
 'July Report',
 'June Report',
 'Mar Report',
 'May Report',
 'Nov Report',
 'Oct Report',
 'QTR Stats',
 'Sept Report']

In [None]:
# This function serves to get all the files out of our data folder so we can loop through them. 
folder = os.getcwd()+'/data'

def get_files_in_folder(folder):
    file_names = []
    for file_name in os.listdir(folder):
        # Skip .DS_Store files
        if file_name == '.DS_Store':
            continue
        file_path = folder+'/'+file_name
        if os.path.isfile(file_path):
            file_names.append(file_name)
    return file_names

get_files_in_folder(folder)

['FY25 Development Report Workbook.xlsx']

In [116]:
# This function takes the 

def get_file_fiscal_year(file):
    # Get the fiscal year
    match = re.search(r'FY(\d{2})', file)
    if match:
        fiscal_year = '20' + match.group(1)
    else:
        # Issue a warning if the fiscal year is not found
        warnings.warn(f"The file name '{file}' does not specify a fiscal year. Please specify 'FY' in the file name.", UserWarning)
        fiscal_year = None  # Set to None to indicate missing fiscal year
    return fiscal_year
    
get_file_fiscal_year(get_files_in_folder(folder)[0])

'2025'

In [118]:
file = os.getcwd()+'/data'+'/FY25 Development Report Workbook.xlsx'
def build_sheet_details_df(file):
    # Load the sheet names from the Excel file
    sheets = pd.ExcelFile(file).sheet_names
    
    # Filter the sheet names into two lists and sort them
    data_dump_sheets = sorted([sheet for sheet in sheets if "Data Dump" in sheet])
    five_reports_sheets = sorted([sheet for sheet in sheets if "Five Reports" in sheet])
    
    # Check if the number of Data Dump sheets matches Five Reports sheets
    if len(data_dump_sheets) != len(five_reports_sheets):
        raise ValueError("The number of 'Data Dump' sheets does not match the number of 'Five Reports' sheets. Please check the sheet names inside the file.")
    
    fiscal_year = get_file_fiscal_year(file)

    months = len(data_dump_sheets)

    fiscal_years = [fiscal_year]*months
    
    # Extract the month from the sheet names
    months = [re.match(r'(\d{2})', sheet).group(1) for sheet in data_dump_sheets if re.match(r'^\d{2} ', sheet)]
    
    # Create the DataFrame with the initial columns
    month_df_1 = pd.DataFrame({
        'Data Dump' : data_dump_sheets,
        'Five Reports' : five_reports_sheets,
        'Month Number' : months,  
        'Fiscal Year' : fiscal_years
    })
    
    # Add the Calendar Year column based on the month
    month_df_1['Calendar Year'] = month_df_1['Month Number'].apply(lambda x: str(int(fiscal_year)-1) if x not in ['01', '02', '03', '04', '05', '06'] else fiscal_year)

    # DataFrame with month mappings
    month_df_2 = pd.DataFrame({
        'Month Number' : ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'],
        'Month Name' : ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
    })

    # Left join month_df_1 with month_df_2 on the 'Month' column using merge
    month_df_merged = pd.merge(month_df_1, month_df_2, how='left', left_on='Month Number', right_on='Month Number')
    # Rearrange the columns in the desired order
    month_df_merged = month_df_merged[['Data Dump', 'Five Reports', 'Month Number', 'Month Name', 'Calendar Year', 'Fiscal Year']]

    return month_df_merged

build_sheet_details_df(file)


Unnamed: 0,Data Dump,Five Reports,Month Number,Month Name,Calendar Year,Fiscal Year
0,01 Data Dump,01 Five Reports,1,January,2025,2025
1,07 Data Dump,07 Five Reports,7,July,2024,2025
2,08 Data Dump,08 Five Reports,8,August,2024,2025
3,09 Data Dump,09 Five Reports,9,September,2024,2025
4,10 Data Dump,10 Five Reports,10,October,2024,2025
5,11 Data Dump,11 Five Reports,11,November,2024,2025
6,12 Data Dump,12 Five Reports,12,December,2024,2025


# Extract a Data Dump from a Sheet

In [None]:
file = '/Users/lukewyman/github-classroom/bmkt699-capstone-2025spring/capstone_missoula_building_permits/Phase 3 Community Development Snapshots/data/FY25 Development Report Workbook.xlsx'
sheet = '09 Data Dump'

def extract_data_dump_main(file, sheet):
    data = pd.read_excel(file, sheet).iloc[:,0:15]

    # Filter out rows where 'Record Number' contains numeric values
    data = data[~data['Record Number'].apply(lambda x: isinstance(x, (int, float)))]
    
    # Filter out NaN values in 'Record Number'
    data = data[~data['Record Number'].isna()]

    # Remove only the trailing newline character
    data['ADU or TED on Address?'] = data['ADU or TED on Address?'].str.rstrip('\n').replace('', np.nan)

    return data

sheet_data_main = extract_data_dump_main(file, sheet)

def find_header_row_in_revisions(file, sheet):
    """Finds the row number that likely contains the header based on >50% non-null values."""
    preview = pd.read_excel(file, sheet, header=None, nrows=5).iloc[:,16:33]
    total_cols = preview.shape[1]

    for i, row in preview.iterrows():
        non_null_count = row.notna().sum()
        unnamed_count = sum(str(col).startswith("Unnamed") for col in row.astype(str))

        # Ensure more than 50% of the row contains valid data AND is not mostly 'Unnamed'
        if non_null_count > (0.5 * total_cols) and unnamed_count < (0.5 * total_cols):
            return i  # Return row index as header


    return None  # Fallback if no valid header is found

def extract_data_dump_revisions(file, sheet):

    header = find_header_row_in_revisions(file, sheet)

    data = pd.read_excel(file, sheet, header=header).iloc[:,16:33]
    
    # Filter out rows where 'Record Number' contains numeric values and where 'Address' contains numeric values
    data = data[~data['Record Number'].apply(lambda x: isinstance(x, (int, float)))]
    data = data[(data['Project Cost'] > 0) | (data['Job Cost Valuation'] > 0)]

    # Drop columns with "Unnamed" in their name
    data = data.loc[:, ~data.columns.str.contains("Unnamed", na=False)]

    # Add the Category column
    data['Property Type'] = ['Modification to Work in Progress']*len(data)

    # Total Construction Valuation
    data['TOTAL CONSTRUCTION VALUATION'] = data['Project Cost'].fillna(0) + data['Job Cost Valuation'].fillna(0)

    # Current Market Evaluation
    data['CURRENT MARKET VALUATION'] = 

    return data

sheet_data_revisions = extract_data_dump_revisions(file, sheet)

sheet_data_revisions


Unnamed: 0,Record Number,Work Description,Business Name,Project Cost,Job Cost Valuation,Current Valuation - Comm,Current Valuation - Res,Address,Property Type,TOTAL CONSTRUCTION VALUATION
0,2024-MSS-RES-00546.01,9/12/24 REVISION TO ADD ADU,EDGELL BUILDING AND DEVELOPMENT INC,1000.0,,,,401 WOODWORTH AVE,Modification to Work in Progress,1000.0
1,2024-MSS-RES-00546.01,9/12/24 REVISION TO ADD ADU,EDGELL BUILDING AND DEVELOPMENT INC,1000.0,,,,401-A WOODWORTH AVE,Modification to Work in Progress,1000.0
2,2023-MSS-RES-00803.02,8/14/24 REVISION/ADD INTERIOR WALL/ADD WINDOW/...,ROSS RENOVATION AND CONSTRUCTION,5000.0,,,,127 W HILL CREST DR,Modification to Work in Progress,5000.0


## Data Definitions for Cleaning

* Single Dwelling Attached: 'ADU or TED on Address?' == 'SFR-ATT'

In [319]:
def assign_permit_type(row):
    
    # New Construction
    
    # Single Dwelling Attached
    if (row['Residential Subtype'] == 'BNSFT - New Single Family Townhouse' or row['Residential Subtype'] == 'BNSFR - New Single Family Residence') and row['ADU or TED on Address?'] == 'SFR-ATT':
        return 'Single Dwelling Attached'
    # Single Dwelling Detached
    elif (row['Residential Subtype'] == 'BNSFT - New Single Family Townhouse' or row['Residential Subtype'] == 'BNSFR - New Single Family Residence') and row['ADU or TED on Address?'] == 'SFR-DET':
        return 'Single Dwelling Detached'
    # Duplex
    elif row['Residential Subtype'] == 'BNRDX - New Duplex' and row['ADU or TED on Address?'] == 'Duplex':
        return 'Duplex'
    # Multi-Dwelling Apartment
    elif (row['Commercial Subtype'] == 'BNMRA - New Multifamily 3-4 Units' or row['Commercial Subtype'] == 'BNMRB - New Multifamily 5+ Units') and row['ADU or TED on Address?'] == 'MFR-APT':
        return 'Multi-Dwelling Apartment'
    # Multi-Dwelling Condo
    elif (row['Commercial Subtype'] == 'BNMRA - New Multifamily 3-4 Units' or row['Commercial Subtype'] == 'BNMRB - New Multifamily 5+ Units') and row['ADU or TED on Address?'] == 'MFR-CONDO':
        return 'Multi-Dwelling Condo'
    # TED Single Dwelling
    elif row['Residential Subtype'] == 'BNSFR - New Single Family Residence' and row['ADU or TED on Address?'] == 'TED SF':
        return 'TED Single Dwelling'
    # TED Two Unit
    elif (row['Residential Subtype'] == 'BNSFT - New Single Family Townhouse' or row['Residential Subtype'] == 'BNSFR - New Single Family Residence') and row['ADU or TED on Address?'] == 'TED 2U':
        return 'TED Two Unit'    
    # TED 3+ Unit
    elif (row['Residential Subtype'] == 'BNSFT - New Single Family Townhouse' or row['Residential Subtype'] == 'BNSFR - New Single Family Residence') and row['ADU or TED on Address?'] == 'TED 3+':
        return 'TED 3+'
    # Misc. (Garage, Shed, etc.)
    elif (row['Residential Subtype'] == 'BNRDA - New Detached Accessory Building' or row['Residential Subtype'] == 'BNRDG - New Detached Garage/Carport' or row['Residential Subtype'] == 'BRFND - New Residential Foundation'):
        return 'Misc. (Garage, Shed, etc.)'
    # Assembly
    elif row['Residential Subtype'] == 'BNCCR -  New Church/Religious Building':
        return 'Assembly'
    # Business
    elif row['Category'] == '08 - New Business':
        return 'Business'
    # Education (Undefined)
    elif 1 == 2:
        return 'Education'
    # Hazardous (Undefined)
    elif 1 == 2:
        return 'Hazardous'
    # Institutional (Undefined)
    elif 1 == 2:
        return 'Institutional'
    
    # Addition/Remodel

    if row['Category'] == '01 - Remodel Commercial':
        return 'Commercial'
    if row['Category'] == '02 - Remodel Residential':
        return 'Residential'

    # Flag Unspecified
    else:
        return 'Unspecified'

In [320]:
sheet_data['Property Type'] = sheet_data.apply(assign_permit_type, axis=1)
print(Counter(sheet_data['Property Type']))

Counter({'Residential': 63, 'Commercial': 21, 'Single Dwelling Detached': 12, 'Multi-Dwelling Apartment': 11, 'Misc. (Garage, Shed, etc.)': 5, 'TED Single Dwelling': 4, 'Single Dwelling Attached': 3, 'Business': 3, 'Unspecified': 2, 'Duplex': 2})


In [306]:
sheet_data.loc[sheet_data['Property Type'] == 'TED Two Unit']

Unnamed: 0,Category,Record Number,Commercial Subtype,Residential Subtype,ADU/TED on Permit?,Work Description,Business Name,Project Cost,Job Cost Valuation,Current Valuation - Comm,Current Valuation - Res,Address,ADU or TED on Address?,TOTAL CONSTRUCTION VALUATION,CURRENT MARKET VALUATION,Property Type
