In [1]:
import streamlit as st
import pandas as pd
import numpy as np
import geopandas as gpd  # For handling GeoJSON data
import folium
import branca.colormap as cm
from streamlit_folium import folium_static  # Import to render folium maps in Streamlit
from sklearn.metrics import accuracy_score



In [2]:
# Load the GeoJSON file
zip_geojson = gpd.read_file('demo_zips.geojson')

# Load person data, forcing ZIP to be read as strings
person_data = pd.read_csv('data.csv', dtype={'ZIP': str})

# Ensure ZIP codes have leading zeros and handle floats
person_data['ZIP'] = person_data['ZIP'].apply(lambda x: str(int(float(x))).zfill(5) if pd.notnull(x) else '')

# Ensure GeoJSON ZIP codes are formatted as strings with leading zeros
zip_geojson['ZIP'] = zip_geojson['ZCTA5CE10'].astype(str).str.zfill(5)


In [20]:
energy_score_threshold=0.5

In [34]:
person_data.shape

(196045, 9)

In [41]:
person_data.head()

Unnamed: 0,PROFILE_ID,NUMBER_OF_TRADELINES,WEIGHTED_ENERGYSCORE,WEIGHTED_ACTUAL_OUTPUT,FICO_V9_SCORE,ZIP,EQUIFAX_TYPE_CODE,FICO_PASS,ENERGYSCORE_PASS
0,1,15,0.061694,0,696,4265,FF,False,False
1,2,13,0.055457,0,557,33467,FP,False,False
2,3,11,0.08206,0,621,2131,DC,False,False
3,4,4,7.8e-05,0,638,17968,FA,False,False
4,5,6,0.000887,0,563,94118,DM,False,False


In [51]:
person_data[person_data['ZIP']=='01001'].head(15)

Unnamed: 0,PROFILE_ID,NUMBER_OF_TRADELINES,WEIGHTED_ENERGYSCORE,WEIGHTED_ACTUAL_OUTPUT,FICO_V9_SCORE,ZIP,EQUIFAX_TYPE_CODE,FICO_PASS,ENERGYSCORE_PASS
19926,20140,7,0.000196,0,781,1001,FZ,True,True
21069,21296,4,0.000472,0,699,1001,FA,False,True
53204,53827,6,0.081874,0,758,1001,FZ,True,True
58658,59376,11,0.000896,0,843,1001,BB,True,True
79839,80901,4,0.168476,0,501,1001,FY,False,True
91399,92628,5,0.00036,0,549,1001,VF,False,True
104589,106019,3,0.000132,0,700,1001,BB,False,True
108599,110086,1,3.8e-05,0,721,1001,BB,True,True
161372,164003,8,0.001151,0,706,1001,FZ,True,True
161413,164044,4,0.000329,0,723,1001,FZ,True,True


In [52]:
7/11

0.6363636363636364

In [35]:
zip_metrics.head()

Unnamed: 0,ZIP,Total Population,Percent Below FICO,Percent Above FICO,FICO Accuracy,EnergyScore Accuracy,Qualification Increase
0,1001,11.0,0.363636,0.636364,1.0,1.0,36.363636
1,1002,12.0,0.166667,0.833333,1.0,1.0,16.666667
2,1005,3.0,0.0,1.0,,,0.0
3,1007,18.0,0.222222,0.777778,1.0,1.0,22.222222
4,1010,3.0,0.333333,0.666667,1.0,1.0,33.333333


In [53]:
# Function to calculate metrics for each ZIP
def calculate_zip_metrics(stats_data_person, fico_threshold, energy_score_threshold):
    stats_data_person['FICO_PASS'] = stats_data_person['FICO_V9_SCORE'] > fico_threshold
    stats_data_person['ENERGYSCORE_PASS'] = stats_data_person['WEIGHTED_ENERGYSCORE'] > energy_score_threshold

    def calc_metrics(group):
        total_population = len(group)

        if total_population == 0:
            return pd.Series({
                'Total Population': 0,
                'Percent Below FICO': 0,
                'Percent Above FICO': 0,
                'FICO Accuracy': np.nan,
                'EnergyScore Accuracy': np.nan,
                'Qualification Increase': 0,
            })
        
        above_fico = group[group['FICO_V9_SCORE'] > fico_threshold]
        below_fico = group[group['FICO_V9_SCORE'] < fico_threshold]

        below_fico_pass = below_fico[below_fico['WEIGHTED_ENERGYSCORE'] <= energy_score_threshold]
        
        pct_below_fico = len(below_fico) / total_population
        pct_above_fico = len(above_fico) / total_population

        percent_increase_in_qualifications = (len(below_fico_pass) / total_population) * 100 if len(below_fico_pass) > 0 else 0
        numeric_increase_in_qualifications = len(below_fico_pass) if len(below_fico_pass) > 0 else 0

        energy_accuracy = accuracy_score(group['WEIGHTED_ACTUAL_OUTPUT'], group['ENERGYSCORE_PASS']) #if len(below_fico) > 0 else np.nan

        fico_accuracy = accuracy_score(group['WEIGHTED_ACTUAL_OUTPUT'], group['FICO_PASS']) #if len(below_fico) > 0 else np.nan

        return pd.Series({
            'Total Population': total_population,
            'Percent Below FICO': pct_below_fico,
            'Percent Above FICO': pct_above_fico,
            'FICO Accuracy': fico_accuracy,
            'EnergyScore Accuracy': energy_accuracy,
            'Qualification Increase': percent_increase_in_qualifications,
            'Numeric Increase': numeric_increase_in_qualifications,
        })

    # Group by ZIP and apply metrics calculation
    zip_metrics = stats_data_person.groupby('ZIP').apply(calc_metrics)
    zip_metrics = zip_metrics.reset_index()
    return zip_metrics


# Function to calculate ZIP to utility mapping and display on the map
def calculate_zip_to_util(zip_level_geo, state_name):
    # Load the utility data for the state
    state_util = load_state_util(state_name)
    state_util.rename(columns={'new_name': 'Utility'}, inplace=True)

    # Ensure ZIP code geometries have the same projection as the utility data
    zip_level_geo = zip_level_geo.to_crs(state_util.crs)

    # Convert the ZIP geometries to representative points
    zip_level_geo['geometry'] = zip_level_geo.representative_point()

    # Perform spatial join with utility data based on point locations
    zip_level_geo = gpd.sjoin(zip_level_geo, state_util, how='left', predicate='within')

    # Group by utility name ('new_name') and calculate the mean of 'Qualification Increase'
    zip_to_util = zip_level_geo.groupby('Utility')['Qualification Increase'].mean().reset_index()

    # Merge utility data with the calculated qualification increase
    state_util = state_util.merge(zip_to_util, on='Utility', how='left')

    return state_util


In [47]:
state_name = "Massachusetts"
fico_threshold = 700
energy_score_threshold = .5


In [None]:
zip_metrics = calculate_zip_metrics(person_data, fico_threshold, energy_score_threshold)
zip_level_geo = pd.merge(zip_metrics, zip_geojson, on='ZIP', how='left')
zip_level_geo = zip_level_geo.dropna(subset=['geometry'])
zip_level_geo = gpd.GeoDataFrame(zip_level_geo, geometry='geometry')


In [49]:
zip_metrics.head()

Unnamed: 0,ZIP,Total Population,Percent Below FICO,Percent Above FICO,FICO Accuracy,EnergyScore Accuracy,Qualification Increase,Numeric Increase
0,1001,11.0,0.272727,0.636364,1.0,0.0,27.272727,3.0
1,1002,12.0,0.166667,0.833333,1.0,0.0,16.666667,2.0
2,1005,3.0,0.0,1.0,,,0.0,0.0
3,1007,18.0,0.222222,0.777778,1.0,0.0,22.222222,4.0
4,1010,3.0,0.333333,0.666667,1.0,0.0,33.333333,1.0


In [50]:
zip_metrics['Numeric Increase'].describe()

count    23050.000000
mean         3.378655
std          4.289429
min          0.000000
25%          1.000000
50%          2.000000
75%          5.000000
max         43.000000
Name: Numeric Increase, dtype: float64

In [7]:
zip_level_geo.head()

Unnamed: 0,ZIP,Total Population,Percent Below FICO,Percent Above FICO,FICO Accuracy,EnergyScore Accuracy,Qualification Increase,ZCTA5CE10,GEOID10,CLASSFP10,MTFCC10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,state,geometry
0,1001,11.0,0.181818,0.818182,1.0,1.0,18.181818,1001,1001,B5,G6350,S,29797658.0,2121390.0,42.0623678,-72.6257536,Massachusetts,"MULTIPOLYGON (((-72.66788 42.04416, -72.66788 ..."
1,1002,12.0,0.166667,0.833333,1.0,1.0,16.666667,1002,1002,B5,G6350,S,142615075.0,4281257.0,42.3640311,-72.4587594,Massachusetts,"MULTIPOLYGON (((-72.54666 42.40131, -72.54656 ..."
2,1005,3.0,0.0,1.0,,,0.0,1005,1005,B5,G6350,S,114638390.0,666424.0,42.4188835,-72.1120769,Massachusetts,"MULTIPOLYGON (((-72.19734 42.42264, -72.19661 ..."
3,1007,18.0,0.055556,0.944444,1.0,1.0,5.555556,1007,1007,B5,G6350,S,136237871.0,6941673.0,42.2790098,-72.4004682,Massachusetts,"MULTIPOLYGON (((-72.47202 42.35151, -72.47138 ..."
4,1010,3.0,0.333333,0.666667,1.0,1.0,33.333333,1010,1010,B5,G6350,S,90055966.0,1421379.0,42.1281757,-72.2053516,Massachusetts,"MULTIPOLYGON (((-72.27435 42.14364, -72.27411 ..."


In [8]:
zip_level_geo.shape

(644, 18)

In [10]:
zip_level_geo['Total Population'].describe()

count    644.000000
mean       8.427019
std        8.490922
min        1.000000
25%        2.000000
50%        6.000000
75%       12.000000
max       48.000000
Name: Total Population, dtype: float64

In [7]:
import fiona

In [48]:
# Load the GeoJSON file
#zip_geojson = gpd.read_file('select_zips.geojson')

zip_geojson = gpd.read_file('demo_zips.geojson')

# Load person data, forcing ZIP to be read as strings
person_data = pd.read_csv('data.csv', dtype={'ZIP': str})

# Ensure ZIP codes have leading zeros and handle floats
person_data['ZIP'] = person_data['ZIP'].apply(lambda x: str(int(float(x))).zfill(5) if pd.notnull(x) else '')

# Ensure GeoJSON ZIP codes are formatted as strings with leading zeros
zip_geojson['ZIP'] = zip_geojson['ZCTA5CE10'].astype(str).str.zfill(5)

# Check if ZIPs were properly converted
print(person_data['ZIP'].head())
print(zip_geojson['ZIP'].head())


0    04265
1    33467
2    02131
3    17968
4    94118
Name: ZIP, dtype: object
0    02457
1    02458
2    02459
3    02460
4    02461
Name: ZIP, dtype: object


In [49]:
# Function to calculate metrics for each ZIP
def calculate_zip_metrics(stats_data_person, fico_cutoff, energyscore_cutoff):
    # Create masks for conditions
    stats_data_person['FICO_PASS'] = stats_data_person['FICO_V9_SCORE'] > fico_cutoff
    stats_data_person['ENERGYSCORE_PASS'] = stats_data_person['WEIGHTED_ENERGYSCORE'] > energyscore_cutoff

    # Function to calculate various metrics
    def calc_metrics(group):
        total_population = len(group)
        if total_population == 0:
            return pd.Series({
                'Total Population': 0,
                'Percent Below FICO': 0,
                'Percent Above FICO': 0,
                'FICO Accuracy': np.nan,
                'EnergyScore Accuracy': np.nan,
                'Qualification Increase': 0,
            })
        
        
        below_fico = group[group['FICO_PASS'] == False]
        above_fico = group[group['FICO_PASS'] == True]

        if len(below_fico) == 0:
            return pd.Series({
                'Total Population': total_population,
                'Percent Below FICO': 0,
                'Percent Above FICO': len(above_fico) / total_population,
                'FICO Accuracy': np.nan,
                'EnergyScore Accuracy': np.nan,
                'Qualification Increase': 0,
            })

        below_fico_pass = below_fico[below_fico['WEIGHTED_ENERGYSCORE'] <= energyscore_cutoff]
        below_fico_fail = below_fico[below_fico['WEIGHTED_ENERGYSCORE'] > energyscore_cutoff]

        pct_below_fico = len(below_fico) / total_population
        pct_above_fico = len(above_fico) / total_population

        if len(below_fico_pass) == 0:
            percent_increase_in_qualifications = 0
        else:
            percent_increase_in_qualifications = (len(below_fico_pass) / total_population) * 100

        # get FICO accuracy, precision, recall, f1 and roc_auc score
       # fico_accuracy = accuracy_score(below_fico['WEIGHTED_ACTUAL_OUTPUT'], below_fico['WEIGHTED_ENERGYSCORE'] > energyscore_cutoff) if len(below_fico) > 0 else np.nan
        fico_accuracy = accuracy_score(below_fico['WEIGHTED_ACTUAL_OUTPUT'], below_fico['FICO_PASS']) if len(below_fico) > 0 else np.nan

        energy_accuracy = accuracy_score(below_fico['WEIGHTED_ACTUAL_OUTPUT'], below_fico['ENERGYSCORE_PASS']) if len(below_fico) > 0 else np.nan

      #  accuracy_increase = energy_accuracy - fico_accuracy

        return pd.Series({
            'Total Population': total_population,
            'Percent Below FICO': pct_below_fico,
            'Percent Above FICO': pct_above_fico,
            'FICO Accuracy': fico_accuracy,
            'EnergyScore Accuracy': energy_accuracy,
            'Qualification Increase': percent_increase_in_qualifications,
          #  'Accuracy Percentage Increase': accuracy_increase,

           # 'Accuracy Percentage Increase': (energy_accuracy - fico_accuracy) / fico_accuracy * 100 if fico_accuracy * energy_accuracy> 0 else 0,
           

        })

    # Group by ZIP and apply metrics calculation
    zip_metrics = stats_data_person.groupby('ZIP').apply(calc_metrics)

    zip_metrics = zip_metrics.reset_index()
    return zip_metrics

fico_threshold = 700
energy_score_threshold = 0.5
# Calculate metrics and merge with geo data
zip_metrics = calculate_zip_metrics(person_data, fico_threshold, energy_score_threshold)
zip_level_geo = pd.merge(zip_metrics, zip_geojson, on='ZIP', how='left')
zip_level_geo = zip_level_geo.dropna(subset=['geometry'])
zip_level_geo = gpd.GeoDataFrame(zip_level_geo, geometry='geometry')


  zip_metrics = stats_data_person.groupby('ZIP').apply(calc_metrics)


In [None]:
zip_level_geo

In [None]:


def load_state_util(state_name):
    if state_name == 'New Mexico':
        temp = gpd.read_file('nm_utils.geojson')
        temp = temp[['new_name', 'geometry']]
        return temp
    elif state_name == 'Massachusetts':
        return gpd.read_file('ma_utils.geojson')


def calculate_zip_to_util(zip_level_geo, util_name, state_name):
    # Load the utility data for the state
    state_util = load_state_util(state_name)

    # Ensure ZIP code geometries have the same projection as the utility data
    zip_level_geo = zip_level_geo.to_crs(state_util.crs)

    # Convert the ZIP geometries to representative points (instead of centroids)
    zip_level_geo['geometry'] = zip_level_geo.representative_point()

    # Perform spatial join with utility data based on point locations using predicate
    zip_level_geo = gpd.sjoin(zip_level_geo, state_util, how='left', predicate='within')

    # Group by utility name ('new_name') and calculate the mean of 'Qualification Increase'
    zip_to_util = zip_level_geo.groupby('new_name')['Qualification Increase'].mean().reset_index()

    return zip_to_util

# Example usage
state_name = 'New Mexico'
util_name = 'Xcel'
zip_to_util = calculate_zip_to_util(zip_level_geo, util_name, state_name)
zip_to_util.head()


In [None]:
zip_metrics

In [None]:
zip_level_geo.shape

In [None]:
zip_metrics['FICO Accuracy'].mean(),zip_metrics['EnergyScore Accuracy'].mean()

In [None]:
zip_geojson.head()