In [106]:
import numpy as np
import pandas as pd
import geopandas as gp
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [107]:
# df_latest has sdwis data for all 4 quarters and drops duplicates
sdwis = pd.read_csv(r'C:\Users\mhardika\Documents\AMO\2050\analysis_files_2024\df_latest.csv',low_memory=False)
usgs_gu_wwsa = gp.read_file(r'C:\Users\mhardika\Documents\AMO\2050\analysis_files_2024\V1_GU_wWS\v1_GU_wWS.shp')
usgs_gu_wwsa_df = pd.DataFrame(usgs_gu_wwsa.drop(columns='geometry'))
# usgs_gu_wwsa_df.to_csv(r'C:\Users\mhardika\Documents\AMO\2050\analysis_files_2024\usgs_gu_wwsa_df.csv')
# Machine learning model output for domestic demand per capita
dpc_2020 = pd.read_csv(r'C:\Users\mhardika\Documents\AMO\2050\analysis_files_2024\delivery_water_use_model\predictions\national_dpc_predictions.csv')
dpc_2020 = dpc_2020.loc[dpc_2020['year']==2020].copy()
# Filtered for Treatment and include Facility Activity = Activity/Inactive
sdwis_nofilter = pd.read_csv(r'C:\Users\mhardika\Documents\AMO\2050\analysis_files_2024\SDWIS.csv',low_memory=False)
sdwis_facilities_nofilter = pd.read_csv(r'C:\Users\mhardika\Documents\AMO\2050\analysis_files_2024\2020_WT_Facilities_Details.csv',low_memory=False)

In [111]:
# Drop rows with duplicate PWS ID, Facility ID and treatment unit
sdwis_filtered = sdwis[['PWS ID','Facility Id','PWS Name', 'Population Served Count', 'Activity Status', 'Facility Activity', 'Primary Source', 'PWS Type','Treatment Process','Treatment Objective']].drop_duplicates(subset = ['PWS ID','Facility Id','Treatment Process'])
# Filter for Facility activity = Active
# sdwis_filtered = sdwis_filtered[sdwis_filtered['Facility Activity']=='Active']
# Filter for where population served is 0
# sdwis_filtered = sdwis_filtered[sdwis_filtered['Population Served Count']>0]
# Remove rows without an treatment process listed
sdwis_filtered = sdwis_filtered.dropna(subset=['Treatment Process'])
# Remove PWS ID and Facility with only 1 treatment process
sdwis_filtered = sdwis_filtered.groupby(['PWS ID','Facility Id']).filter(lambda x: len(x['Treatment Process'])>1).reset_index()

# usgs_gu_wwsa_df  = usgs_gu_wwsa_df[usgs_gu_wwsa_df['GU_POP']>0]

# Make the wsa string with 9 characters
updated_column_list = []
avg_dpc = []

for idx,row in dpc_2020.iterrows():
    try:
        a = int(row['wsa_agidf'])
        # print(f'{a:09d}')
        updated_column_list.append(str(f'{a:09d}'))
        # updated_column_list.append(row['wsa_agidf'])
    except:
        updated_column_list.append(row['wsa_agidf'].upper())

    avg_dpc.append(row.iloc[5::].mean())

dpc_2020['wsa_agidf'] = updated_column_list
dpc_2020['avg_dpc'] = avg_dpc

SDWIS is only active treatment facilities

USGS GUs are based on places- Has assigned PWS IDs and Seller PWS IDs and WSA IDs

DPC has WSA IDs assigned

In [112]:
# Get unique PWS IDs
sdwis_pws_ids = sdwis_filtered['PWS ID'].unique()
sdwis_no_filter_pws_ids = sdwis_nofilter['PWS ID'].unique()
sdwis_facilities_nofilter_pws_ids = sdwis_facilities_nofilter['PWS ID'].unique()

usgs_pws_ids = usgs_gu_wwsa_df['PWS_ID'].unique()
usgs_seller_pws_ids = usgs_gu_wwsa_df['SELLER_PWS'].dropna().unique()
common_pws_ids = np.unique(np.concatenate([usgs_seller_pws_ids,usgs_pws_ids]))
usgs_wsa_ids = usgs_gu_wwsa_df['WSA_AGIDF'].unique()
dpc_wsa_ids = dpc_2020['wsa_agidf'].unique()

print('Unique PWS_ID in SDWIS filtered (df_latest):', len(sdwis_pws_ids))
print('Unique PWS_ID in SDWIS no filter:', len(sdwis_no_filter_pws_ids))
print('Unique PWS_ID in SDWIS facilities no filter:', len(sdwis_facilities_nofilter_pws_ids))

print('\nUSGS')
print('Total USGS GU-wWS rows:',len(usgs_gu_wwsa_df))
print('Unique PWS_ID in USGS:', len(usgs_pws_ids))
print('Unique SELLER PWS_ID in USGS:', len(usgs_seller_pws_ids))
print('Total unique PWS IDs (with/without seller):', len(common_pws_ids))
print('Unique WSA_AGIDF in USGS:', len(usgs_wsa_ids))
print('USGS DPC data:', len(dpc_wsa_ids))

Unique PWS_ID in SDWIS filtered (df_latest): 21354
Unique PWS_ID in SDWIS no filter: 39067
Unique PWS_ID in SDWIS facilities no filter: 425282

USGS
Total USGS GU-wWS rows: 27159
Unique PWS_ID in USGS: 21429
Unique SELLER PWS_ID in USGS: 1706
Total unique PWS IDs (with/without seller): 22252
Unique WSA_AGIDF in USGS: 18807
USGS DPC data: 18807


In [91]:
# Check in the 1 facility not in common between the USGS DPC dataset and the USGS WSA dataset
res_usgs_wsa_ids = [i for i in usgs_wsa_ids if i not in dpc_wsa_ids]
res_dpc = [i for i in dpc_wsa_ids if i not in usgs_wsa_ids]

print('In GU_wWs  dataset but not DPC:', res_usgs_wsa_ids)
print('In DPC dataset but not GU_wWs:', res_dpc)

In GU_wWs  dataset but not DPC: ['CO0118006', 'CA3610057', 'CO0203002']
In DPC dataset but not GU_wWs: ['CA2010009', 'CA3310018', 'CA3310025']


In [78]:
# Trying to check if USGS WSA data set PWS IDs can be mapped to SDWIS df_latest dataset

# usgs_check = usgs_gu_wwsa_df[['GU_ID','PLACE_FIPS','PLACE_NAME','CNTY_NM', 'STATE_NAME','PWS_ID','SELLER_PWS','WSA_AGIDF']]

# # Check if the PWS ID or Seller PWS ID is in the SDWIS dataset
# check_list = []
# pws_check_list = []

# for idx,row in usgs_check.iterrows():
#     if row['SELLER_PWS'] in sdwis_pws_ids:
#         check_list.append('Yes')
#         pws_check_list.append(row['SELLER_PWS'])
#     elif row['PWS_ID'] in sdwis_pws_ids:
#         check_list.append('Yes')
#         pws_check_list.append(row['PWS_ID'])
#     elif row['WSA_AGIDF'] in sdwis_pws_ids:
#         check_list.append('Yes')
#         pws_check_list.append(row['WSA_AGIDF'])
#     else:
#         check_list.append('No')
#         pws_check_list.append(row['WSA_AGIDF'])

# usgs_check['pws_check'] = pws_check_list
# usgs_check['In SDWIS'] = check_list

# print('Number of facilities mapped to SDWIS df_latest:', len(usgs_check[usgs_check['In SDWIS']=='Yes']))
# print('Number of unique PWS IDs:',len(usgs_check['pws_check'].unique()))

In [92]:
# Trying to check if USGS WSA data set PWS IDs can be mapped to SDWIS No filter dataset

usgs_check = usgs_gu_wwsa_df[['GU_ID','STATE_NAME','CNTY_NM','PLACE_FIPS','PLACE_NAME',
                              'GU_POP','WTR_TYPE_E','PWS_ID','SELLER_PWS','WSA_AGIDF']]

# Check if the PWS ID or Seller PWS ID is in the SDWIS dataset
check_list = []
pws_check_list = []

for idx,row in usgs_check.iterrows():
    if row['SELLER_PWS'] in sdwis_no_filter_pws_ids:
        check_list.append('Yes')
        pws_check_list.append(row['SELLER_PWS'])
    elif row['PWS_ID'] in sdwis_no_filter_pws_ids:
        check_list.append('Yes')
        pws_check_list.append(row['PWS_ID'])
    elif row['WSA_AGIDF'] in sdwis_no_filter_pws_ids:
        check_list.append('Yes')
        pws_check_list.append(row['WSA_AGIDF'])
    else:
        check_list.append('No')
        pws_check_list.append(row['WSA_AGIDF'])

usgs_check['pws_check'] = pws_check_list
usgs_check['In SDWIS'] = check_list

print('Number of facilities mapped to SDWIS no filter:', len(usgs_check[usgs_check['In SDWIS']=='Yes']))
print('Number of unique PWS IDs:',len(usgs_check['pws_check'].unique()))

Number of facilities mapped to SDWIS no filter: 25101
Number of unique PWS IDs: 19026


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usgs_check['pws_check'] = pws_check_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usgs_check['In SDWIS'] = check_list


In [80]:
# Trying to check if USGS WSA data set PWS IDs can be mapped to SDWIS Facilities no filter dataset

# usgs_check = usgs_gu_wwsa_df[['GU_ID','PLACE_FIPS','PLACE_NAME','CNTY_NM', 'STATE_NAME','PWS_ID','SELLER_PWS','WSA_AGIDF']]

# # Check if the PWS ID or Seller PWS ID is in the SDWIS dataset
# check_list = []
# pws_check_list = []

# for idx,row in usgs_check.iterrows():
#     if row['SELLER_PWS'] in sdwis_facilities_nofilter_pws_ids:
#         check_list.append('Yes')
#         pws_check_list.append(row['SELLER_PWS'])
#     elif row['PWS_ID'] in sdwis_facilities_nofilter_pws_ids:
#         check_list.append('Yes')
#         pws_check_list.append(row['PWS_ID'])
#     elif row['WSA_AGIDF'] in sdwis_facilities_nofilter_pws_ids:
#         check_list.append('Yes')
#         pws_check_list.append(row['WSA_AGIDF'])
#     else:
#         check_list.append('No')
#         pws_check_list.append(row['WSA_AGIDF'])

# usgs_check['pws_check'] = pws_check_list
# usgs_check['In SDWIS'] = check_list

# print('Number of facilities mapped to SDWIS no filter:', len(usgs_check[usgs_check['In SDWIS']=='Yes']))
# print('Number of unique PWS IDs:',len(usgs_check['pws_check'].unique()))

In [101]:
# Mapping usgs check pws_check to dpc 2020 population
usgs_check_dpc = usgs_check.merge(dpc_2020[['wsa_agidf','avg_dpc']],left_on='WSA_AGIDF',right_on='wsa_agidf')
# usgs_check_dpc.to_csv(r'C:\Users\mhardika\Documents\AMO\2050\analysis_files_2024\usgs_check_dpc.csv')