In [338]:
import requests
import geopandas as gpd
from urllib.parse import urlencode
import numpy as np
import pandas as pd
import time
import json

## Reading API keys for NREL API calls

In [343]:
with open('config/secrets.json', 'r') as f:
    json_txt = json.load(f)
    secrets_dict = dict(json_txt)
# NREL_BERK_API_KEY
# NREL_PERSONAL_API_KEY
print(secrets_dict.keys())

dict_keys(['NREL_BERK_API_KEY', 'NREL_PERSONAL_API_KEY'])


# NREL Solar Irradiance Data

API: https://nsrdb.nrel.gov/data-sets/api-instructions.html
Glossary:  https://www.nrel.gov/grid/solar-resource/solar-glossary.html#d

In [2]:
!ls cleaned_data/

full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.cpg
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.dbf
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.prj
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.shp
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.shx
[34mfull-pge-with-nrel[m[m
nrel_full.csv
nrel_sample.csv
pge-monthly-consumption_2013-2020.csv
pge-monthly-elec-by-zip_2013-2020.csv
pge-monthly-full-cleaned-by-zip_2013-2020.csv
pge-monthly-gas-by-zip_2013-2020.csv


In [3]:
pge_monthly_cleaned_fn = 'cleaned_data/full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.shp'
pge_monthly = gpd.read_file(pge_monthly_cleaned_fn)

In [165]:
pge_monthly.head()

Unnamed: 0,zip,city,state,latitude,longitude,timezone,dst,ZIPCODE,DATE,MONTH,...,TOTALCUSTO,TOTALKWH,AVERAGEKWH,TOTALTHERM,AVERAGETHE,Calculated,Calculat_1,AVGKWH Dif,AVGTHERMS,geometry
0,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-01-01,1,...,0,0,0,0,0,0.0,0.0,0.0,0.0,POINT (-120.84510 39.17703)
1,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-02-01,2,...,0,0,0,0,0,0.0,0.0,0.0,0.0,POINT (-120.84510 39.17703)
2,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-03-01,3,...,0,0,0,0,0,0.0,0.0,0.0,0.0,POINT (-120.84510 39.17703)
3,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-04-01,4,...,0,0,0,0,0,0.0,0.0,0.0,0.0,POINT (-120.84510 39.17703)
4,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-05-01,5,...,100,47657,477,0,0,476.57,0.0,-0.43,0.0,POINT (-120.84510 39.17703)


In [166]:
len(pge_monthly['zip'].unique())

935

## Sample Use case for nrel api


In [344]:
# Declare all variables as strings. Spaces must be replaced with '+', i.e., change 'John Smith' to 'John+Smith'.
# Define the lat, long of the location and the year
lat, lon, year = 33.2164, -97.1292, 2010
# You must request an NSRDB api key from the link above
api_key = secrets_dict['NREL_BERK_API_KEY']
# Set the attributes to extract (e.g., dhi, ghi, etc.), separated by commas.
attributes = 'ghi,dhi,dni,wind_speed,air_temperature,solar_zenith_angle'
# Choose year of data
year = '2019'
# Set leap year to true or false. True will return leap day data if present, false will not.
leap_year = 'false'
# Set time interval in minutes, i.e., '30' is half hour intervals. Valid intervals are 30 & 60.
interval = '30'
# Specify Coordinated Universal Time (UTC), 'true' will use UTC, 'false' will use the local time zone of the data.
# NOTE: In order to use the NSRDB data in SAM, you must specify UTC as 'false'. SAM requires the data to be in the
# local time zone.
utc = 'false'
# Your full name, use '+' instead of spaces.
your_name = 'Justin Wong'
# Your reason for using the NSRDB.
reason_for_use = 'Educational'
# Your affiliation
your_affiliation = 'UC Berkeley'
# Your email address
your_email = 'justinryanwong@berkeley.edu'
# Please join our mailing list so we can keep you up-to-date on new developments.
mailing_list = 'true'

# Declare url string
url = 'https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT({lon}%20{lat})&names={year}&leap_day={leap}&interval={interval}&utc={utc}&full_name={name}&email={email}&affiliation={affiliation}&mailing_list={mailing_list}&reason={reason}&api_key={api}&attributes={attr}'.format(year=year, lat=lat, lon=lon, leap=leap_year, interval=interval, utc=utc, name=your_name, email=your_email, mailing_list=mailing_list, affiliation=your_affiliation, reason=reason_for_use, api=api_key, attr=attributes)
# Return just the first 2 lines to get metadata:
info = pd.read_csv(url, nrows=1)
# See metadata for specified properties, e.g., timezone and elevation
timezone, elevation = info['Local Time Zone'], info['Elevation']

In [346]:
# Declare all variables as strings. Spaces must be replaced with '+', i.e., change 'John Smith' to 'John+Smith'.
# Define the lat, long of the location and the year
lat, lon, year = 37.516687, -122.29026, 2010
# 37.516687	-122.29026
# You must request an NSRDB api key from the link above
api_key = secrets_dict['NREL_BERK_API_KEY']
# Set the attributes to extract (e.g., dhi, ghi, etc.), separated by commas.
attributes = 'ghi,dhi,dni,wind_speed,air_temperature,solar_zenith_angle'
# Choose year of data
year = '2019'
# Set leap year to true or false. True will return leap day data if present, false will not.
leap_year = 'false'
# Set time interval in minutes, i.e., '30' is half hour intervals. Valid intervals are 30 & 60.
interval = '30'
# Specify Coordinated Universal Time (UTC), 'true' will use UTC, 'false' will use the local time zone of the data.
# NOTE: In order to use the NSRDB data in SAM, you must specify UTC as 'false'. SAM requires the data to be in the
# local time zone.
utc = 'false'
# Your full name, use '+' instead of spaces.
your_name = 'Justin Wong'
# Your reason for using the NSRDB.
reason_for_use = 'Educational'
# Your affiliation
your_affiliation = 'UC Berkeley'
# Your email address
your_email = 'justinryanwong@berkeley.edu'
# Please join our mailing list so we can keep you up-to-date on new developments.
mailing_list = 'true'

# Declare url string
url = 'https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT({lon}%20{lat})&names={year}&leap_day={leap}&interval={interval}&utc={utc}&full_name={name}&email={email}&affiliation={affiliation}&mailing_list={mailing_list}&reason={reason}&api_key={api}&attributes={attr}'.format(year=year, lat=lat, lon=lon, leap=leap_year, interval=interval, utc=utc, name=your_name, email=your_email, mailing_list=mailing_list, affiliation=your_affiliation, reason=reason_for_use, api=api_key, attr=attributes)
# Return just the first 2 lines to get metadata:
info = pd.read_csv(url, nrows=1)
# See metadata for specified properties, e.g., timezone and elevation
timezone, elevation = info['Local Time Zone'], info['Elevation']

sf_data = pd.read_csv(url, skiprows=2)

In [347]:
info

Unnamed: 0,Source,Location ID,City,State,Country,Latitude,Longitude,Time Zone,Elevation,Local Time Zone,...,Cloud Type 11,Cloud Type 12,Fill Flag 0,Fill Flag 1,Fill Flag 2,Fill Flag 3,Fill Flag 4,Fill Flag 5,Surface Albedo Units,Version
0,NSRDB,119754,-,-,-,37.53,-122.3,-8,31,-8,...,Dust,Smoke,,Missing Image,Low Irradiance,Exceeds Clearsky,Missing CLoud Properties,Rayleigh Violation,,3.1.1


In [348]:
sf_data.iloc[8612] ## checking if it's reasonable for sf to be 17.7 degrees C in mid summer at 10 am

Year                  2019.00
Month                    6.00
Day                     29.00
Hour                    10.00
Minute                   0.00
GHI                    796.00
DHI                    197.00
DNI                    705.00
Wind Speed               4.70
Temperature             17.70
Solar Zenith Angle      31.75
Name: 8612, dtype: float64

In [349]:
sf_data.groupby(by='Month').agg('mean').reset_index()

Unnamed: 0,Month,Year,Day,Hour,Minute,GHI,DHI,DNI,Wind Speed,Temperature,Solar Zenith Angle
0,1,2019.0,16.0,11.5,15.0,89.897177,35.293683,136.165995,3.639449,12.498992,105.511613
1,2,2019.0,14.5,11.5,15.0,113.46503,45.420387,140.109375,5.285565,10.586979,99.61067
2,3,2019.0,16.0,11.5,15.0,181.806452,62.893145,208.875,3.507056,12.449866,91.220208
3,4,2019.0,15.5,11.5,15.0,252.890278,74.800694,268.840278,3.434583,13.956667,82.370493
4,5,2019.0,16.0,11.5,15.0,270.545027,82.442876,266.405242,3.928293,13.713374,75.622923
5,6,2019.0,15.5,11.5,15.0,345.111111,66.438889,402.542361,3.424375,16.575972,72.61791
6,7,2019.0,16.0,11.5,15.0,337.567204,56.494624,411.922043,4.040524,16.674395,74.076116
7,8,2019.0,16.0,11.5,15.0,306.302419,51.508737,384.617608,3.299664,18.539382,79.69455
8,9,2019.0,15.5,11.5,15.0,252.547917,45.18125,351.6,3.560903,18.757778,87.925583
9,10,2019.0,16.0,11.5,15.0,197.836694,36.193548,323.405242,2.814382,16.548992,96.797103


In [350]:
len(sf_data.groupby(by='Month').agg('mean').reset_index())

12

### Setting up function to call NREL api

In [351]:
params_dict = {
#     lat, lon, year = 33.2164, -97.1292, 2010
#     year = '2010'
    "api_key": secrets_dict['NREL_BERK_API_KEY'],
    "attributes": 'ghi,dhi,dni,wind_speed,air_temperature,solar_zenith_angle',
    "leap_year": 'false',
    "interval": '60',
    "utc": 'false',
    "your_name": 'Justin Wong',
    "reason_for_use": 'Educational',
    "your_affiliation": 'UC Berkeley',
    "your_email": 'justinryanwong\@berkeley.edu',
    "mailing_list": 'false'
}

def construct_nrel_url(long, lati, year):
    params_dict = {
        "api_key": secrets_dict['NREL_PERSONAL_API_KEY'],
        "attributes": 'ghi,dhi,dni,wind_speed,air_temperature,solar_zenith_angle',
        "leap_year": 'false',
        "interval": '60',
        "utc": 'false',
        "your_name": 'Justin Wong',
        "reason_for_use": 'Educational',
        "your_affiliation": 'UC Berkeley',
        "mailing_list": 'false'
    }
    params_dict['names'] = int(year) ## api only accepts integers
    params_query_string = urlencode(params_dict)
    return 'https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?' +"wkt=POINT({lon}%20{lat})".format(lon=long, lat=lati) + "&"+ params_query_string + "&email=justin.wong26@gmail.com"#"&email=justinryanwong@berkeley.edu"

In [352]:
sf_lat = 37.7749
sf_long= -122.4194
year = 2019
construct_nrel_url(sf_long, sf_lat, year)

'https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT(-122.4194%2037.7749)&api_key=Gq5beHrpNxNpWGy0MsWUPUaEthAAcR6jHiCle9a9&attributes=ghi%2Cdhi%2Cdni%2Cwind_speed%2Cair_temperature%2Csolar_zenith_angle&leap_year=false&interval=60&utc=false&your_name=Justin+Wong&reason_for_use=Educational&your_affiliation=UC+Berkeley&mailing_list=false&names=2019&email=justin.wong26@gmail.com'

## Extracting parameters for NREL database queries from the PG&E dataset

Using the NREL dataset requires a (longitude, latitude) pair and a year. NREL gives back 30 or 60 minute intervals for the entire year(i.e. every 30 minutes for every day of the year) with attributes that you specify

In [353]:
info.columns

Index(['Source', 'Location ID', 'City', 'State', 'Country', 'Latitude',
       'Longitude', 'Time Zone', 'Elevation', 'Local Time Zone',
       'Clearsky DHI Units', 'Clearsky DNI Units', 'Clearsky GHI Units',
       'Dew Point Units', 'DHI Units', 'DNI Units', 'GHI Units',
       'Solar Zenith Angle Units', 'Temperature Units', 'Pressure Units',
       'Relative Humidity Units', 'Precipitable Water Units',
       'Wind Direction Units', 'Wind Speed', 'Cloud Type -15', 'Cloud Type 0',
       'Cloud Type 1', 'Cloud Type 2', 'Cloud Type 3', 'Cloud Type 4',
       'Cloud Type 5', 'Cloud Type 6', 'Cloud Type 7', 'Cloud Type 8',
       'Cloud Type 9', 'Cloud Type 10', 'Cloud Type 11', 'Cloud Type 12',
       'Fill Flag 0', 'Fill Flag 1', 'Fill Flag 2', 'Fill Flag 3',
       'Fill Flag 4', 'Fill Flag 5', 'Surface Albedo Units', 'Version'],
      dtype='object')

In [33]:
nrel_param_columns = ['zip', 'latitude', 'longitude', 'YEAR']

for c in nrel_param_columns:
    pge_monthly[c] = pd.to_numeric(pge_monthly[c])

unique_zip_and_year = pge_monthly[nrel_param_columns].groupby(['zip', 'YEAR']).agg('mean').reset_index()

In [101]:
zipcode_params_for_nrel = unique_zip_and_year[unique_zip_and_year['YEAR'] != 2020].reset_index(drop=True)
zipcode_params_for_nrel.head()

Unnamed: 0,zip,YEAR,latitude,longitude
0,92304,2013,34.548138,-115.65796
1,92304,2014,34.548138,-115.65796
2,92304,2015,34.548138,-115.65796
3,92304,2016,34.548138,-115.65796
4,92304,2017,34.548138,-115.65796


In [102]:
zipcode_params_for_nrel

Unnamed: 0,zip,YEAR,latitude,longitude
0,92304,2013,34.548138,-115.65796
1,92304,2014,34.548138,-115.65796
2,92304,2015,34.548138,-115.65796
3,92304,2016,34.548138,-115.65796
4,92304,2017,34.548138,-115.65796
...,...,...,...,...
6469,96161,2015,39.339574,-120.22805
6470,96161,2016,39.339574,-120.22805
6471,96161,2017,39.339574,-120.22805
6472,96161,2018,39.339574,-120.22805


## Calling the NREL API for construct a dataframe that aggregates monthly average measurements

There are 6474 unique zipcodes and year pairs in our PG&E dataset. We want to find the matching NREL solar radiation measurements for each of these 6474 zipcodes-year pairs, so that we can have monthly measurements.

There are rate limits defined based on their documentation: https://developer.nrel.gov/docs/solar/nsrdb/guide/

In [75]:
import urllib

In [355]:
prog=0
start = time.time()
## The following block should take at least 6473 seconds, or 107 minutes
while prog != 6473:
    try:
        for df_index, df_row in zipcode_params_for_nrel[prog:].iterrows():
            prog = df_index
            ## pull out contents of the row. 
            ## These columns are the same as the dataframe `zipcode_params_for_nrel` in cells above
            zipcode = df_row['zip']
            year = int(df_row['YEAR']) ## api only accepts integers
            lati = df_row['latitude']
            long = df_row['longitude']

            ## create nrel api url call
            print("{c} - Searching {z} in {y}: ({long}, {lat})".format(z=zipcode, y=year, lat=lati, long=long, c=df_index))
            nrel_url = construct_nrel_url(long, lati, year)
            print("\t> Getting: ", nrel_url[:100], "...")

            ## create a dataframe using nrel api, as shown in https://nsrdb.nrel.gov/data-sets/api-instructions.html
            ## drop the unnecessary columns. We average each month, so day, hour, and minute are unnecessary
            unnecessary_cols = ["Day","Hour","Minute"]
            nrel_data = pd.read_csv(nrel_url, skiprows=2).drop(unnecessary_cols, axis=1)

            ## group by month to aggregate
            nrel_by_month = nrel_data.groupby(by='Month').agg('mean').reset_index()

            ## Create description columns so we can map back the measurements to the pg&e data
            nrel_by_month['zipcode'] = np.repeat(zipcode, len(nrel_by_month))
            nrel_by_month['latitude'] = np.repeat(lati, len(nrel_by_month))
            nrel_by_month['longitude'] = np.repeat(long, len(nrel_by_month))
            time.sleep(1) ## sleep for 1 second for nrel rate limit
            if df_index==0:
                nrel_by_month.to_csv('data/nrel/limited3.csv', mode='w')
            else:
                nrel_by_month.to_csv('data/nrel/limited3.csv', mode='a')
            constructing_df = pd.concat([constructing_df, nrel_by_month])

    except (urllib.error.HTTPError, Exception) as exception:
        ## catch http errors in case of 404, 429(these were thrown by the NREL api due to rate limits)
        print(exception)
        prog = prog
        continue
end = time.time()

print('Completed {n} nrel api calls in {s} seconds'.format(n=len(zipcode_params_for_nrel), s=end-start))

3524 - Searching 95248.0 in 2014: (-120.485154, 38.32562600000001)
	> Getting:  https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT(-120.485154%2038.325626000000 ...
3525 - Searching 95248.0 in 2015: (-120.48515399999998, 38.325626)
	> Getting:  https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT(-120.48515399999998%2038.3256 ...
3526 - Searching 95248.0 in 2016: (-120.485154, 38.32562600000001)
	> Getting:  https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT(-120.485154%2038.325626000000 ...
3527 - Searching 95248.0 in 2017: (-120.48515399999998, 38.325626)
	> Getting:  https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT(-120.48515399999998%2038.3256 ...
3528 - Searching 95248.0 in 2018: (-120.48515399999998, 38.325626)
	> Getting:  https://developer.nrel.gov/api/solar/nsrdb_psm3_download.csv?wkt=POINT(-120.48515399999998%2038.3256 ...
3529 - Searching 95248.0 in 2019: (-120.48515399999998, 38.325626)
	> Getti

KeyboardInterrupt: 

In [57]:
## store the full nrel as a plain csv 
constructing_df.to_csv('cleaned_data/nrel_full.csv')

In [175]:
constructing_df.head()

Unnamed: 0,Month,Year,GHI,DHI,DNI,Wind Speed,Temperature,Solar Zenith Angle,zipcode,latitude,longitude
0,1,2013.0,118.551075,30.461022,220.584677,3.564247,9.044355,105.47832,94002,37.516687,-122.29026
1,2,2013.0,166.96131,39.181548,269.005952,3.429167,9.815476,99.517798,94002,37.516687,-122.29026
2,3,2013.0,195.715054,71.255376,216.329301,3.340323,11.181452,91.103266,94002,37.516687,-122.29026
3,4,2013.0,284.418056,64.431944,334.995833,3.858194,12.713889,82.265542,94002,37.516687,-122.29026
4,5,2013.0,311.571237,71.837366,346.986559,4.352151,13.991935,75.565403,94002,37.516687,-122.29026


## Merging NREL Solar Radiation with PG&E Energy Consumption

1. Read the CSVs from the NREL data and drop duplicate
2. Add zipcodes to NREL data
3. Merge 2013-2019 data. NREL does not currently have 2020 data available yet

In [90]:
one = pd.read_csv('data/nrel/limited.csv')
two = pd.read_csv('data/nrel/limited2.csv')
three = pd.read_csv('data/nrel/limited3.csv')

In [92]:
combined = pd.concat([one, two, three])

In [126]:
all_nrel_monthly_in_all_zips = combined.drop(['Unnamed: 0'], axis=1).reset_index(drop=True).drop_duplicates().reset_index(drop=True)
all_nrel_monthly_in_all_zips

Unnamed: 0,Month,Year,GHI,DHI,DNI,Wind Speed,Temperature,Solar Zenith Angle,zipcode,latitude,longitude
0,1,2013.0,144.2258064516129,33.39650537634409,267.46639784946234,2.4864247311827956,8.838709677419354,104.79604838709689,92304.0,34.548138,-115.65796
1,2,2013.0,194.01636904761904,38.56845238095238,311.51934523809524,2.3967261904761905,11.654761904761905,99.1219494047619,92304.0,34.548138,-115.65796
2,3,2013.0,249.04435483870967,54.84543010752688,327.06317204301075,2.4489247311827977,18.990591397849464,91.0582526881719,92304.0,34.548138,-115.65796
3,4,2013.0,311.96805555555557,62.230555555555554,379.10833333333335,2.8834722222222267,21.9625,82.58176388888893,92304.0,34.548138,-115.65796
4,5,2013.0,341.9166666666667,64.85215053763442,399.4287634408602,2.8856182795698895,25.81451612903226,76.18926075268821,92304.0,34.548138,-115.65796
...,...,...,...,...,...,...,...,...,...,...,...
77684,8,2019.0,315.7258064516129,48.71370967741935,424.63978494623655,0.5072580645161294,21.40295698924733,79.46888440860221,96162.0,39.26599,-120.64145
77685,9,2019.0,231.10972222222222,48.11805555555556,325.1138888888889,0.40486111111111167,15.121388888888884,87.89591666666671,96162.0,39.26599,-120.64145
77686,10,2019.0,196.40725806451613,33.19086021505376,338.1720430107527,0.44784946236559137,9.530376344086022,96.95063172043011,96162.0,39.26599,-120.64145
77687,11,2019.0,125.52222222222223,31.509722222222223,239.2277777777778,0.41291666666666615,7.215972222222224,104.28987500000002,96162.0,39.26599,-120.64145


In [131]:
all_nrel_monthly_in_all_zips['Month'].value_counts()


1        6474
9        6474
4        6474
10       6474
8        6474
5        6474
12       6474
6        6474
2        6474
7        6474
3        6474
11       6474
Month       1
Name: Month, dtype: int64

In [130]:
all_nrel_monthly_in_all_zips[all_nrel_monthly_in_all_zips['Month']=='Month']

Unnamed: 0,Month,Year,GHI,DHI,DNI,Wind Speed,Temperature,Solar Zenith Angle,zipcode,latitude,longitude
12,Month,Year,GHI,DHI,DNI,Wind Speed,Temperature,Solar Zenith Angle,zipcode,latitude,longitude


In [143]:
mod = all_nrel_monthly_in_all_zips.drop(all_nrel_monthly_in_all_zips.loc[all_nrel_monthly_in_all_zips['Month']=='Month'].index).reset_index(drop=True)
mod['Month'].value_counts()

1     6474
9     6474
4     6474
10    6474
8     6474
5     6474
12    6474
6     6474
2     6474
7     6474
3     6474
11    6474
Name: Month, dtype: int64

In [206]:
# zipcode_params_for_nrel
mod['Year'] = mod['Year'].astype(float)
mod['Year'] = mod['Year'].astype(int)
mod['Month'] = mod['Month'].astype(int)

In [146]:
repeatted_zips = pd.DataFrame(data=np.repeat(zipcode_params_for_nrel.values,12,axis=0),
                             columns=[s for s in zipcode_params_for_nrel.columns])
repeatted_zips

Unnamed: 0,zip,YEAR,latitude,longitude
0,92304.0,2013.0,34.548138,-115.65796
1,92304.0,2013.0,34.548138,-115.65796
2,92304.0,2013.0,34.548138,-115.65796
3,92304.0,2013.0,34.548138,-115.65796
4,92304.0,2013.0,34.548138,-115.65796
...,...,...,...,...
77683,96162.0,2019.0,39.265990,-120.64145
77684,96162.0,2019.0,39.265990,-120.64145
77685,96162.0,2019.0,39.265990,-120.64145
77686,96162.0,2019.0,39.265990,-120.64145


In [158]:
mod.insert(0, 'zip', repeatted_zips['zip'].astype(int))


In [242]:
len(mod['zip'].unique())

932

In [318]:
## inner will yield 2013-2019, since there are no 2020 measurements from nrel
pge_with_nrel = pd.merge(left=pge_monthly[pge_monthly['YEAR']!= 2020], right=mod, left_on=['zip', 'MONTH', 'YEAR'], right_on=['zip','Month', 'Year'], how='left')

In [319]:
# display(pge_monthly[['zip', 'MONTH', 'YEAR']].head())
# display(mod.head())
# type(pge_monthly['YEAR'][0]), type(mod['Year'][0])
# len(pge_with_nrel['zip'].unique())
# m = 0
# mi = []
# for p in pge_monthly['zip'].unique():
#     if p in mod['zip'].unique():
#         print(p)
#     else:
#         m+=1
#         mi.append(p)
# print('missing:', m)
# print(mi)
# len(pge_with_nrel[pge_with_nrel['Year'].notna()]['zip'].unique())
# pge_with_nrel[pge_with_nrel['Year'].isna()][['zip','MONTH', 'YEAR']]
# len(pge_with_nrel['zip'].unique())
# k = pge_with_nrel[pge_with_nrel['GHI'].isna()]#.groupby('zip').agg('count')
# type(pge_monthly['ZIPCODE'][0]), type(pge_monthly['zip'][0]), type(mod['zip'][0])
pge_with_nrel

Unnamed: 0,zip,city,state,latitude_x,longitude_x,timezone,dst,ZIPCODE,DATE,MONTH,...,Year,GHI,DHI,DNI,Wind Speed,Temperature,Solar Zenith Angle,zipcode,latitude_y,longitude_y
0,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-01-01,1,...,2013,107.88575268817205,29.571236559139784,211.56182795698925,0.3540322580645159,2.3481182795698925,105.82493279569903,95717.0,39.177026,-120.8451
1,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-02-01,2,...,2013,158.59375,36.48511904761905,269.0595238095238,0.40148809523809587,4.525297619047619,99.71912202380952,95717.0,39.177026,-120.8451
2,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-03-01,3,...,2013,183.19892473118279,68.31989247311827,206.93548387096774,0.3981182795698933,8.529569892473118,91.12526881720433,95717.0,39.177026,-120.8451
3,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-04-01,4,...,2013,269.525,66.2375,318.2472222222222,0.40958333333333385,11.322222222222223,82.1030138888889,95717.0,39.177026,-120.8451
4,95717,Gold Run,CA,39.177026,-120.8451,-8,1,95717,2013-05-01,5,...,2013,299.9287634408602,71.88306451612904,342.23118279569894,0.4310483870967741,14.850806451612904,75.24653225806456,95717.0,39.177026,-120.8451
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136963,95811,Sacramnto,CA,38.596700,-121.4942,-8,1,95811,2019-08-01,8,...,2019,311.26344086021504,51.60349462365591,395.27284946236557,2.2737903225806466,26.286290322580637,79.54961021505376,95811.0,38.5967,-121.49420000000002
136964,95811,Sacramnto,CA,38.596700,-121.4942,-8,1,95811,2019-09-01,9,...,2019,244.57083333333333,48.915277777777774,335.4347222222222,2.023055555555555,23.207916666666673,87.9082916666665,95811.0,38.5967,-121.49420000000002
136965,95811,Sacramnto,CA,38.596700,-121.4942,-8,1,95811,2019-10-01,10,...,2019,193.0188172043011,39.31989247311828,304.5766129032258,1.8063172043010742,18.629838709677415,96.89096774193553,95811.0,38.5967,-121.49420000000002
136966,95811,Sacramnto,CA,38.596700,-121.4942,-8,1,95811,2019-11-01,11,...,2019,125.96388888888889,34.84583333333333,219.58333333333334,1.545833333333334,14.595000000000027,104.16968055555537,95811.0,38.5967,-121.49420000000002


In [321]:
## storing this for future reference
pge_with_nrel.to_file('cleaned_data/all-pge-with-nrel/pge-energy-consumption-with-nrel-solar_2013-2019_136968x35.shp')

In [324]:
print('Comparing full pge(2013-2020) and 2013-2019 datasets')
print(pge_monthly.shape)
print(pge_with_nrel.shape)

Comparing full pge(2013-2020) and 2013-2019 datasets
(154283, 24)
(136968, 35)


In [325]:
!ls cleaned_data

[34mall-pge-with-nrel[m[m
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.cpg
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.dbf
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.prj
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.shp
full-pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.shx
[34mfull-pge-with-nrel[m[m
nrel_full.csv
nrel_sample.csv
pge-monthly-consumption_2013-2020.csv
pge-monthly-elec-by-zip_2013-2020.csv
pge-monthly-full-cleaned-by-zip_2013-2020.csv
pge-monthly-gas-by-zip_2013-2020.csv
