In [1]:
import numpy as np
import pandas as pd

In [2]:
def clean_col(_df, col, substitute, list_of_replacements, inplace=True):
    '''
    Replace every value listed in `list_of_replacements` with `substitute`
    in the specified column `col` of the given dataframe `_df`.

    _df: the pandas dataframe
    col: the specific column that contains the values in list_of_replacements
    substitute: the desired value. This should be the standardized string.
    list_of_replacements: list of alternatives that will be replaced with substitute
    inplace: when True (the default) `_df` itself is modified; when False a copy
             is modified and `_df` is left untouched

    Returns the dataframe that was modified (`_df` itself when `inplace` is True).
    Raises AssertionError when `col` is not a column of `_df` or when
    `list_of_replacements` is not a list.
    '''
    ## validate first so a bad call never pays for a copy of the frame
    assert col in _df.columns, "Make sure the column exists in your DataFrame"
    assert isinstance(list_of_replacements, list), "`list_of_replacements` must be a list"

    ## allow in place substitutions or new copy
    df = _df if inplace else _df.copy()

    ## one replace call handles every alternative; an empty list is a no-op
    if list_of_replacements:
        df[col] = df[col].replace(list_of_replacements, substitute)
    return df

def col_value_counts(df, col):
    '''
    Pretty prints the values found in column `col` of dataframe `df`, together
    with how often each occurs (as reported by `value_counts()`).

    Each value is printed between backticks so that leading/trailing spaces
    in the text are easy to spot.
    '''
    assert col in df.columns, "Make sure the column exists in your DataFrame"

    template = '`{k}` : {v}'
    counts = df[col].value_counts()
    for label, count in zip(counts.index, counts.tolist()):
        print(template.format(k=label, v=count))
    
    
def pretty_print_cols(df):
    '''
    Prints the name of every column of dataframe `df`, one per line.
    '''
    ## plain loop: printing is a side effect, so no throwaway list is built
    for column_name in df.columns:
        print(column_name)

In [3]:
## Shell escape: list the files in the `cleaned_data` directory that the cells below read from and write to
!ls cleaned_data

pge-monthly-consumption_2013-2020.csv
pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.cpg
pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.dbf
pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.prj
pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.shp
pge-monthly-consumption_2013-2020_with-zipcodes-and-cities.shx
pge-monthly-elec-by-zip_2013-2020.csv
pge-monthly-gas-by-zip_2013-2020.csv


In [4]:
## Path of the combined (electric + gas) monthly consumption file
combinded_pge_fn = "cleaned_data/pge-monthly-consumption_2013-2020.csv"
## low_memory=False makes pandas infer each column's dtype from the whole file
## instead of chunk by chunk, so a column cannot end up with mixed types.
## NOTE(review): the saved output of this cell looks like the tail of a pandas
## warning raised by this read - confirm it is gone after re-running.
pge_full = pd.read_csv(combinded_pge_fn, low_memory=False)


  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
## Quick look at the freshly loaded frame: dimensions, column labels, first rows
for summary in (pge_full.shape, pge_full.columns):
    print(summary)
pge_full.head()

(154283, 13)
Index(['Unnamed: 0', 'ZIPCODE', 'MONTH', 'YEAR', 'CUSTOMERCLASS', 'COMBINED',
       'TOTALCUSTOMERS', 'TOTALKWH', 'AVERAGEKWH', 'TOTALTHM', 'AVERAGETHM',
       'TOTALTHERMS', 'AVERAGETHERMS'],
      dtype='object')


Unnamed: 0.1,Unnamed: 0,ZIPCODE,MONTH,YEAR,CUSTOMERCLASS,COMBINED,TOTALCUSTOMERS,TOTALKWH,AVERAGEKWH,TOTALTHM,AVERAGETHM,TOTALTHERMS,AVERAGETHERMS
0,4956,93101.0,1.0,2013.0,Elec- Residential,Y,0,0,,,,,
1,4957,93101.0,2.0,2013.0,Elec- Residential,Y,0,0,,,,,
2,4958,93101.0,3.0,2013.0,Elec- Residential,Y,0,0,,,,,
3,4959,93105.0,1.0,2013.0,Elec- Residential,Y,0,0,,,,,
4,4960,93105.0,2.0,2013.0,Elec- Residential,Y,0,0,,,,,


## Data Types for Zipcode, Month, and Year

We want certain columns to be integers.

In [6]:
## Specify which columns have certain data types
## (casting to int drops any fractional part of the values)
convert_dict = {
    'ZIPCODE': int,
    'MONTH': int,
    'YEAR': int,
    'TOTALCUSTOMERS': int,
    'TOTALKWH': int,
    'AVERAGEKWH': int,
    'TOTALTHM': int,
    'AVERAGETHM': int,
    'TOTALTHERMS': int,
    'AVERAGETHERMS': int
}

## Drop columns we dont want to keep
bad_index = 'Unnamed: 0'
if bad_index in pge_full.columns:
    pge_full = pge_full.drop(columns=bad_index)

## Get rid of commas for the columns that are supposed to be integers,
## and treat missing values as 0
pge_full = pge_full.replace(',', '', regex=True).fillna(0)

numerical_columns = ['TOTALCUSTOMERS', 'TOTALKWH', 'AVERAGEKWH', 'TOTALTHM', 'AVERAGETHM','TOTALTHERMS', 'AVERAGETHERMS']
for num_col in numerical_columns:
    ## errors='coerce' turns anything unparseable into NaN, so fill again here;
    ## otherwise the int cast below raises on the first coerced value
    pge_full[num_col] = pd.to_numeric(pge_full[num_col], errors='coerce').fillna(0)

pge_full = pge_full.astype(convert_dict)

In [7]:
## Same overview as after loading, now that the index column is dropped and dtypes are set
for summary in (pge_full.shape, pge_full.columns):
    print(summary)
pge_full.head()

(154283, 12)
Index(['ZIPCODE', 'MONTH', 'YEAR', 'CUSTOMERCLASS', 'COMBINED',
       'TOTALCUSTOMERS', 'TOTALKWH', 'AVERAGEKWH', 'TOTALTHM', 'AVERAGETHM',
       'TOTALTHERMS', 'AVERAGETHERMS'],
      dtype='object')


Unnamed: 0,ZIPCODE,MONTH,YEAR,CUSTOMERCLASS,COMBINED,TOTALCUSTOMERS,TOTALKWH,AVERAGEKWH,TOTALTHM,AVERAGETHM,TOTALTHERMS,AVERAGETHERMS
0,93101,1,2013,Elec- Residential,Y,0,0,0,0,0,0,0
1,93101,2,2013,Elec- Residential,Y,0,0,0,0,0,0,0
2,93101,3,2013,Elec- Residential,Y,0,0,0,0,0,0,0
3,93105,1,2013,Elec- Residential,Y,0,0,0,0,0,0,0
4,93105,2,2013,Elec- Residential,Y,0,0,0,0,0,0,0


## Consolidating Columns

Looking at `TOTALTHM` and `TOTALTHERMS`, these two columns should be exactly the same; they're simply named differently. They both represent gas usage.

This is the same case for `AVERAGETHM` and `AVERAGETHERMS`.

We will simply add each pair of columns together and drop the redundant one.

In [8]:
## Spot-check one zip code's 2013 rows, ordered by month, before the columns are merged
is_sample_zip_2013 = pge_full['ZIPCODE'].eq(96090) & pge_full['YEAR'].eq(2013)
pge_full.loc[is_sample_zip_2013].sort_values(by='MONTH')

Unnamed: 0,ZIPCODE,MONTH,YEAR,CUSTOMERCLASS,COMBINED,TOTALCUSTOMERS,TOTALKWH,AVERAGEKWH,TOTALTHM,AVERAGETHM,TOTALTHERMS,AVERAGETHERMS
2549,96090,1,2013,Elec- Residential,N,185,124847,675,0,0,0,0
4338,96090,1,2013,Gas- Residential,N,148,0,0,11489,78,0,0
2550,96090,2,2013,Elec- Residential,N,185,97928,529,0,0,0,0
4339,96090,2,2013,Gas- Residential,N,148,0,0,7408,50,0,0
2551,96090,3,2013,Elec- Residential,N,186,94196,506,0,0,0,0
4340,96090,3,2013,Gas- Residential,N,149,0,0,4617,31,0,0
6894,96090,4,2013,Elec- Residential,N,186,89356,480,0,0,0,0
8685,96090,4,2013,Gas- Residential,N,149,0,0,2711,18,0,0
6895,96090,5,2013,Elec- Residential,N,185,107253,580,0,0,0,0
8686,96090,5,2013,Gas- Residential,N,149,0,0,1824,12,0,0


In [9]:
## Fold each *THM column into its *THERMS counterpart, then discard the *THM columns
for therms_col, thm_col in (('TOTALTHERMS', 'TOTALTHM'), ('AVERAGETHERMS', 'AVERAGETHM')):
    pge_full[therms_col] = pge_full[therms_col].add(pge_full[thm_col])

pge_full = pge_full.drop(columns=['TOTALTHM', 'AVERAGETHM'])

In [10]:
## Same spot-check as before the merge: zip code 96090, year 2013, ordered by month
is_sample_zip_2013 = pge_full['ZIPCODE'].eq(96090) & pge_full['YEAR'].eq(2013)
pge_full.loc[is_sample_zip_2013].sort_values(by='MONTH')

Unnamed: 0,ZIPCODE,MONTH,YEAR,CUSTOMERCLASS,COMBINED,TOTALCUSTOMERS,TOTALKWH,AVERAGEKWH,TOTALTHERMS,AVERAGETHERMS
2549,96090,1,2013,Elec- Residential,N,185,124847,675,0,0
4338,96090,1,2013,Gas- Residential,N,148,0,0,11489,78
2550,96090,2,2013,Elec- Residential,N,185,97928,529,0,0
4339,96090,2,2013,Gas- Residential,N,148,0,0,7408,50
2551,96090,3,2013,Elec- Residential,N,186,94196,506,0,0
4340,96090,3,2013,Gas- Residential,N,149,0,0,4617,31
6894,96090,4,2013,Elec- Residential,N,186,89356,480,0,0
8685,96090,4,2013,Gas- Residential,N,149,0,0,2711,18
6895,96090,5,2013,Elec- Residential,N,185,107253,580,0,0
8686,96090,5,2013,Gas- Residential,N,149,0,0,1824,12


## Sanity Check Average Columns

We calculate our own averages for the therms and kwh columns and find the difference from the given columns from PG&E. We then display these sorted by the difference to see if there are any significant differences.

In [11]:
## Averages recomputed from the totals: total usage divided by customer count
customer_counts = pge_full['TOTALCUSTOMERS']
pge_full['Calculated AVGKWH'] = pge_full['TOTALKWH'].div(customer_counts)
pge_full['Calculated AVGTHERMS'] = pge_full['TOTALTHERMS'].div(customer_counts)

## How far each recomputed average is from the average supplied in the data
pge_full['AVGKWH Diff'] = pge_full['Calculated AVGKWH'].sub(pge_full['AVERAGEKWH'])
pge_full['AVGTHERMS Diff'] = pge_full['Calculated AVGTHERMS'].sub(pge_full['AVERAGETHERMS'])

## 0/0 yields NaN, which is reset to 0 here.
## (a non-zero total with zero customers would yield inf, which fillna leaves as is)
pge_full = pge_full.fillna(0)

In [12]:
## Show the frame ordered by each difference column, largest difference first
for diff_col in ('AVGKWH Diff', 'AVGTHERMS Diff'):
    display(pge_full.sort_values(by=diff_col, ascending=False))


Unnamed: 0,ZIPCODE,MONTH,YEAR,CUSTOMERCLASS,COMBINED,TOTALCUSTOMERS,TOTALKWH,AVERAGEKWH,TOTALTHERMS,AVERAGETHERMS,Calculated AVGKWH,Calculated AVGTHERMS,AVGKWH Diff,AVGTHERMS Diff
62048,94957,12,2015,Elec- Residential,N,750,964125,1285,0,0,1285.5,0.0,0.5,0.0
48125,94569,1,2015,Elec- Residential,N,86,46655,542,0,0,542.5,0.0,0.5,0.0
33573,95558,6,2014,Elec- Residential,N,134,81137,605,0,0,605.5,0.0,0.5,0.0
14515,95257,12,2013,Elec- Residential,N,254,179451,706,0,0,706.5,0.0,0.5,0.0
117090,95569,11,2018,Elec- Residential,N,118,107321,909,0,0,909.5,0.0,0.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28462,95629,12,2013,Elec- Residential,N,410,356905,871,0,0,870.5,0.0,-0.5,0.0
1892,95542,1,2013,Elec- Residential,Y,732,1182546,1616,0,0,1615.5,0.0,-0.5,0.0
13340,93623,11,2013,Elec- Residential,N,194,56357,291,0,0,290.5,0.0,-0.5,0.0
112273,95310,9,2018,Elec- Residential,N,696,472932,680,0,0,679.5,0.0,-0.5,0.0


Unnamed: 0,ZIPCODE,MONTH,YEAR,CUSTOMERCLASS,COMBINED,TOTALCUSTOMERS,TOTALKWH,AVERAGEKWH,TOTALTHERMS,AVERAGETHERMS,Calculated AVGKWH,Calculated AVGTHERMS,AVGKWH Diff,AVGTHERMS Diff
140477,95041,3,2020,Gas- Residential,N,310,0,0,17515,56,0.0,56.5,0.0,0.5
109260,93962,5,2018,Gas- Residential,N,272,0,0,8840,32,0.0,32.5,0.0,0.5
135256,93450,11,2019,Gas- Residential,N,122,0,0,4087,33,0.0,33.5,0.0,0.5
50783,95328,1,2015,Gas- Residential,N,1112,0,0,66164,59,0.0,59.5,0.0,0.5
38764,93608,2,2014,Gas- Residential,Y,102,0,0,4437,43,0.0,43.5,0.0,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135352,93665,11,2019,Gas- Residential,N,188,0,0,6674,36,0.0,35.5,0.0,-0.5
140229,94586,1,2020,Gas- Residential,N,250,0,0,23625,95,0.0,94.5,0.0,-0.5
46917,94957,11,2014,Gas- Residential,N,718,0,0,72877,102,0.0,101.5,0.0,-0.5
17070,95677,11,2013,Gas- Residential,N,8706,0,0,378711,44,0.0,43.5,0.0,-0.5


## Creating Datetime column from month and year

Building a datetime requires a day, so we arbitrarily use the 1st of each month and add it as a new column.


In [13]:
## Every row gets DAY = 1, placed at column position 3.
## Guarded so that re-running the cell does not fail because DAY already exists.
if 'DAY' not in pge_full.columns:
    pge_full.insert(3, 'DAY', np.repeat(1, len(pge_full)))

In [14]:
## Assemble a datetime from the YEAR / MONTH / DAY columns and place it at column position 1.
## Guarded so that re-running the cell does not fail because DATE already exists.
if 'DATE' not in pge_full.columns:
    pge_full.insert(1, 'DATE', pd.to_datetime(pge_full[['YEAR', 'MONTH', 'DAY']]))

In [15]:
## Preview the first five rows to confirm the new DATE and DAY columns
pge_full.head(n=5)

Unnamed: 0,ZIPCODE,DATE,MONTH,YEAR,DAY,CUSTOMERCLASS,COMBINED,TOTALCUSTOMERS,TOTALKWH,AVERAGEKWH,TOTALTHERMS,AVERAGETHERMS,Calculated AVGKWH,Calculated AVGTHERMS,AVGKWH Diff,AVGTHERMS Diff
0,93101,2013-01-01,1,2013,1,Elec- Residential,Y,0,0,0,0,0,0.0,0.0,0.0,0.0
1,93101,2013-02-01,2,2013,1,Elec- Residential,Y,0,0,0,0,0,0.0,0.0,0.0,0.0
2,93101,2013-03-01,3,2013,1,Elec- Residential,Y,0,0,0,0,0,0.0,0.0,0.0,0.0
3,93105,2013-01-01,1,2013,1,Elec- Residential,Y,0,0,0,0,0,0.0,0.0,0.0,0.0
4,93105,2013-02-01,2,2013,1,Elec- Residential,Y,0,0,0,0,0,0.0,0.0,0.0,0.0


### Separate Electricity and Gas

In [16]:
## Distinct customer classes present in the data
pge_full.loc[:, 'CUSTOMERCLASS'].unique()

array(['Elec- Residential', 'Gas- Residential'], dtype=object)

In [17]:
## Split the rows by customer class into a gas frame and an electricity frame
customer_class = pge_full['CUSTOMERCLASS']
pge_full_gas = pge_full.loc[customer_class == 'Gas- Residential']
pge_full_elec = pge_full.loc[customer_class == 'Elec- Residential']


In [18]:
## Write each split frame to its own CSV (the row index is written as the first column)
split_outputs = (
    (pge_full_gas, 'cleaned_data/pge-monthly-gas-by-zip_2013-2020.csv'),
    (pge_full_elec, 'cleaned_data/pge-monthly-elec-by-zip_2013-2020.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path)

In [22]:
## Report the final dimensions, then write the full cleaned frame to CSV
full_csv_path = 'cleaned_data/pge-monthly-full-cleaned-by-zip_2013-2020.csv'
print(pge_full.shape)
pge_full.to_csv(full_csv_path)

(154283, 16)
