# Data processing - EIA dataset

Author: Huiting Song

In [20]:
# Import packages
import pandas as pd
import sklearn
import numpy as np
import nltk
import json
import matplotlib.pyplot as plt
import seaborn as sns
from codecarbon import EmissionsTracker

In [21]:
experiment_name = "dataprocessing-EIA"

tracker = EmissionsTracker(
    output_dir="/Users/crystal/Desktop/ANLY5550/codecarbon",
    output_file=f"{experiment_name}_emissions.csv",
    log_level="error",  # comment out this line to see regular output
)
tracker.start()

## Dataset from EIA

This dataset is from U.S. energy information administration (EIA). The data contains the monthly net generation for all utility-scale solar in virginia from Jan 2001 to Jan 2024. The data is meaningful in analyzing the amount of electric power genrated from solar in virginia. This can relate to the other dataset in this project to predict the electricity generation by solar and the factors influence the net generation.  

In [22]:
# Read CSV
net_gen = pd.read_csv("/Users/crystal/Desktop/ANLY5550/Data/Raw/Net_generation_for_all_utility-scale_solar_virginia.csv")

# Display the dataframe
net_gen.head() 

Unnamed: 0,description,units,source key,Jan 2001,Feb 2001,Mar 2001,Apr 2001,May 2001,Jun 2001,Jul 2001,...,Apr 2023,May 2023,Jun 2023,Jul 2023,Aug 2023,Sep 2023,Oct 2023,Nov 2023,Dec 2023,Jan 2024
0,Virginia : all sectors,thousand megawatthours,ELEC.GEN.SUN-VA-99.M,0,0,0,0,0,0,0,...,531,592,570,609,564,459,435,326,278,286
1,Virginia : electric power,thousand megawatthours,ELEC.GEN.SUN-VA-98.M,0,0,0,0,0,0,0,...,531,592,570,609,564,459,435,326,278,286
2,Virginia : electric utility,thousand megawatthours,ELEC.GEN.SUN-VA-1.M,0,0,0,0,0,0,0,...,220,241,225,234,222,185,170,134,100,105
3,Virginia : independent power producers,thousand megawatthours,ELEC.GEN.SUN-VA-94.M,--,--,--,--,--,--,--,...,312,351,345,375,342,274,265,192,178,182
4,Virginia : electric utility non-cogen,thousand megawatthours,ELEC.GEN.SUN-VA-2.M,--,--,--,--,--,--,--,...,312,351,345,375,342,274,265,192,178,182


In [23]:
# Melting the DataFrame
net_gen_long = pd.melt(net_gen, id_vars=['description', 'units', 'source key'], var_name='Date', value_name='value')

In [24]:
net_gen_long.head()

Unnamed: 0,description,units,source key,Date,value
0,Virginia : all sectors,thousand megawatthours,ELEC.GEN.SUN-VA-99.M,Jan 2001,0
1,Virginia : electric power,thousand megawatthours,ELEC.GEN.SUN-VA-98.M,Jan 2001,0
2,Virginia : electric utility,thousand megawatthours,ELEC.GEN.SUN-VA-1.M,Jan 2001,0
3,Virginia : independent power producers,thousand megawatthours,ELEC.GEN.SUN-VA-94.M,Jan 2001,--
4,Virginia : electric utility non-cogen,thousand megawatthours,ELEC.GEN.SUN-VA-2.M,Jan 2001,--


In [25]:
# Convert the 'Date' column to datetime type specifying the format to capture month and year
net_gen_long['Date'] = pd.to_datetime(net_gen_long['Date'], format='%b %Y')

# Ensure the DataFrame is sorted by Date
net_gen_long = net_gen_long.sort_values(by='Date')

# Convert '--' to NaN
net_gen_long['value'].replace('--', np.nan, inplace=True)

# Drop the 'description', 'units', and 'source key' columns as they are not needed
net_gen_long.drop(['units', 'source key'], axis=1, inplace=True)

In [26]:
net_gen_long.head(15)

Unnamed: 0,description,Date,value
0,Virginia : all sectors,2001-01-01,0.0
12,Virginia : residential,2001-01-01,
11,Virginia : industrial cogen,2001-01-01,
10,Virginia : industrial non-cogen,2001-01-01,
8,Virginia : commercial cogen,2001-01-01,
7,Virginia : commercial non-cogen,2001-01-01,
9,Virginia : all industrial,2001-01-01,
5,Virginia : electric utility cogen,2001-01-01,
4,Virginia : electric utility non-cogen,2001-01-01,
3,Virginia : independent power producers,2001-01-01,


In [27]:
# Replace non-numeric entries first.
net_gen_long['value'] = pd.to_numeric(net_gen_long['value'], errors='coerce')

# Pivot the table
net_gen_pivoted = net_gen_long.pivot(index='Date', columns='description', values='value')


In [28]:
# The pivot operation retains the original row index. So reset the index to make 'Date', 'units', and 'source key' into columns
net_gen_pivoted.head()

description,Virginia : all commercial,Virginia : all industrial,Virginia : all sectors,Virginia : commercial cogen,Virginia : commercial non-cogen,Virginia : electric power,Virginia : electric utility,Virginia : electric utility cogen,Virginia : electric utility non-cogen,Virginia : independent power producers,Virginia : industrial cogen,Virginia : industrial non-cogen,Virginia : residential
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2001-01-01,,,0.0,,,0.0,0.0,,,,,,
2001-02-01,,,0.0,,,0.0,0.0,,,,,,
2001-03-01,,,0.0,,,0.0,0.0,,,,,,
2001-04-01,,,0.0,,,0.0,0.0,,,,,,
2001-05-01,,,0.0,,,0.0,0.0,,,,,,


In [29]:
# Remove the column MultiIndex created by pivot
net_gen_pivoted.columns = [col for col in net_gen_pivoted.columns]

# Reset the index to move 'Date' back to a column if desired
net_gen_pivoted.reset_index(inplace=True)

In [30]:
net_gen_pivoted.head()

Unnamed: 0,Date,Virginia : all commercial,Virginia : all industrial,Virginia : all sectors,Virginia : commercial cogen,Virginia : commercial non-cogen,Virginia : electric power,Virginia : electric utility,Virginia : electric utility cogen,Virginia : electric utility non-cogen,Virginia : independent power producers,Virginia : industrial cogen,Virginia : industrial non-cogen,Virginia : residential
0,2001-01-01,,,0.0,,,0.0,0.0,,,,,,
1,2001-02-01,,,0.0,,,0.0,0.0,,,,,,
2,2001-03-01,,,0.0,,,0.0,0.0,,,,,,
3,2001-04-01,,,0.0,,,0.0,0.0,,,,,,
4,2001-05-01,,,0.0,,,0.0,0.0,,,,,,


Since i want to analyze the `Virginia : electric power`, so i will just keep this columns as the target variables. Also. the valid data is from Jan.2017, so the date will be from Jan 2017 to jan 2024.

In [31]:
# Selecting only the Date and Virginia : electric power columns
net_gen_target = net_gen_pivoted[['Date', 'Virginia : electric power']]

# Rename the 'Virginia : electric power' column
net_gen_target.rename(columns={'Virginia : electric power': 'net_gen_electric_power'}, inplace=True)

# Filter the DataFrame to keep only entries from January 2017 onwards
net_gen_filtered = net_gen_target[net_gen_target['Date'] >= '2017-01-01']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  net_gen_target.rename(columns={'Virginia : electric power': 'net_gen_electric_power'}, inplace=True)


In [32]:
net_gen_filtered.head()

Unnamed: 0,Date,net_gen_electric_power
192,2017-01-01,15.0
193,2017-02-01,19.0
194,2017-03-01,23.0
195,2017-04-01,25.0
196,2017-05-01,28.0


In [33]:
# save the appended dataframe to the RAW data folder
net_gen_filtered.to_csv('/Users/crystal/Desktop/ANLY5550/Data/Cleaned/EIA_net_generation.csv', index=False)

In [34]:
tracker.stop()

2.6738562391674378e-06