# Biogas Prediction Project - Data Exploration

In [1]:
# This notebook explores the biogas production dataset.

In [2]:
# Import required libraries
import sys
import os

# Resolve the kernel's current working directory and put its parent
# directory on sys.path (only once), so modules located one level above
# the working directory can be imported by the cells below.
notebook_path = os.path.abspath('')
project_root = os.path.split(notebook_path)[0]
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

# Import other required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import the load_data function
from data import load_data

%matplotlib inline

In [4]:
# Load the dataset and preview it.
#
# The original cell guarded against `load_data()` returning None, so None is
# presumably its failure signal (confirm against the `data` module). Every
# later cell dereferences `df`, so stop here with a clear message instead of
# letting a later cell fail with an unrelated AttributeError on None.
df = load_data()
if df is None:
    raise RuntimeError(
        "load_data() returned None: the dataset could not be loaded, "
        "so the remaining cells cannot run."
    )

# Report the (rows, columns) shape and show the first rows of the frame.
print("Dataset shape:", df.shape)
display(df.head())

Data loaded successfully!
Dataset shape: (429, 23)


Unnamed: 0,Project Name,Cluster Name,Project Type,City,County,State,Digester Type,Status,Year Operational,Animal/Farm Type(s),...,Swine,Co-Digestion,Biogas Generation Estimate (cu-ft/day),Electricity Generated (kWh/yr),Biogas End Use(s),LCFS Pathway?,System Designer(s)/Developer(s) and Affiliates,Receiving Utility,Total Emission Reductions (MTCO2e/yr),Awarded USDA Funding?
0,Cargill - Sandy River Farm Digester,,Farm Scale,Morrilton,Conway,AR,Covered Lagoon,Operational,2008.0,Swine,...,4200.0,,1814400.0,,Flared Full-time,,Martin Construction Resource LLC (formerly RCM...,,4482.755303,
1,Butterfield RNG Digester,,Farm Scale,Buckeye,Maricopa,AZ,Mixed Plug Flow,Operational,2022.0,Dairy,...,,,,,Pipeline Gas,,"Avolta [Project Developer]; DVO, Inc. (formerl...",Southwest Gas,14030.128975,Yes
2,Caballero Dairy Farms Digester,,Farm Scale,Eloy,Pinal,AZ,Unknown or Unspecified,Construction,2022.0,Dairy,...,,,,,Pipeline Gas,,Brightmark [Project Developer],,98999.890159,
3,Green Gas Partners Stanfield Digester,,Centralized/Regional,Maricopa,Pinal,AZ,Complete Mix,Operational,2018.0,Dairy,...,,,3454500.0,,Pipeline Gas,,Equilibrium Capital [Project Developer]; Green...,Kinger Morgan El Paso,337499.62554,
4,Paloma Dairy Digester,,Farm Scale,Gila Bend,Maricopa,AZ,Complete Mix,Operational,2021.0,Dairy,...,,,,,CNG,,Black Bear Environmental Assets [Project Devel...,Southwest Gas Company,100569.493399,


In [None]:
# Data Exploration
#
# Three quick looks at the loaded frame, in order: its column labels,
# the summary statistics produced by describe(), and the number of
# missing values in each column.

# Column labels, printed as a plain Python list under a heading
print("\nColumns in the dataset:", df.columns.tolist(), sep="\n")

# Summary statistics
summary_stats = df.describe()
print("\nBasic Statistics:")
display(summary_stats)

# Per-column count of missing entries
missing_counts = df.isna().sum()
print("\nMissing Values:")
display(missing_counts)



Columns in the dataset:
['Project Name', 'Cluster Name', 'Project Type', 'City', 'County', 'State', 'Digester Type', 'Status', 'Year Operational', 'Animal/Farm Type(s)', 'Cattle', 'Dairy', 'Poultry', 'Swine', 'Co-Digestion', 'Biogas Generation Estimate (cu-ft/day)', 'Electricity Generated (kWh/yr)', 'Biogas End Use(s)', 'LCFS Pathway?', 'System Designer(s)/Developer(s) and Affiliates', 'Receiving Utility', 'Total Emission Reductions (MTCO2e/yr)', 'Awarded USDA Funding?']

Basic Statistics:


Unnamed: 0,Year Operational,Cattle,Dairy,Poultry,Swine,Biogas Generation Estimate (cu-ft/day),Total Emission Reductions (MTCO2e/yr)
count,413.0,9.0,354.0,7.0,45.0,186.0,399.0
mean,2014.491525,6234.444444,4606.646893,282428.6,23598.555556,338801.0,35127.557018
std,7.182642,12792.163725,5428.725201,411741.0,40575.64673,398914.1,47354.39292
min,1979.0,30.0,45.0,33000.0,10.0,11200.0,4.219814
25%,2009.0,300.0,1312.5,82000.0,4400.0,100000.0,6407.048767
50%,2015.0,2400.0,3000.0,150000.0,8900.0,250000.0,19321.483926
75%,2021.0,4500.0,6500.0,215000.0,28000.0,389340.0,49385.399845
max,2023.0,40000.0,39000.0,1200000.0,239200.0,3454500.0,390000.010506



Missing Values:


Project Name                                        0
Cluster Name                                      300
Project Type                                        0
City                                                0
County                                              8
State                                               0
Digester Type                                       1
Status                                              0
Year Operational                                   16
Animal/Farm Type(s)                                 0
Cattle                                            420
Dairy                                              75
Poultry                                           422
Swine                                             384
Co-Digestion                                      325
Biogas Generation Estimate (cu-ft/day)            243
Electricity Generated (kWh/yr)                    258
Biogas End Use(s)                                   3
LCFS Pathway?               

In [None]:
# Correlation heatmap between the animal-count columns and the biogas
# generation estimate, drawn on an explicitly created figure/axes pair.
numeric_cols = ['Cattle', 'Dairy', 'Poultry', 'Swine',
                'Biogas Generation Estimate (cu-ft/day)']
corr_matrix = df[numeric_cols].corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=heatmap_ax)
heatmap_ax.set_title('Correlation between Animal Counts and Biogas Generation')
heatmap_fig.tight_layout()
plt.show()

# Histogram (30 bins) of the biogas generation estimate column.
biogas_series = df['Biogas Generation Estimate (cu-ft/day)']
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(biogas_series, bins=30, ax=hist_ax)
hist_ax.set_title('Distribution of Biogas Generation')
hist_ax.set_xlabel('Biogas Generation (cu-ft/day)')
plt.show()