In [4]:
#!/usr/bin/env python3

import time

import yaml
import pandas as pd
from sqlalchemy import create_engine

In [5]:
# Read the database connection settings from the local YAML config file.
with open('config.yml') as file:
    config = yaml.safe_load(file)

# Pull the connection settings out of the parsed config, in the order
# host, port, user, password, database. A missing key raises KeyError.
DB_HOST, DB_PORT, DB_USER, DB_PASSWD, DB_NAME = (
    config[key] for key in ('host', 'port', 'user', 'password', 'database')
)

In [None]:
# https://calgem-pid.conservation.ca.gov/pid/2024CaliforniaOilAndGasWells.csv

# Define the years we plan to download from CalGEM.
years = ['2024']

# Custom request headers so CalGEM accepts our request for the data; they are
# handed to pandas through `storage_options`. Background:
# https://stackoverflow.com/questions/55711159/pandas-read-csv-from-url-and-include-request-header
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}

# Create the MySQL engine once; it does not depend on the year being loaded.
# NOTE(review): the credentials are interpolated into the URL unescaped, so a
# user name or password containing characters such as '@' or '/' would need
# URL-quoting first.
disk_engine = create_engine(f'mysql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Iterate over each year we plan to download.
for year_index, year in enumerate(years):
    print(f'Requesting data for year: {year}')

    # Create a dictionary of filenames for each dataset we plan to work with.
    calgem_filenames = {
        "wells_file": f"{year}CaliforniaOilAndGasWells.csv"
    }

    # Construct the CalGEM url needed to download the dataset.
    wells_url = f"https://calgem-pid.conservation.ca.gov/pid/{calgem_filenames['wells_file']}"

    # Download the CSV for this year and parse it into a dataframe.
    df_wells_data = pd.read_csv(wells_url, storage_options=headers)

    # Tag every row with the year it was downloaded for.
    df_wells_data['ReportYear'] = year

    # Keep a local CSV copy of the downloaded data.
    df_wells_data.to_csv(calgem_filenames['wells_file'])

    # Rebuild the table for the first year and append every later year.
    # Using 'replace' on each iteration would drop the rows of all earlier
    # years, leaving only the last year in `wells_data`.
    df_wells_data.to_sql(
        'wells_data',
        disk_engine,
        if_exists='replace' if year_index == 0 else 'append',
    )

# Single quotes inside the replacement field keep this line valid on Python
# versions older than 3.12, which reject reusing the outer quote character.
print(f"Completed Downloading of years: {','.join(years)}")


In [None]:
# https://opendata.arcgis.com/api/v3/datasets/2174fff844e9425ba07acac5cb975b2d_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1

# Base name shared by the local CSV file and the database table.
project_name = 'aquifer_exemptions'
calgem_aquifer_exemption_file = f'{project_name}.csv'

# CSV download endpoint for this dataset.
aquifer_exemptions_url = 'https://opendata.arcgis.com/api/v3/datasets/2174fff844e9425ba07acac5cb975b2d_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1'

# Fetch the CSV from the URL and parse it into a dataframe.
df_aquifer_exemptions = pd.read_csv(aquifer_exemptions_url)

# Pause for 15 seconds before continuing.
time.sleep(15)

# Keep a local CSV copy of the downloaded data.
df_aquifer_exemptions.to_csv(calgem_aquifer_exemption_file)

# Build the MySQL engine from the DB_* settings, then write the dataframe to
# the table named after the project, replacing the table if it already exists.
disk_engine = create_engine(
    f'mysql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)
df_aquifer_exemptions.to_sql(
    name=project_name,
    con=disk_engine,
    if_exists='replace',
)


In [None]:
#  https://opendata.arcgis.com/api/v3/datasets/62a8c86c840c4fea80664276682cc612_3/downloads?spatialRefId=4326&formats=csv&where=1%3D1

# Base name shared by the local CSV file and the database table.
project_name = 'facility_boundaries'
calgem_facility_boundaries_file = f'{project_name}.csv'

# CSV download endpoint for this dataset.
facility_boundaries_url = 'https://opendata.arcgis.com/api/v3/datasets/62a8c86c840c4fea80664276682cc612_3/downloads/data?format=csv&spatialRefId=4326&where=1%3D1'

# Fetch the CSV from the URL and parse it into a dataframe.
df_facility_boundaries = pd.read_csv(facility_boundaries_url)

# Pause for 15 seconds before continuing.
time.sleep(15)

# Keep a local CSV copy of the downloaded data.
df_facility_boundaries.to_csv(calgem_facility_boundaries_file)

# Build the MySQL engine from the DB_* settings, then write the dataframe to
# the table named after the project, replacing the table if it already exists.
disk_engine = create_engine(
    f'mysql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)
df_facility_boundaries.to_sql(
    name=project_name,
    con=disk_engine,
    if_exists='replace',
)


In [None]:
# https://opendata.arcgis.com/api/v3/datasets/a23f5eb6c9cf48a9ad34378a75411bf3_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1

# Base name shared by the local CSV file and the database table.
project_name = 'facility_tanks'
calgem_facility_tanks_file = f'{project_name}.csv'

# CSV download endpoint for this dataset.
facility_tanks_url = 'https://opendata.arcgis.com/api/v3/datasets/a23f5eb6c9cf48a9ad34378a75411bf3_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1'

# Fetch the CSV from the URL and parse it into a dataframe.
df_facility_tanks = pd.read_csv(facility_tanks_url)

# Pause for 15 seconds before continuing.
time.sleep(15)

# Keep a local CSV copy of the downloaded data.
df_facility_tanks.to_csv(calgem_facility_tanks_file)

# Build the MySQL engine from the DB_* settings, then write the dataframe to
# the table named after the project, replacing the table if it already exists.
disk_engine = create_engine(
    f'mysql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)
df_facility_tanks.to_sql(
    name=project_name,
    con=disk_engine,
    if_exists='replace',
)


In [None]:
# https://opendata.arcgis.com/api/v3/datasets/091104d846af4a6881ab5ff03d516fee_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1

# Base name shared by the local CSV file and the database table.
# NOTE(review): the variables in this cell are named `facility_tank_settings`
# but the project name is 'district_boundaries' — confirm which dataset this
# URL actually serves and whether the file/table name is intended.
project_name = 'district_boundaries'
calgem_facility_tank_settings_file = f'{project_name}.csv'

# CSV download endpoint for this dataset.
facility_tank_settings_url = 'https://opendata.arcgis.com/api/v3/datasets/091104d846af4a6881ab5ff03d516fee_0/downloads/data?format=csv&spatialRefId=4326&where=1%3D1'

# Fetch the CSV from the URL and parse it into a dataframe.
df_facility_tank_settings = pd.read_csv(facility_tank_settings_url)

# Pause for 15 seconds before continuing.
time.sleep(15)

# Keep a local CSV copy of the downloaded data.
df_facility_tank_settings.to_csv(calgem_facility_tank_settings_file)

# Build the MySQL engine from the DB_* settings, then write the dataframe to
# the table named after the project, replacing the table if it already exists.
disk_engine = create_engine(
    f'mysql://{DB_USER}:{DB_PASSWD}@{DB_HOST}:{DB_PORT}/{DB_NAME}'
)
df_facility_tank_settings.to_sql(
    name=project_name,
    con=disk_engine,
    if_exists='replace',
)