In [2]:
# Step 1: Import Libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import time
import requests
from io import StringIO

In [4]:
# Purpose: for each year in `years`, download the CalGEM monthly production and
# injection CSVs, roll them up to one row per well (APINumber), left-join both
# roll-ups onto the Wellfinder well list, and write one merged CSV per year.

# Define the years we plan to download from CalGEM
years = ['2023']

# Set pandas to display all columns
pd.set_option('display.max_columns', None)

# Location of the Wellfinder export that every year is merged onto.
# NOTE(review): machine-specific absolute path; edit here when running elsewhere.
WELLFINDER_CSV = r'C:\Users\JWhitson\code\RG3_WaterBoard_AOP\Data_folder\Wellfinder_allwells.csv'

# (connect, read) timeouts in seconds so a stalled CalGEM connection raises
# instead of hanging the kernel forever. The read timeout is the allowed gap
# between received chunks, not a cap on the total download time.
REQUEST_TIMEOUT = (10, 300)

# Custom headers for web scraping (to make the request appear as if it's coming from a browser)
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}

# Load additional CSV file to merge with production data
# This file contains additional well information that will be merged later
additional_data = pd.read_csv(WELLFINDER_CSV)

# The merge key must be text on both sides. This does not depend on the year,
# so it is done once here rather than on every loop iteration.
additional_data['API'] = additional_data['API'].astype(str)

# Iterate over each year we plan to download
for year in years:
    print(f'Currently processing year:({year})')

    # Create a dictionary of filenames for each dataset we plan to work with
    calgem_filenames = {
        "production_file": f"{year}CaliforniaOilAndGasWellMonthlyProduction.csv",
        "injection_file": f"{year}CaliforniaOilAndGasWellMonthlyInjection.csv"
    }

    # Construct the CalGEM URL needed to download the dataset
    production_url = f"https://calgem-pid.conservation.ca.gov/pid/{calgem_filenames['production_file']}"
    injections_url = f"https://calgem-pid.conservation.ca.gov/pid/{calgem_filenames['injection_file']}"

    # Download the production data from CalGEM and load it into a dataframe
    response_production = requests.get(production_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response_production.raise_for_status()  # Ensure the request was successful
    df_production_data = pd.read_csv(StringIO(response_production.text))

    # Download the injection data from CalGEM and load it into a dataframe
    response_injection = requests.get(injections_url, headers=headers, timeout=REQUEST_TIMEOUT)
    response_injection.raise_for_status()  # Ensure the request was successful
    df_injection_data = pd.read_csv(StringIO(response_injection.text))

    # ------------------------------------------------------------------
    # Injection: keep 'Reported' rows and the needed columns only.
    # .copy() gives an independent frame so the column assignments below do
    # not write into a slice of the downloaded data (SettingWithCopyWarning).
    # ------------------------------------------------------------------
    injection_columns = [
        'APINumber',
        'GasAirInjected',
        'SteamWaterInjected',
        'SurfaceInjectionPressure',
        'CasingInjectionPressure',
        'InjectionDate'
    ]
    reported_injection_rows = df_injection_data['ReportedOrEstimated'] == 'Reported'
    df_injection_data = df_injection_data.loc[reported_injection_rows, injection_columns].copy()

    # Remove the last two digits of the API number in the injection dataframe to match the format of the additional data
    df_injection_data['APINumber'] = df_injection_data['APINumber'].astype(str).str[:-2]

    # Coerce the aggregated columns to numbers, as is done for production below,
    # so a stray text value becomes NaN instead of breaking sum/mean. NaNs are
    # left unfilled: sum and mean skip them, so pressure means are not diluted.
    for column in ['GasAirInjected', 'SteamWaterInjected',
                   'SurfaceInjectionPressure', 'CasingInjectionPressure']:
        df_injection_data[column] = pd.to_numeric(df_injection_data[column], errors='coerce')

    # Collapse the monthly rows to one row per APINumber: volumes are summed,
    # pressures are averaged, and the non-null InjectionDate values are counted
    # (giving the number of dated injection records per well).
    df_injection_data = (
        df_injection_data
        .groupby('APINumber')
        .agg({
            'GasAirInjected': 'sum',
            'SteamWaterInjected': 'sum',
            'SurfaceInjectionPressure': 'mean',
            'CasingInjectionPressure': 'mean',
            'InjectionDate': 'count'
        })
        .reset_index()
        .rename(columns={'InjectionDate': 'TotalInjectionCount'})
    )

    # Add a column for the year to the injection dataframe for reference
    df_injection_data['ReportYear_injection'] = year

    # ------------------------------------------------------------------
    # Production: keep 'Reported' rows and the needed columns only.
    # ------------------------------------------------------------------
    production_columns = [
        'APINumber',
        'WellTypeCode',
        'ProductionReportDate',
        'CasingPressure',
        'TubingPressure',
        'APIGravityofOil',
        'OilorCondensateProduced',
        'GasProduced',
        'WaterProduced',
        'ReportedOrEstimated'
    ]
    reported_production_rows = df_production_data['ReportedOrEstimated'] == 'Reported'
    df_production_data = df_production_data.loc[reported_production_rows, production_columns].copy()

    # Rename 'OilorCondensateProduced' to 'Oil Produced' for clarity
    df_production_data = df_production_data.rename(columns={'OilorCondensateProduced': 'Oil Produced'})

    # Remove the last two digits of the API number in the production dataframe to match the format of the additional data
    df_production_data['APINumber'] = df_production_data['APINumber'].astype(str).str[:-2]

    # Add a column for the year to the production dataframe for reference.
    # It is not carried into production_summary (the aggregation below keeps
    # only the three volume sums), so it does not appear in the merged output.
    df_production_data['ReportYear_production'] = year

    # Ensure the columns for aggregation are numeric and handle missing values
    for column in ['Oil Produced', 'GasProduced', 'WaterProduced']:
        df_production_data[column] = pd.to_numeric(df_production_data[column], errors='coerce').fillna(0)

    # Aggregate the production data (Oil, Gas, Water) for each APINumber for the entire year
    production_summary = df_production_data.groupby('APINumber').agg({
        'Oil Produced': 'sum',
        'GasProduced': 'sum',
        'WaterProduced': 'sum'
    }).reset_index()

    # Print the aggregated dataframe to see the results
    print('Aggregated Dataframe (Summed Oil, Gas, Water Production):')
    print(production_summary)

    # Merge injection data with additional wellfinder data.
    # how='left' keeps every Wellfinder row, including wells with no injection records.
    df_final_merged = pd.merge(additional_data, df_injection_data, left_on='API', right_on='APINumber', how='left')

    # Merge production data with the merged wellfinder and injection data.
    # Both sides now carry an 'APINumber' column, so pandas suffixes them as
    # 'APINumber_x' (injection) and 'APINumber_y' (production).
    df_final_merged = pd.merge(df_final_merged, production_summary, left_on='API', right_on='APINumber', how='left')

    # Left-pad API with zeros to a width of 10 characters.
    # NOTE(review): API was already cast to str before the loop, so fillna('')
    # here likely never sees a missing value (on pandas versions where
    # astype(str) stringifies NaN, a missing API is already the text 'nan').
    # Confirm whether wells without an API need separate handling.
    df_final_merged['API'] = df_final_merged['API'].fillna('').astype(str).str.zfill(10)

    # Print the dataframe after adding leading zeros
    print('Dataframe after adding leading zeros:')
    print(df_final_merged.head())

    # Drop the duplicate merge-key columns left behind by the two merges; 'API' is the key we keep
    df_final_merged = df_final_merged.drop(columns=['APINumber_x', 'APINumber_y'])

    # Save the final merged dataframe to a CSV file
    final_output_filename = f"{year}_final_merged_production_injection_data.csv"
    df_final_merged.to_csv(final_output_filename, index=False)
    print(f"Final merged data for year {year} saved to {final_output_filename}")


Currently processing year:(2023)
Aggregated Dataframe (Summed Oil, Gas, Water Production):
       APINumber  Oil Produced  GasProduced  WaterProduced
0      400100001           0.0          0.0            0.0
1      400120004           0.0          0.0            0.0
2      400120008           0.0          0.0            0.0
3      400120009           0.0          0.0            0.0
4      400120012           0.0          0.0            0.0
...          ...           ...          ...            ...
77026  425921745           0.0          0.0            0.0
77027  425921746       23938.0       7212.0      2108671.0
77028  425921754       36727.0      11090.0      1886497.0
77029  425921755           1.0          0.0            0.0
77030  425921756       49495.0      14937.0      1531718.0

[77031 rows x 4 columns]
Dataframe after adding leading zeros:
               Operator Name               Lease Name Well Number         API  \
0          Cal-L Expl. Corp.                  Alegria   