In [1]:
# Step 1: Import Libraries
import numpy as np
import pandas as pd
from sqlalchemy import create_engine
import time
import requests
from io import StringIO

In [2]:
import pandas as pd
import requests
from io import StringIO

# Show every column when a dataframe is printed instead of eliding the middle ones
pd.set_option('display.max_columns', None)

# Years of CalGEM production data that the download loop will process
years = ['2023']

# Wellfinder export holding the extra well information that is later merged
# with the production data.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
additional_data = pd.read_csv(
    r'C:\Users\JWhitson\code\RG3_WaterBoard_AOP\Data_folder\R3 well list_Wellfinder.csv'
)

# Show the wellfinder dataframe so the available columns can be inspected
print('Wellfinder Dataframe:', additional_data, sep='\n')

# (connect, read) timeouts in seconds for the CalGEM download, so a stalled
# server raises an error instead of hanging the notebook forever.
CALGEM_REQUEST_TIMEOUT = (10, 120)

# Custom headers for web scraping (to make the request appear as if it's coming from a browser).
# They do not depend on the year, so they are built once outside the loop.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'
}

# Columns kept from the raw production file (original CalGEM column names)
PRODUCTION_COLUMNS = [
    'APINumber',
    'WellTypeCode',
    'ProductionReportDate',
    'CasingPressure',
    'TubingPressure',
    'APIGravityofOil',
    'OilorCondensateProduced',
    'GasProduced',
    'WaterProduced',
    'ReportedOrEstimated',
    'ReportYear_production'
]

# Volume columns (after the rename below) that are summed per well
PRODUCTION_VALUE_COLUMNS = ['Oil Produced', 'GasProduced', 'WaterProduced']


def normalize_api(api_values: pd.Series, drop_trailing_digits: int = 0, width: int = 10) -> pd.Series:
    """Return API numbers as zero-padded strings that can be used as a merge key.

    The result is the same whether the column was parsed as integers, floats
    (which happens when any value is missing) or text, so both sides of the
    merge end up in one format.

    Parameters
    ----------
    api_values : pd.Series
        Raw API numbers (int, float or string).
    drop_trailing_digits : int, default 0
        Number of characters to cut from the right-hand end before padding.
    width : int, default 10
        Minimum length of the result; shorter values are left-padded with zeros.

    Returns
    -------
    pd.Series
        Object-dtype Series of strings; entries missing in the input stay missing.
    """
    missing = api_values.isna()
    text = (
        api_values.astype(str)
        .str.strip()
        # A float-parsed column renders 408300962 as '408300962.0'
        .str.replace(r'\.0+$', '', regex=True)
    )
    if drop_trailing_digits > 0:
        text = text.str[:-drop_trailing_digits]
    # Strip and re-pad so '408300962' and '0408300962' compare equal
    text = text.str.lstrip('0').str.zfill(width)
    # Keep missing values missing instead of producing keys like '0000000nan'
    return text.where(~missing)


# Put the wellfinder API into the shared key format once; this does not depend on the year
additional_data['API'] = normalize_api(additional_data['API'])

# Iterate over each year we plan to download
for year in years:
    print(f'Currently processing year:({year})')

    # Create a dictionary of filenames for each dataset we plan to work with
    calgem_filenames = {
        "production_file": f"{year}CaliforniaOilAndGasWellMonthlyProduction.csv"
    }

    # Construct the CalGEM URL needed to download the dataset
    production_url = f"https://calgem-pid.conservation.ca.gov/pid/{calgem_filenames['production_file']}"

    # Download the production data from CalGEM and load it into a dataframe
    response_production = requests.get(
        production_url, headers=headers, timeout=CALGEM_REQUEST_TIMEOUT
    )
    response_production.raise_for_status()  # Ensure the request was successful
    # Read APINumber as text so leading zeros survive and missing values cannot
    # turn the column into floats.
    df_production_data = pd.read_csv(
        StringIO(response_production.text),
        dtype={'APINumber': str}
    )

    # Add a column for the year to the production dataframe for reference
    df_production_data['ReportYear_production'] = year

    # Remove the last two digits of the API number to match the format of the additional data
    df_production_data['APINumber'] = normalize_api(
        df_production_data['APINumber'], drop_trailing_digits=2
    )

    # Keep only 'Reported' rows and the needed columns, and rename
    # 'OilorCondensateProduced' to 'Oil Produced' for clarity
    df_production_data = (
        df_production_data
        .loc[df_production_data['ReportedOrEstimated'] == 'Reported', PRODUCTION_COLUMNS]
        .rename(columns={'OilorCondensateProduced': 'Oil Produced'})
    )

    # Ensure the columns for aggregation are numeric; unparseable or missing values count as 0
    df_production_data[PRODUCTION_VALUE_COLUMNS] = (
        df_production_data[PRODUCTION_VALUE_COLUMNS]
        .apply(pd.to_numeric, errors='coerce')
        .fillna(0)
    )

    # Aggregate the production data (Oil, Gas, Water) for each APINumber for the entire year
    production_summary = (
        df_production_data
        .groupby('APINumber', as_index=False)[PRODUCTION_VALUE_COLUMNS]
        .sum()
    )

    # Print the aggregated dataframe to see the results
    print('Aggregated Dataframe (Summed Oil, Gas, Water Production):')
    print(production_summary)

    # Merge the wellfinder information with the aggregated production data on the API number.
    # how='left' keeps every wellfinder row; wells without production get NaN totals.
    # The right-hand key 'APINumber' duplicates 'API' after the merge, so it is dropped.
    df_final_merged = (
        additional_data
        .merge(production_summary, left_on='API', right_on='APINumber', how='left')
        .drop(columns=['APINumber'])
    )

    # Print the merged dataframe (API is already zero-padded by normalize_api)
    print('Dataframe after adding leading zeros:')
    print(df_final_merged.head())

    # Save the final merged dataframe to a CSV file
    final_output_filename = f"{year}_final_merged_production_data.csv"
    df_final_merged.to_csv(final_output_filename, index=False)
    print(f"Final merged data for year {year} saved to {final_output_filename}")


Wellfinder Dataframe:
                  Operator Name               Lease Name Well Number  \
0             Cal-L Expl. Corp.                  Alegria           1   
1        Blackwood & Nichols Co                Hollister           1   
2        Republic Petroleum Co.       Republic-Hollister           1   
3     Oil & Gas Prop Mgmt. Inc.                Hollister           1   
4                       Bmw Co.  Aqua Caliente Core Hole           1   
...                         ...                      ...         ...   
4995  El Dorado Exploration Co.                 BCB-Doud      D-3-32   
4996  El Dorado Exploration Co.              Doud Estate       D-3-4   
4997  El Dorado Exploration Co.                 BCB-Doud      D-4-32   
4998  El Dorado Exploration Co.                V.H.-Doud      D-5-32   
4999  El Dorado Exploration Co.                V.H.-Doud      D-6-32   

            API Well Status Well Type Well Type Label Well Symbol  \
0     408300962     Plugged        OG       