# **Program to download and pre-process North Dakota well data for sub-county analysis**

# **1.a  Download North Dakota monthly production data from DMR ~10min**



In [1]:
import requests
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta


In [None]:
# create list of all yyyy-mm for data download (Note, May 2015 is the first month ND data is available in xlxs)
myList = pd.date_range('2015-04-10',datetime.today()- relativedelta(months=3), freq='MS').strftime("%Y_%m").tolist()  # with month in numeric format

# Access the ND DMR data @ https://www.dmr.nd.gov/oilgas/mprindex.asp
website = "https://www.dmr.nd.gov/oilgas/mpr/"

In [None]:
# Review the list of monthly data
myList

In [None]:
NDMWD = []  #list()  # North Dakota Monthly Well (NDMWD) Data download process ~ 10 minutes
for month in myList:
    target_url = website + month + '.xlsx'
    print(target_url)
    r = requests.get(target_url)
    wb = pd.read_excel(r.content)
    NDMWD.append(wb)

NDMWD = pd.concat(NDMWD)

https://www.dmr.nd.gov/oilgas/mpr/2015_05.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2015_06.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2015_07.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2015_08.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2015_09.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2015_10.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2015_11.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2015_12.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_01.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_02.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_03.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_04.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_05.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_06.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_07.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_08.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_09.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_10.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_11.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2016_12.xlsx
https://www.dmr.nd.gov/oilgas/mpr/2017_01.xlsx
https://www.d

In [None]:
#Create ND monthly well data table in csv format, which can be downloaded from folder on left hand side
NDMWD.to_csv('NDMWD.csv', index=False)

In [None]:
NDMWD.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1706085 entries, 0 to 20182
Data columns (total 21 columns):
 #   Column      Dtype         
---  ------      -----         
 0   ReportDate  datetime64[ns]
 1   API_WELLNO  int64         
 2   FileNo      int64         
 3   Company     object        
 4   WellName    object        
 5   Quarter     object        
 6   Section     float64       
 7   Township    float64       
 8   Range       float64       
 9   County      object        
 10  FieldName   object        
 11  Pool        object        
 12  Oil         float64       
 13  Wtr         float64       
 14  Days        float64       
 15  Runs        int64         
 16  Gas         float64       
 17  GasSold     int64         
 18  Flared      float64       
 19  Lat         float64       
 20  Long        float64       
dtypes: datetime64[ns](1), float64(10), int64(4), object(6)
memory usage: 286.4+ MB


# **1.b Extract first month oil and gas produciton data, latitude and longitude**

In [None]:
#NDMWD = pd.read_csv('NDMWD.csv')
NDMWD.drop_duplicates(inplace=True)
NDMWD = NDMWD[NDMWD['Days'] >= 25].reset_index(drop=True)
NDMWD['ReportDate'] = pd.to_datetime(NDMWD['ReportDate'], errors = 'coerce')
NDMWD.tail(2)

Unnamed: 0,ReportDate,API_WELLNO,FileNo,Company,WellName,Quarter,Section,Township,Range,County,...,Pool,Oil,Wtr,Days,Runs,Gas,GasSold,Flared,Lat,Long
1234977,2023-06-01,33009012640000,6347,EMPIRE NORTH DAKOTA LLC,RICE 2,SENW,26.0,163.0,79.0,BOT,...,SPEARFISH/MADISON,25.0,268.0,30.0,0,0.0,0,0.0,48.917191,-100.876462
1234978,2023-06-01,33009019570000,13685,EMPIRE NORTH DAKOTA LLC,RICE 3,NENW,26.0,163.0,79.0,BOT,...,SPEARFISH/MADISON,51.0,6565.0,30.0,0,0.0,0,0.0,48.920808,-100.877109


In [None]:
ffmWells = NDMWD[['API_WELLNO', 'Pool','ReportDate']].groupby('API_WELLNO').min().rename(columns={'ReportDate':'ffMonth'}).reset_index()
ffmWells.tail(3)

Unnamed: 0,API_WELLNO,Pool,ffMonth
20439,33105058760000,BAKKEN,2023-05-01
20440,33105061150000,MADISON,2023-06-01
20441,33105903870000,DAKOTA,2018-01-01


In [None]:
ffmWells.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20442 entries, 0 to 20441
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   API_WELLNO  20442 non-null  int64         
 1   Pool        20442 non-null  object        
 2   ffMonth     20442 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 479.2+ KB


In [None]:
ffmWells = ffmWells.merge(NDMWD[['API_WELLNO','Pool','ReportDate','Oil','Gas','Wtr','Days','Runs', 'Lat', 'Long']], how='left', left_on=['API_WELLNO','Pool','ffMonth'], right_on = ['API_WELLNO','Pool','ReportDate'])
ffmWells.dropna(inplace=True)
ffmWells.reset_index(drop=True)
ffmWells.tail()

Unnamed: 0,API_WELLNO,Pool,ffMonth,ReportDate,Oil,Gas,Wtr,Days,Runs,Lat,Long
20437,33105058570000,BAKKEN,2023-05-01,2023-05-01,26701.0,24497.0,64246.0,31.0,26751.0,48.167755,-103.719029
20438,33105058750000,BAKKEN,2023-03-01,2023-03-01,40250.0,31819.0,93444.0,25.0,39436.0,48.169415,-103.699984
20439,33105058760000,BAKKEN,2023-05-01,2023-05-01,31521.0,26441.0,69949.0,31.0,31478.0,48.169418,-103.700107
20440,33105061150000,MADISON,2023-06-01,2023-06-01,938.0,27.0,686.0,30.0,1282.0,48.016557,-103.859187
20441,33105903870000,DAKOTA,2018-01-01,2018-01-01,410.0,0.0,0.0,31.0,946.0,48.404169,-103.475385


In [None]:
ffmWells.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20412 entries, 0 to 20441
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   API_WELLNO  20412 non-null  int64         
 1   Pool        20412 non-null  object        
 2   ffMonth     20412 non-null  datetime64[ns]
 3   ReportDate  20412 non-null  datetime64[ns]
 4   Oil         20412 non-null  float64       
 5   Gas         20412 non-null  float64       
 6   Wtr         20412 non-null  float64       
 7   Days        20412 non-null  float64       
 8   Runs        20412 non-null  float64       
 9   Lat         20412 non-null  float64       
 10  Long        20412 non-null  float64       
dtypes: datetime64[ns](2), float64(7), int64(1), object(1)
memory usage: 1.9+ MB


In [None]:
ffmWells.drop('ReportDate', axis=1, inplace=True )
ffmWells['bpd'] =  ffmWells.Oil/ffmWells.Days
ffmWells['Mcfpd'] =  ffmWells.Gas/ffmWells.Days

In [None]:
ffmWells.tail()

Unnamed: 0,API_WELLNO,Pool,ffMonth,Oil,Gas,Wtr,Days,Runs,Lat,Long,bpd,Mcfpd
20437,33105058570000,BAKKEN,2023-05-01,26701.0,24497.0,64246.0,31.0,26751.0,48.167755,-103.719029,861.322581,790.225806
20438,33105058750000,BAKKEN,2023-03-01,40250.0,31819.0,93444.0,25.0,39436.0,48.169415,-103.699984,1610.0,1272.76
20439,33105058760000,BAKKEN,2023-05-01,31521.0,26441.0,69949.0,31.0,31478.0,48.169418,-103.700107,1016.806452,852.935484
20440,33105061150000,MADISON,2023-06-01,938.0,27.0,686.0,30.0,1282.0,48.016557,-103.859187,31.266667,0.9
20441,33105903870000,DAKOTA,2018-01-01,410.0,0.0,0.0,31.0,946.0,48.404169,-103.475385,13.225806,0.0


In [None]:
ffmWells.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20412 entries, 0 to 20441
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   API_WELLNO  20412 non-null  int64         
 1   Pool        20412 non-null  object        
 2   ffMonth     20412 non-null  datetime64[ns]
 3   Oil         20412 non-null  float64       
 4   Gas         20412 non-null  float64       
 5   Wtr         20412 non-null  float64       
 6   Days        20412 non-null  float64       
 7   Runs        20412 non-null  float64       
 8   Lat         20412 non-null  float64       
 9   Long        20412 non-null  float64       
 10  bpd         20412 non-null  float64       
 11  Mcfpd       20412 non-null  float64       
dtypes: datetime64[ns](1), float64(9), int64(1), object(1)
memory usage: 2.0+ MB


# **1.c Wells latitude and longitude to MHB allocation**

In [None]:
# Maidenhead grid locators courtesy of Walter Underwood K6WRU:
# https://ham.stackexchange.com/questions/221/how-can-one-convert-from-lat-long-to-grid-square


In [None]:
upper = 'ABCDEFGHIJKLMNOPQRSTUVWX'
lower = 'abcdefghijklmnopqrstuvwx'

In [None]:
def gridCalc(row):
    dec_lat = row['Lat']
    dec_lon = row['Long']
    if not (-180<=dec_lon<180):
        sys.stderr.write('longitude must be -180<=lon<180, given %f\n'%dec_lon)
        sys.exit(32)
    if not (-90<=dec_lat<90):
        sys.stderr.write('latitude must be -90<=lat<90, given %f\n'%dec_lat)
        sys.exit(33) # can't handle north pole, sorry, [A-R]

    adj_lat = dec_lat + 90.0
    adj_lon = dec_lon + 180.0

    grid_lat_sq = upper[int(adj_lat/10)];
    grid_lon_sq = upper[int(adj_lon/20)];

    grid_lat_field = str(int(adj_lat%10))
    grid_lon_field = str(int((adj_lon/2)%10))

    adj_lat_remainder = (adj_lat - int(adj_lat)) * 60
    adj_lon_remainder = ((adj_lon) - int(adj_lon/2)*2) * 60

    grid_lat_subsq = lower[int(adj_lat_remainder/2.5)]
    grid_lon_subsq = lower[int(adj_lon_remainder/5)]

    return grid_lon_sq + grid_lat_sq + grid_lon_field + grid_lat_field + grid_lon_subsq + grid_lat_subsq

In [None]:
ffmWells['MHB'] = ffmWells.apply(gridCalc, axis=1)

In [None]:
ffmWells.tail()

Unnamed: 0,API_WELLNO,Pool,ffMonth,Oil,Gas,Wtr,Days,Runs,Lat,Long,bpd,Mcfpd,MHB
20437,33105058570000,BAKKEN,2023-05-01,26701.0,24497.0,64246.0,31.0,26751.0,48.167755,-103.719029,861.322581,790.225806,DN88de
20438,33105058750000,BAKKEN,2023-03-01,40250.0,31819.0,93444.0,25.0,39436.0,48.169415,-103.699984,1610.0,1272.76,DN88de
20439,33105058760000,BAKKEN,2023-05-01,31521.0,26441.0,69949.0,31.0,31478.0,48.169418,-103.700107,1016.806452,852.935484,DN88de
20440,33105061150000,MADISON,2023-06-01,938.0,27.0,686.0,30.0,1282.0,48.016557,-103.859187,31.266667,0.9,DN88ba
20441,33105903870000,DAKOTA,2018-01-01,410.0,0.0,0.0,31.0,946.0,48.404169,-103.475385,13.225806,0.0,DN88gj


In [None]:
ffmWells.to_csv('ffmWells.csv', index=False)

# **2.a Download well bore data from ND DMR ~5min**

In [None]:
!pip install dbfread

Collecting dbfread
  Downloading dbfread-2.0.7-py2.py3-none-any.whl (20 kB)
Installing collected packages: dbfread
Successfully installed dbfread-2.0.7


In [None]:
import pandas as pd
import plotly.express as px
from dbfread import DBF
import ipywidgets as widgets
import csv
from math import radians, sin, cos, sqrt, atan2

In [None]:
# run command to download data.  In case it is not able to download, open the link and upload the data to this notebook from left hand side menu.
!wget https://gis.dmr.nd.gov/downloads/oilgas/shapefile/OGD_Horizontals.zip

--2023-09-08 18:37:57--  https://gis.dmr.nd.gov/downloads/oilgas/shapefile/OGD_Horizontals.zip
Resolving gis.dmr.nd.gov (gis.dmr.nd.gov)... 165.234.156.197
Connecting to gis.dmr.nd.gov (gis.dmr.nd.gov)|165.234.156.197|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 314193504 (300M) [application/x-zip-compressed]
Saving to: ‘OGD_Horizontals.zip’


2023-09-08 18:38:06 (35.8 MB/s) - ‘OGD_Horizontals.zip’ saved [314193504/314193504]



In [None]:
!unzip OGD_Horizontals.zip

Archive:  OGD_Horizontals.zip
  inflating: OGD_Horizontals.cpg     
  inflating: OGD_Horizontals.dbf     
  inflating: OGD_Horizontals.prj     
  inflating: OGD_Horizontals.sbn     
  inflating: OGD_Horizontals.sbx     
  inflating: OGD_Horizontals.shp     
  inflating: OGD_Horizontals.shp.xml  
  inflating: OGD_Horizontals.shx     
  inflating: OGD_Horizontals_Line.cpg  
  inflating: OGD_Horizontals_Line.dbf  
  inflating: OGD_Horizontals_Line.prj  
  inflating: OGD_Horizontals_Line.sbn  
  inflating: OGD_Horizontals_Line.sbx  
  inflating: OGD_Horizontals_Line.shp  
  inflating: OGD_Horizontals_Line.shp.xml  
  inflating: OGD_Horizontals_Line.shx  


In [None]:
# to write OGD_Horizontals.dbf to csv file ~ 2 minutes
filename= 'OGD_Horizontals.dbf'
table = DBF(filename)
f=open('ndWells.csv', 'w')
writer = csv.writer(f)
writer.writerow(table.field_names)
for record in table:
  writer.writerow(list(record.values()))

In [None]:
dfWells = pd.read_csv('ndWells.csv')
dfWells.head(3)

Unnamed: 0,wl_permit,api_wellno,api_format,long,lat,well_sub,measdpth,inclinatio,azimuth,tvd,coordns,coordnsdir,coordew,coordewdir,surveytype
0,1741,33007000260000,33-007-00026-00-00,-103.392973,46.866931,LAT1,9000.0,0.77,4.33,8999.030273,38.310001,N,35.09,W,STP
1,1741,33007000260000,33-007-00026-00-00,-103.392971,46.866931,LAT1,9015.0,3.3,88.0,9014.021484,38.425602,N,34.650803,W,SPT
2,1741,33007000260000,33-007-00026-00-00,-103.392969,46.866931,LAT1,9022.0,5.3,88.0,9021.000977,38.44392,N,34.126297,W,SPT


# **2.b Filter out first laterals and estimate lateral length**

In [None]:
dfWells['myLateral'] = ['1stLat' if x=='LAT1' else '1stLat' if x=='STK1' else 'VERT' if x=='VERT' else 'othLat' for x in dfWells['well_sub']]
dfWells.tail(3)

Unnamed: 0,wl_permit,api_wellno,api_format,long,lat,well_sub,measdpth,inclinatio,azimuth,tvd,coordns,coordnsdir,coordew,coordewdir,surveytype,myLateral
4885420,90466,33105904660000,33-105-90466-00-00,-103.411653,48.28781,VERT,4319.0,0.2,134.399994,4318.772949,7.135031,S,14.252914,W,SPT,VERT
4885421,90466,33105904660000,33-105-90466-00-00,-103.411653,48.28781,VERT,4415.0,0.2,14.5,4414.772461,7.090045,S,14.091251,W,SPT,VERT
4885422,90466,33105904660000,33-105-90466-00-00,-103.411652,48.287812,VERT,4511.0,0.3,7.6,4510.771484,6.678713,S,14.016061,W,SPT,VERT


In [None]:
flWells = dfWells.groupby(['api_wellno','myLateral']).agg(latMin=('lat', 'min'),latMax=('lat', 'max'),
                                                        longMin=('long', 'min'),longMax=('long', 'max'),
                                                        tvdMin=('tvd', 'min'),tvdMax=('tvd', 'max')).reset_index()

In [None]:
def latft(latMin, latMax, longMin, longMax):

    R = 20925721

    lat1 = radians(latMin)
    lon1 = radians(longMin)
    lat2 = radians(latMax)
    lon2 = radians(longMax)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    distance = R * c

    #print("latL_km:", distance)
    return distance

In [None]:
flWells['totll'] = flWells.apply(lambda row: latft(row['latMin'],row['latMax'],row['longMin'],row['longMax']),  axis=1)

In [None]:
flWellsOnly = flWells[['api_wellno','myLateral','totll']][flWells['myLateral'] == '1stLat'].reset_index(drop=True)
flWellsOnly.tail()

Unnamed: 0,api_wellno,myLateral,totll
19958,33105060500000,1stLat,10745.854055
19959,33105060730000,1stLat,545.682062
19960,33105902990000,1stLat,2102.817016
19961,33105904500000,1stLat,1787.41783
19962,33105904660000,1stLat,2305.694952


In [None]:
flWellsOnly[['api_wellno','myLateral','totll']].to_csv('flWellsOnly.csv', index=False)

In [None]:
flWells[flWells['api_wellno']==33105040690000]

Unnamed: 0,api_wellno,myLateral,latMin,latMax,longMin,longMax,tvdMin,tvdMax,totll
37356,33105040690000,1stLat,48.282229,48.282416,-102.980406,-102.979728,7251.50293,8439.927734,178.459657
37357,33105040690000,VERT,48.282314,48.282738,-102.983372,-102.979511,1768.457642,8985.621094,951.301084


In [None]:
## 3D plot of specific well to verify estimation
plWell = dfWells[['api_wellno','lat','long','tvd','well_sub']][dfWells['api_wellno'] == 33105040690000]

In [None]:
fig = px.scatter_3d(plWell, x='long', y='lat', z='tvd', color='well_sub')
fig.update_traces(marker_size = 3)
fig.update_scenes(yaxis_autorange="reversed")
fig.update_scenes(xaxis_autorange="reversed")
fig.update_scenes(zaxis_range=[16000, 0])
fig.update_layout({
    'plot_bgcolor': 'rgba(100, 0, 0, 0)',
    'paper_bgcolor': 'rgba(0, 0, 0, 0)',
                      })

# **3.a Download and unzip FracFocus.org well completion data ~5min**

In [2]:
import pandas as pd
import requests
import zipfile
import io
import datetime

In [3]:
link2FFcsvZip = 'http://fracfocusdata.org/digitaldownload/FracFocusCSV.zip'

In [4]:
request = requests.get(link2FFcsvZip)
zipBag = zipfile.ZipFile(io.BytesIO(request.content))

In [5]:
zipFiles = zipBag.namelist()
zipFiles

['DisclosureList_1.csv',
 'FracFocusRegistry_1.csv',
 'FracFocusRegistry_10.csv',
 'FracFocusRegistry_11.csv',
 'FracFocusRegistry_12.csv',
 'FracFocusRegistry_13.csv',
 'FracFocusRegistry_14.csv',
 'FracFocusRegistry_2.csv',
 'FracFocusRegistry_3.csv',
 'FracFocusRegistry_4.csv',
 'FracFocusRegistry_5.csv',
 'FracFocusRegistry_6.csv',
 'FracFocusRegistry_7.csv',
 'FracFocusRegistry_8.csv',
 'FracFocusRegistry_9.csv',
 'readme csv.txt',
 'WaterSource_1.csv']

In [6]:
#Unzip files and create corresponding dataframes
RegistryUpload_files=[]
FracFocusRegistry_files=[]

In [7]:
for file_name in zipFiles:
    if (('DisclosureList' in file_name)) & (file_name.endswith('.csv')):
        RegistryUpload_files.append(file_name)
    if (('FracFocusRegistry' in file_name)) & (file_name.endswith('.csv')):
        FracFocusRegistry_files.append(file_name)

In [8]:
RegistryUpload_files

['DisclosureList_1.csv']

In [9]:
dfs = list()
for file_name in RegistryUpload_files:
    df = pd.read_csv(zipBag.open(file_name), low_memory=False, dtype={'APINumber': 'str','StateNumber': 'str', 'CountyNumber': 'str'})
    dfs.append(df)

RegistryUpload = pd.concat(dfs, axis=0, ignore_index=True)
RegistryUpload.head(2)

Unnamed: 0,DisclosureId,JobStartDate,JobEndDate,APINumber,StateName,CountyName,OperatorName,WellName,Latitude,Longitude,Projection,TVD,TotalBaseWaterVolume,TotalBaseNonWaterVolume,FFVersion,FederalWell,IndianWell
0,02cd05a7-4b73-4722-9fba-1f798842a879,5/1/1955 12:00:00 AM,5/1/1955 12:00:00 AM,42317372620000,Texas,Martin,Pioneer Natural Resources,Rogers 42 #5,32.283431,-101.906575,NAD27,,,,1,False,False
1,416d8b17-822f-4743-8c79-baf015a6de24,5/19/1982 12:00:00 AM,5/19/1982 12:00:00 AM,49009219470000,Wyoming,Converse,"Chesapeake Operating, Inc.",WILLIAM VALENTINE 1,42.97281,-105.95384,NAD27,,,,1,False,False


In [10]:
RegistryUpload.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226155 entries, 0 to 226154
Data columns (total 17 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   DisclosureId             226155 non-null  object 
 1   JobStartDate             226140 non-null  object 
 2   JobEndDate               226155 non-null  object 
 3   APINumber                226155 non-null  object 
 4   StateName                226155 non-null  object 
 5   CountyName               226151 non-null  object 
 6   OperatorName             226155 non-null  object 
 7   WellName                 226155 non-null  object 
 8   Latitude                 226120 non-null  float64
 9   Longitude                226120 non-null  float64
 10  Projection               226155 non-null  object 
 11  TVD                      196015 non-null  float64
 12  TotalBaseWaterVolume     195986 non-null  float64
 13  TotalBaseNonWaterVolume  175828 non-null  float64
 14  FFVe

In [11]:
FracFocusRegistry_files

['FracFocusRegistry_1.csv',
 'FracFocusRegistry_10.csv',
 'FracFocusRegistry_11.csv',
 'FracFocusRegistry_12.csv',
 'FracFocusRegistry_13.csv',
 'FracFocusRegistry_14.csv',
 'FracFocusRegistry_2.csv',
 'FracFocusRegistry_3.csv',
 'FracFocusRegistry_4.csv',
 'FracFocusRegistry_5.csv',
 'FracFocusRegistry_6.csv',
 'FracFocusRegistry_7.csv',
 'FracFocusRegistry_8.csv',
 'FracFocusRegistry_9.csv']

In [None]:
dfs = list()
for file_name in FracFocusRegistry_files:
    df = pd.read_csv(zipBag.open(file_name), low_memory=False, dtype={'APINumber': 'str','StateNumber': 'str', 'CountyNumber': 'str'})
    dfs.append(df)

FracFocusRegistry = pd.concat(dfs, axis=0, ignore_index=True)
FracFocusRegistry.head(2)

In [None]:
FracFocusRegistry.info()

# **3.b Estimate water and sand used in copletions of ND wells only**

In [None]:
# Filter out ND wells and list water used in completions
ffWellsND = RegistryUpload[['pKey','APINumber','JobEndDate','TotalBaseWaterVolume']][RegistryUpload['StateNumber']== '33'].reset_index(drop=True)
ffWellsND['JobEndDate'] = pd.to_datetime(ffWellsND['JobEndDate'], errors = 'coerce').dt.date
ffWellsND.tail()

Unnamed: 0,pKey,APINumber,JobEndDate,TotalBaseWaterVolume
16108,bed265ca-f4ab-4307-86c9-76a31d2e06b1,33061051230000,2023-08-14,10222700.0
16109,526e6ef0-bc64-40e6-864c-2a22e406da99,33061051240000,2023-08-14,10225540.0
16110,f79f6237-30f0-43bd-af83-715458dd2265,33061051250000,2023-08-14,10247892.0
16111,050e7b83-b8f1-4a60-a729-99cffe2dae6e,33061051410000,2023-08-23,9756281.0
16112,5ecae102-b645-4185-adbf-4a30bf42dbb6,33061051400000,2023-08-23,9746429.0


In [None]:
#Filter out sand used during completions
swData = FracFocusRegistry[['UploadKey','APINumber','JobEndDate','IngredientName','Purpose','TradeName','MassIngredient','PercentHFJob','IngredientKey','TotalBaseWaterVolume']][FracFocusRegistry['StateNumber']== '33'].reset_index(drop=True)
swData['JobEndDate'] = pd.to_datetime(swData['JobEndDate'], errors = 'coerce').dt.date

In [None]:
tnWords = ['sand','mesh','proppant','ceramic','quartz','silica','crystalline']
swDataFilter = swData[swData['TradeName'].str.contains('|'.join(tnWords), case=False, na=False, regex = True)].copy().reset_index(drop=True)
swDataFilter.tail(2)

Unnamed: 0,UploadKey,APINumber,JobEndDate,IngredientName,Purpose,TradeName,MassIngredient,PercentHFJob,IngredientKey,TotalBaseWaterVolume
44542,8607b4f6-3380-42dd-8ec8-a396d01fd988,33053061280000,2015-04-02,"Crystalline silica, quartz",Proppant,SAND - PREMIUM WHITE,7363600.0,13.596038,ad1a2155-6dfd-40a9-96e1-189eb344478e,5377176.0
44543,8607b4f6-3380-42dd-8ec8-a396d01fd988,33053061280000,2015-04-02,"Crystalline silica, quartz",Proppant,SAND - COMMON WHITE,1805900.0,3.334386,93c1b66d-f3f6-45a0-b17d-9aa939f357df,5377176.0


In [None]:
exclusionList = ['water','transport','acid','stabilizer','carrier','clay','ammonium','chloride','Potassium','KCL','chlorine','naphtha','petroleum','aromatic','surfactant','alcohol','gum','tetrahydrate','carbonate','glycol']
swDataFilter = swDataFilter[~swDataFilter['TradeName'].str.contains('|'.join(exclusionList), case=False, na=False)].copy().reset_index(drop=True)
swDataFilter.tail(2)

Unnamed: 0,UploadKey,APINumber,JobEndDate,IngredientName,Purpose,TradeName,MassIngredient,PercentHFJob,IngredientKey,TotalBaseWaterVolume
25416,8607b4f6-3380-42dd-8ec8-a396d01fd988,33053061280000,2015-04-02,"Crystalline silica, quartz",Proppant,SAND - PREMIUM WHITE,7363600.0,13.596038,ad1a2155-6dfd-40a9-96e1-189eb344478e,5377176.0
25417,8607b4f6-3380-42dd-8ec8-a396d01fd988,33053061280000,2015-04-02,"Crystalline silica, quartz",Proppant,SAND - COMMON WHITE,1805900.0,3.334386,93c1b66d-f3f6-45a0-b17d-9aa939f357df,5377176.0


In [None]:
#Estimate mass from PercentHFJob
swDataFilter['derWeight'] = swDataFilter['TotalBaseWaterVolume']*.0935*swDataFilter['PercentHFJob']

In [None]:
def weightCondition(s):
    if (s['MassIngredient'] > 0):
        return s['MassIngredient']
    else:
        return s['derWeight']

In [None]:
swDataFilter['bestWeight'] = swDataFilter.apply(weightCondition, axis=1)

In [None]:
SandUsed = swDataFilter[['UploadKey','APINumber','JobEndDate','bestWeight']].groupby(['UploadKey','APINumber','JobEndDate']).sum().rename(columns={'bestWeight':'SandLB'}).reset_index()

In [None]:
SandUsed = SandUsed[SandUsed['SandLB'] > 0]
SandUsed.reset_index(drop=True, inplace=True)
SandUsed.tail()

Unnamed: 0,UploadKey,APINumber,JobEndDate,SandLB
7767,ffeb8d65-66a6-47e5-835e-b002806305bc,33053091850000,2023-01-12,4738717.0
7768,ffef3fcc-9185-41d6-85e0-ea992071237f,33053084960000,2018-08-22,7154146.0
7769,fff39d2a-da18-432b-899f-012837eef9b6,33053091710000,2020-08-23,13044420.0
7770,fff9db51-a9e1-4698-b29d-9ff597aae7fd,33025036880000,2021-04-05,7992000.0
7771,fffb8798-b804-447b-856f-cde632d7bb7d,33061038180000,2017-04-30,13672500.0


In [None]:
#Merge water and sand used during completions
ffWellsND = ffWellsND.merge(SandUsed[['UploadKey','SandLB']], how='left', left_on = 'pKey', right_on = 'UploadKey' )

In [None]:
ffWellsND.drop(['pKey','UploadKey'], axis=1, inplace=True)

In [None]:
#Filter out first frac jobs
ffWellsND = ffWellsND.sort_values(['APINumber','JobEndDate']).reset_index(drop = True)
ffWellsND["sfID"] = ffWellsND.groupby(['APINumber']).cumcount()
ffWellsND = ffWellsND[ffWellsND['sfID']==0].reset_index(drop = True)  # filters out refrac jobs, keeps only the first frac jobs
ffWellsND.drop('sfID', inplace=True, axis=1)
ffWellsND.rename({'TotalBaseWaterVolume': 'WaterGal'}, axis=1, inplace=True)
ffWellsND["APINumber"] = ffWellsND["APINumber"].astype(int)
ffWellsND.tail()

Unnamed: 0,APINumber,JobEndDate,WaterGal,SandLB
15153,33105059720000,2023-05-06,11151006.72,
15154,33105059730000,2023-05-06,10978190.58,
15155,33105121850000,2012-02-24,,
15156,33610338000000,2022-06-08,13496802.0,10357262.37
15157,33610338100000,2022-06-09,14820967.0,11223894.87


In [None]:
ffWellsND.to_csv('ffWellsND.csv', index=False)
ffWellsND.tail()

Unnamed: 0,APINumber,JobEndDate,WaterGal,SandLB
15153,33105059720000,2023-05-06,11151006.72,
15154,33105059730000,2023-05-06,10978190.58,
15155,33105121850000,2012-02-24,,
15156,33610338000000,2022-06-08,13496802.0,10357262.37
15157,33610338100000,2022-06-09,14820967.0,11223894.87


# **4. Combine files and write data file for Sub-county analysis**

In [None]:
# In disagregated form, read the files from previously derived csv files
#ffmWells = pd.read_csv('ffmWells.csv')
#flWellsOnly = pd.read_csv('flWellsOnly.csv')
#ffWellsND = pd.read_csv('ffWellsND.csv')

In [None]:
ND_Data = ffmWells[['API_WELLNO','Pool','ffMonth','bpd','Mcfpd','MHB']].merge(flWellsOnly, how = 'left', left_on = 'API_WELLNO', right_on = 'api_wellno')
ND_Data.drop(['api_wellno','myLateral'], axis=1, inplace=True)
ND_Data.columns = ['API14', 'Pool', 'feDate', 'maxPo', 'maxPg','MHB','LL']
ND_Data.tail()

Unnamed: 0,API14,Pool,feDate,maxPo,maxPg,MHB,LL
20407,33105058570000,BAKKEN,2023-05-01,861.322581,790.225806,DN88de,10753.179882
20408,33105058750000,BAKKEN,2023-03-01,1610.0,1272.76,DN88de,10729.605131
20409,33105058760000,BAKKEN,2023-05-01,1016.806452,852.935484,DN88de,10727.102474
20410,33105061150000,MADISON,2023-06-01,31.266667,0.9,DN88ba,
20411,33105903870000,DAKOTA,2018-01-01,13.225806,0.0,DN88gj,


In [None]:
ND_Data = ND_Data.merge(ffWellsND, how = 'left', left_on = 'API14', right_on = 'APINumber')
ND_Data.tail()

Unnamed: 0,API14,Pool,feDate,maxPo,maxPg,MHB,LL,APINumber,JobEndDate,WaterGal,SandLB
20407,33105058570000,BAKKEN,2023-05-01,861.322581,790.225806,DN88de,10753.179882,33105060000000.0,2023-01-27,11088845.0,10028631.33
20408,33105058750000,BAKKEN,2023-03-01,1610.0,1272.76,DN88de,10729.605131,33105060000000.0,2023-02-15,10865393.0,10498341.15
20409,33105058760000,BAKKEN,2023-05-01,1016.806452,852.935484,DN88de,10727.102474,33105060000000.0,2023-02-15,10332110.0,10058611.32
20410,33105061150000,MADISON,2023-06-01,31.266667,0.9,DN88ba,,,,,
20411,33105903870000,DAKOTA,2018-01-01,13.225806,0.0,DN88gj,,,,,


In [None]:
#Drop wells with missing values
ND_Data.dropna(inplace=True)
ND_Data.reset_index(inplace=True, drop=True)
ND_Data.tail()

Unnamed: 0,API14,Pool,feDate,maxPo,maxPg,MHB,LL,APINumber,JobEndDate,WaterGal,SandLB
6979,33105058550000,BAKKEN,2023-03-01,651.892857,557.142857,DN88de,10210.408035,33105060000000.0,2023-01-27,12016673.0,11317940.73
6980,33105058560000,BAKKEN,2023-03-01,795.354839,684.580645,DN88de,10717.767091,33105060000000.0,2023-01-27,12593303.0,11893604.49
6981,33105058570000,BAKKEN,2023-05-01,861.322581,790.225806,DN88de,10753.179882,33105060000000.0,2023-01-27,11088845.0,10028631.33
6982,33105058750000,BAKKEN,2023-03-01,1610.0,1272.76,DN88de,10729.605131,33105060000000.0,2023-02-15,10865393.0,10498341.15
6983,33105058760000,BAKKEN,2023-05-01,1016.806452,852.935484,DN88de,10727.102474,33105060000000.0,2023-02-15,10332110.0,10058611.32


In [None]:
ND_Bakken = ND_Data[ND_Data['Pool']=='BAKKEN']  #Filter out Bakken wells only
ND_Bakken = ND_Bakken[ND_Bakken['feDate']>'2015-06-01'].reset_index(drop = True)  #Filter out first two month to eliminate wells that started before data is available
ND_Bakken

Unnamed: 0,API14,Pool,feDate,maxPo,maxPg,MHB,LL,APINumber,JobEndDate,WaterGal,SandLB
0,33007017390000,BAKKEN,2018-03-01,21.400000,10.760000,DN87kg,9988.611253,3.300702e+13,2013-06-14,1519434.0,100331.00
1,33007017630000,BAKKEN,2015-07-01,243.903226,384.000000,DN86iw,11573.584346,3.300702e+13,2015-01-30,8957550.0,5276217.00
2,33007017970000,BAKKEN,2015-08-01,1.833333,8.300000,DN87ke,9962.164000,3.300702e+13,2013-09-01,422625.0,222478.00
3,33007018050000,BAKKEN,2018-12-01,150.066667,110.066667,DN87fb,9701.463543,3.300702e+13,2018-07-09,92042880.0,2330112.36
4,33007018060000,BAKKEN,2015-07-01,76.517241,57.172414,DN87kf,10134.729658,3.300702e+13,2013-12-11,2640162.0,1555981.00
...,...,...,...,...,...,...,...,...,...,...,...
3602,33105058550000,BAKKEN,2023-03-01,651.892857,557.142857,DN88de,10210.408035,3.310506e+13,2023-01-27,12016673.0,11317940.73
3603,33105058560000,BAKKEN,2023-03-01,795.354839,684.580645,DN88de,10717.767091,3.310506e+13,2023-01-27,12593303.0,11893604.49
3604,33105058570000,BAKKEN,2023-05-01,861.322581,790.225806,DN88de,10753.179882,3.310506e+13,2023-01-27,11088845.0,10028631.33
3605,33105058750000,BAKKEN,2023-03-01,1610.000000,1272.760000,DN88de,10729.605131,3.310506e+13,2023-02-15,10865393.0,10498341.15


In [None]:
ND_Bakken.to_excel('ND_BakkenPublic.xlsx', sheet_name='ND_BakkenPublic', index=False)

In [None]:
#download the file
from google.colab import files
files.download('ND_BakkenPublic.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>