In [53]:
import pandas as pd
import numpy as py

In [54]:
# Load the raw injection-well records from the CSV file into a data frame
raw_df = pd.read_csv(filepath_or_buffer='../data/InjectionWells.csv')

In [55]:
# Number of rows and columns in the raw data
raw_df.shape

(11126, 21)

In [56]:
# Count the missing values in each column of the raw data
raw_df.isna().sum()

API#                 1
Operator             1
Operator ID          1
WellType             1
WellName             2
WellNumber           2
OrderNumbers         2
Approval Date        1
County               1
Sec                  1
Twp                  1
Rng                  1
QQQQ                 1
LAT                  1
LONG                 1
PSI               1437
BBLS              1437
ZONE                 1
Unnamed: 18      11126
Unnamed: 19      11126
Unnamed: 20      11126
dtype: int64

### Addressing null values
---
Because there is no information for the three Unnamed columns, they will be dropped.

The rows with null PSI and BBLS values (1,437 in each column) will also be dropped, as there is no information to fill in those values. That still leaves roughly 9,689 rows for analysis.

In [57]:
# Function for dropping columns from a dataframe, selected from a list

def Drop_columns(raw_df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """
    Return a new data frame with the unneeded columns removed.

    Args:
    raw_df: This is the data frame to drop the columns from. It is not modified.
    columns: Optional iterable of column names to drop. When omitted, the
        columns 'Twp', 'Rng', 'QQQQ', 'Unnamed: 18', 'Unnamed: 19' and
        'Unnamed: 20' are dropped.
    Return:
    This returns a new data frame with the listed columns removed.
    Raises:
    KeyError: If any of the listed columns is not present in raw_df.
    """
    if columns is None:
        columns = ['Twp', 'Rng', 'QQQQ', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20']

    # One drop call returns a new frame, so neither an explicit copy nor a
    # per-column in-place loop is needed.
    return raw_df.drop(columns=list(columns))

Applying Drop_columns function

In [58]:
# Remove the unneeded columns; Drop_columns returns a new frame, so raw_df is left as loaded
updated_column_df = Drop_columns(raw_df)

In [59]:
# Continue cleaning on a separate copy so updated_column_df keeps its current state
ok_wells_df = updated_column_df.copy()

In [None]:
# Remove every row that still contains at least one missing value
ok_wells_df = ok_wells_df.dropna(axis=0, how='any')

In [62]:
# Inspect the datatype of each remaining column
ok_wells_df.dtypes

API#             float64
Operator          object
Operator ID      float64
WellType          object
WellName          object
WellNumber        object
OrderNumbers     float64
Approval Date     object
County            object
Sec               object
LAT              float64
LONG             float64
PSI               object
BBLS              object
ZONE              object
dtype: object

Converting the Approval Date column to a datetime datatype, then splitting out the Month and Year into their own columns.

In [63]:
# Convert the Approval Date column from object dtype to datetime values
approval_dates = pd.to_datetime(ok_wells_df['Approval Date'])
ok_wells_df['Approval Date'] = approval_dates

In [64]:
# Derive the month and year of each approval date as new columns
approval_dt = ok_wells_df['Approval Date'].dt
ok_wells_df['Month'] = approval_dt.month
ok_wells_df['Year'] = approval_dt.year

In [None]:
# Move Month and Year so they sit directly after Approval Date
# (they were appended at the end of the frame when created).
# Target positions are looked up by column name rather than hard-coded,
# so the cell keeps working if the column layout changes.
column_move_1 = ok_wells_df.pop('Month')
ok_wells_df.insert(ok_wells_df.columns.get_loc('Approval Date') + 1, 'Month', column_move_1)

column_move_2 = ok_wells_df.pop('Year')
ok_wells_df.insert(ok_wells_df.columns.get_loc('Month') + 1, 'Year', column_move_2)

In [None]:
# Cast the date parts and the operator ID to integers.
# Operator ID was read as float64; the rows with nulls were dropped earlier, so the cast can succeed.
for int_column in ['Month', 'Year', 'Operator ID']:
    ok_wells_df[int_column] = ok_wells_df[int_column].astype(int)


In [None]:
# Datatype confirmation: Approval Date should now be datetime, and
# Month, Year and Operator ID should be integers
ok_wells_df.dtypes

API#                    float64
Operator                 object
Operator ID               int64
WellType                 object
WellName                 object
WellNumber               object
OrderNumbers            float64
Approval Date    datetime64[ns]
Month                     int64
Year                      int64
County                   object
Sec                      object
LAT                     float64
LONG                    float64
PSI                      object
BBLS                     object
ZONE                     object
dtype: object

In [None]:
# Give every column a descriptive snake_case name.

snake_case_names = {
    'API#': 'api_number',
    'Operator': 'operator',
    'Operator ID': 'operator_id',
    'WellType': 'well_type',
    'WellName': 'well_name',
    'WellNumber': 'well_number',
    'OrderNumbers': 'order_number',
    'Approval Date': 'approval_date',
    'Month': 'month',
    'Year': 'year',
    'County': 'county',
    'Sec': 'section',
    'LAT': 'latitude',
    'LONG': 'longitude',
    'PSI': 'psi',
    'BBLS': 'barrels_fluid',
    'ZONE': 'zone',
}

# Rename on the existing frame so later cells keep using ok_wells_df
ok_wells_df.rename(columns=snake_case_names, inplace=True)

In [None]:
# Number of rows per county, ordered from most to least frequent
# NOTE(review): the displayed output lists 'OFUSKEE' — possibly a misspelling of 'OKFUSKEE' in the data; confirm
county = ok_wells_df['county'].value_counts()
county

county
CARTER      1485
STEPHENS    1054
CREEK        672
PONTOTOC     570
SEMINOLE     447
            ... 
CRAIG          4
LEFLORE        3
GREER          3
ATOKA          1
OFUSKEE        1
Name: count, Length: 66, dtype: int64

### Setup for SQL Tables
---

In [None]:
# Columns exported for the wells SQL table
wells = ['well_number', 'well_type', 'well_name', 'operator_id']

# Copy just those columns and write them to CSV without the index
wells_df = ok_wells_df.loc[:, wells].copy()
wells_df.to_csv(path_or_buf='../data/wells.csv', index=False)

In [None]:
# Columns exported for the well_locations SQL table
well_locations = ['well_number', 'latitude', 'longitude', 'zone']

# Copy just those columns and write them to CSV without the index
well_locations_df = ok_wells_df.loc[:, well_locations].copy()
well_locations_df.to_csv(path_or_buf='../data/well_locations.csv', index=False)

In [None]:
# Columns exported for the well_intake SQL table
well_intake = ['well_number', 'psi', 'barrels_fluid']

# Copy just those columns and write them to CSV without the index
well_intake_df = ok_wells_df.loc[:, well_intake].copy()
well_intake_df.to_csv(path_or_buf='../data/well_intake.csv', index=False)

In [None]:
# Columns exported for the operators SQL table
operators = ['operator_id', 'operator', 'county', 'section']

# Copy just those columns and write them to CSV without the index.
# NOTE(review): rows are not de-duplicated here, so an operator_id may repeat — confirm this matches the SQL schema
operators_df = ok_wells_df.loc[:, operators].copy()
operators_df.to_csv(path_or_buf='../data/operators.csv', index=False)

In [None]:
# Columns exported for the orders SQL table
orders = ['order_number', 'well_number', 'approval_date', 'year']

# Copy just those columns and write them to CSV without the index
orders_df = ok_wells_df.loc[:, orders].copy()
orders_df.to_csv(path_or_buf='../data/orders.csv', index=False)