## Feature Creation notebook
### Goal is to start with dict of dataframes of wells and a few other pieces and create a single dataframe with all the necessary features for all used wells
#### This work is similar to what has been done before but data loading & feature creation is separate and dask is used to speed feature creation
##### by Justin Gosses 2018-07-07

#### Inputs used during this notebook are:
    1. Dict of dataframes of used well created by notebooks in `/loadLAS` directory
    2. A dataframe of nearest neighbor information from a notebook found in the `WellsKNN/` directory
    3. picks_dic a data dictionary for the pick list below = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
    4. pick list = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
    5. well list = pd.read_csv('../../SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
    6. latitude and longitude for each well = pd.read_csv('../../well_lat_lng.csv')

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import welly
from welly import Well
import lasio
import glob
from sklearn import neighbors
import pickle
import math
import dask
import dask.dataframe as dd
from dask.distributed import Client
# import pdvega
# import vega
import dask.dataframe as dd
from dask.distributed import Client
welly.__version__

'0.3.5'

In [2]:
# Print the dask and pandas versions this notebook was executed with.
print(dask.__version__)
print(pd.__version__)

0.18.1
0.23.1


In [3]:
%%timeit
# NOTE(review): the whole cell runs under %%timeit, so the import and the %env
# lookup below are executed many times purely to be timed. Names bound inside a
# %%timeit cell (`os`, `env`) are presumably NOT kept in the notebook namespace --
# confirm before relying on either of them in a later cell.
import os
env = %env


82.2 µs ± 1.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [4]:
from IPython.display import display

In [5]:
#### Test results Part 1
#### Raise pandas' display limits so wide frames and long cell values print in full.
# (The old 'display.height' option stays disabled, exactly as before.)
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000
pd.set_option('display.max_colwidth', 100000)

In [6]:
# Relative paths to the WellsKNN and loadLAS notebook directories.
# Note that knn_dir already ends with a slash while load_dir does not.
knn_dir = "../WellsKNN/"
load_dir = "../loadLAS"

--------------------

## We're going to load a pickle file of a previously created dataframe

### That dataframe merges:
1. picks_dic = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
2. picks = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
3. wells = pd.read_csv('../../SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
4. gis = pd.read_csv('../../well_lat_lng.csv')

### It also excludes any wells that have nulls or zeros for Top McMurray or Base McMurray picks
This was done in notebooks: 
1. notebooks_2018/mapmaking/Map_Exploration_v2-KDtree.ipynb
2. notebooks_2018/Test_RUN_2018_02/DataCleaningPrepof_KNN_neighborPickDepth_df_creation_vA_20180210

In [7]:
# Load the previously created wells / picks / nearest-neighbor dataframe from knn_dir.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you trust.
# NOTE(review): knn_dir already ends in '/', so the joined path contains '//'.
wells_df_new_cleaned_plus_nn_wNoNulls =  pd.read_pickle(knn_dir+'/'+'wells_df__NB_KDtreePost062018_vA__NoMcTopLeak_v2.p')

In [8]:
# Summarize the columns, dtypes and non-null counts of the loaded wells dataframe.
wells_df_new_cleaned_plus_nn_wNoNulls.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1926 entries, 0 to 2192
Data columns (total 15 columns):
SitID                           1926 non-null int64
McMurray_Base_HorID             1926 non-null int64
McMurray_Top_HorID              1926 non-null int64
McMurray_Base_DEPTH             1926 non-null float64
McMurray_Top_DEPTH              1926 non-null float64
McMurray_Base_Qual              1926 non-null int64
McMurray_Top_Qual               1926 non-null int64
lat                             1926 non-null float64
lng                             1926 non-null float64
UWI                             1926 non-null object
Neighbors_Obj                   1926 non-null object
NN1_McMurray_Top_DEPTH          1921 non-null float64
NN1_McMurray_Base_DEPTH         1926 non-null float64
NN1_thickness                   1921 non-null float64
MM_Top_Depth_predBy_NN1thick    1921 non-null float64
dtypes: float64(8), int64(5), object(2)
memory usage: 240.8+ KB


In [9]:
# Preview the first rows of the loaded wells dataframe.
wells_df_new_cleaned_plus_nn_wNoNulls.head()

Unnamed: 0,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,UWI,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick
0,102496,14000,13000,561.0,475.0,1,3,54.785907,-110.12932,00/12-08-067-01W4/0,"[{'neighbor': 1, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0575310019555}, {'neighbor': 2, 'UWI': '00/11-08-068-01W4/0', 'distance': 0.0874943032488}, {'neighbor': 3, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.104516690327}, {'neighbor': 4, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.109998964722}, {'neighbor': 5, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.140172829054}, {'neighbor': 6, 'UWI': '00/06-26-068-02W4/0', 'distance': 0.145394974621}, {'neighbor': 7, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.14676425482}]",544.0,630.0,86.0,475.0
1,102497,14000,13000,604.5,515.0,1,3,54.782284,-110.269446,00/07-08-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0473400060837}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0733221383008}, {'neighbor': 3, 'UWI': '00/07-36-067-03W4/0', 'distance': 0.0772558585092}, {'neighbor': 4, 'UWI': '00/10-14-067-03W4/0', 'distance': 0.078467559265}, {'neighbor': 5, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0826737666313}, {'neighbor': 6, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0908970121511}, {'neighbor': 7, 'UWI': '00/06-13-068-03W4/0', 'distance': 0.116811594031}]",529.0,613.0,84.0,520.5
2,102498,14000,13000,564.0,480.0,1,3,54.785892,-110.186851,00/09-11-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/12-08-067-01W4/0', 'distance': 0.0575310019555}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0664937861909}, {'neighbor': 3, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0826737666313}, {'neighbor': 4, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.0872840016555}, {'neighbor': 5, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0934369191754}, {'neighbor': 6, 'UWI': '00/11-08-068-01W4/0', 'distance': 0.1080816692}, {'neighbor': 7, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.120139676315}]",529.0,613.0,84.0,480.0
3,102500,14000,13000,636.5,549.0,1,3,54.829624,-110.269422,00/10-29-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0435570060725}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0458903640103}, {'neighbor': 3, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0473400060837}, {'neighbor': 4, 'UWI': '00/07-36-067-03W4/0', 'distance': 0.0520161525875}, {'neighbor': 5, 'UWI': '00/06-13-068-03W4/0', 'distance': 0.078988404744}, {'neighbor': 6, 'UWI': '00/10-14-067-03W4/0', 'distance': 0.0816950542016}, {'neighbor': 7, 'UWI': '00/11-29-068-02W4/0', 'distance': 0.0875230112656}]",529.0,613.0,84.0,552.5
4,102501,14000,13000,613.0,529.0,1,2,54.840471,-110.224832,00/06-34-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0458903640103}, {'neighbor': 2, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.0501344694696}, {'neighbor': 3, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0552825613462}, {'neighbor': 4, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0664937861909}, {'neighbor': 5, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0733221383008}, {'neighbor': 6, 'UWI': '00/06-26-068-02W4/0', 'distance': 0.0771552400618}, {'neighbor': 7, 'UWI': '00/11-29-068-02W4/0', 'distance': 0.0918035212614}]",514.0,603.0,89.0,524.0


In [10]:
# Number of rows in the loaded wells dataframe.
print(len(wells_df_new_cleaned_plus_nn_wNoNulls))

1926


In [11]:
# NOTE(review): same head() call as two cells earlier; it shows the same preview again.
wells_df_new_cleaned_plus_nn_wNoNulls.head()

Unnamed: 0,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,UWI,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick
0,102496,14000,13000,561.0,475.0,1,3,54.785907,-110.12932,00/12-08-067-01W4/0,"[{'neighbor': 1, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0575310019555}, {'neighbor': 2, 'UWI': '00/11-08-068-01W4/0', 'distance': 0.0874943032488}, {'neighbor': 3, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.104516690327}, {'neighbor': 4, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.109998964722}, {'neighbor': 5, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.140172829054}, {'neighbor': 6, 'UWI': '00/06-26-068-02W4/0', 'distance': 0.145394974621}, {'neighbor': 7, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.14676425482}]",544.0,630.0,86.0,475.0
1,102497,14000,13000,604.5,515.0,1,3,54.782284,-110.269446,00/07-08-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0473400060837}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0733221383008}, {'neighbor': 3, 'UWI': '00/07-36-067-03W4/0', 'distance': 0.0772558585092}, {'neighbor': 4, 'UWI': '00/10-14-067-03W4/0', 'distance': 0.078467559265}, {'neighbor': 5, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0826737666313}, {'neighbor': 6, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0908970121511}, {'neighbor': 7, 'UWI': '00/06-13-068-03W4/0', 'distance': 0.116811594031}]",529.0,613.0,84.0,520.5
2,102498,14000,13000,564.0,480.0,1,3,54.785892,-110.186851,00/09-11-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/12-08-067-01W4/0', 'distance': 0.0575310019555}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0664937861909}, {'neighbor': 3, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0826737666313}, {'neighbor': 4, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.0872840016555}, {'neighbor': 5, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0934369191754}, {'neighbor': 6, 'UWI': '00/11-08-068-01W4/0', 'distance': 0.1080816692}, {'neighbor': 7, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.120139676315}]",529.0,613.0,84.0,480.0
3,102500,14000,13000,636.5,549.0,1,3,54.829624,-110.269422,00/10-29-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0435570060725}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0458903640103}, {'neighbor': 3, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0473400060837}, {'neighbor': 4, 'UWI': '00/07-36-067-03W4/0', 'distance': 0.0520161525875}, {'neighbor': 5, 'UWI': '00/06-13-068-03W4/0', 'distance': 0.078988404744}, {'neighbor': 6, 'UWI': '00/10-14-067-03W4/0', 'distance': 0.0816950542016}, {'neighbor': 7, 'UWI': '00/11-29-068-02W4/0', 'distance': 0.0875230112656}]",529.0,613.0,84.0,552.5
4,102501,14000,13000,613.0,529.0,1,2,54.840471,-110.224832,00/06-34-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0458903640103}, {'neighbor': 2, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.0501344694696}, {'neighbor': 3, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0552825613462}, {'neighbor': 4, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0664937861909}, {'neighbor': 5, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0733221383008}, {'neighbor': 6, 'UWI': '00/06-26-068-02W4/0', 'distance': 0.0771552400618}, {'neighbor': 7, 'UWI': '00/11-29-068-02W4/0', 'distance': 0.0918035212614}]",514.0,603.0,89.0,524.0


### This renames the columns of the dataframe above to match previous versions of the feature creation code

In [12]:
# NOTE(review): df_new is bound to the same object as wells_df_new_cleaned_plus_nn_wNoNulls
# (no copy is made), so every column assignment below also adds that column to the loaded frame.
df_new = wells_df_new_cleaned_plus_nn_wNoNulls
# df_new["UWI (AGS)"] = df_new["UWI (AGS)_x"]
# Self-assignment: leaves the UWI column unchanged.
df_new["UWI"] = df_new["UWI"]
# Duplicate the McMurray top pick columns under the shorter names HorID / Pick / Quality.
df_new["HorID"] = df_new["McMurray_Top_HorID"]
df_new["Pick"] = df_new["McMurray_Top_DEPTH"]
df_new["Quality"] = df_new["McMurray_Top_Qual"]
# Duplicate the McMurray base pick columns under the *_paleoz names.
df_new["HorID_paleoz"] = df_new["McMurray_Base_HorID"]
df_new["Pick_paleoz"] = df_new["McMurray_Base_DEPTH"]
df_new["Quality_paleoz"] = df_new["McMurray_Base_Qual"]
# Rebind df_new to a selection holding only the twelve columns listed here.
df_new = df_new[["SitID","UWI","HorID","Pick","Quality","HorID_paleoz","Pick_paleoz","Quality_paleoz",'lat','lng','MM_Top_Depth_predBy_NN1thick','NN1_thickness']]
# Bare expression: displays df_new as the cell output.
df_new

Unnamed: 0,SitID,UWI,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,lat,lng,MM_Top_Depth_predBy_NN1thick,NN1_thickness
0,102496,00/12-08-067-01W4/0,13000,475.00,3,14000,561.00,1,54.785907,-110.129320,475.00,86.00
1,102497,00/07-08-067-02W4/0,13000,515.00,3,14000,604.50,1,54.782284,-110.269446,520.50,84.00
2,102498,00/09-11-067-02W4/0,13000,480.00,3,14000,564.00,1,54.785892,-110.186851,480.00,84.00
3,102500,00/10-29-067-02W4/0,13000,549.00,3,14000,636.50,1,54.829624,-110.269422,552.50,84.00
4,102501,00/06-34-067-02W4/0,13000,529.00,2,14000,613.00,1,54.840471,-110.224832,524.00,89.00
5,102503,00/11-04-067-03W4/0,13000,488.50,2,14000,553.50,1,54.771449,-110.402983,489.00,64.50
6,102505,00/10-08-067-03W4/0,13000,501.50,2,14000,572.50,1,54.785901,-110.422131,508.00,64.50
7,102507,00/10-14-067-03W4/0,13000,553.50,2,14000,606.50,1,54.800533,-110.345762,542.00,64.50
8,102514,00/10-28-067-03W4/0,13000,493.50,3,14000,558.00,1,54.829633,-110.396621,487.00,71.00
9,102517,00/07-36-067-03W4/0,13000,536.50,2,14000,615.00,1,54.840441,-110.320301,562.00,53.00


In [13]:
# Columns, dtypes and non-null counts of the renamed, column-reduced dataframe.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1926 entries, 0 to 2192
Data columns (total 12 columns):
SitID                           1926 non-null int64
UWI                             1926 non-null object
HorID                           1926 non-null int64
Pick                            1926 non-null float64
Quality                         1926 non-null int64
HorID_paleoz                    1926 non-null int64
Pick_paleoz                     1926 non-null float64
Quality_paleoz                  1926 non-null int64
lat                             1926 non-null float64
lng                             1926 non-null float64
MM_Top_Depth_predBy_NN1thick    1921 non-null float64
NN1_thickness                   1921 non-null float64
dtypes: float64(6), int64(5), object(1)
memory usage: 195.6+ KB


In [14]:
# Distinct values found in the Quality column.
# NOTE(review): despite the `_str` suffix, Quality is reported as int64 by df_new.info().
unique_quality_str = df_new.Quality.unique()
print(unique_quality_str)

[3 2 1]


In [15]:
# Number of rows in df_new.
print(len(df_new))

1926


In [16]:
#### Number of unique wells based on UWI
# Compare with len(df_new): equal values mean there is exactly one row per UWI.
len(df_new.UWI.unique())

1926

In [17]:
# Single-column (UWI only) frame, used for the membership check in the following cell.
df_new_test = df_new[['UWI']]

In [18]:
# Sanity check: prints True if at least one row has exactly this UWI.
print(any(df_new_test.UWI == '00/11-04-067-03W4/0'))

True


---------------

## We're now going to load in all the LAS files but exclude any that aren't in the dataframe shown above. Additionally, we'll ignore any placed in the `excluded_problem_wells` folder

In [19]:
def loadAndNoFeatures(las_glob='../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS', count_limit=2500):
    """
    Read LAS well logs into a dict of per-well dataframes, without adding any features.

    For every file matching ``las_glob`` the UWI is rebuilt from the file name.
    The file is read with lasio and kept only when that UWI is present in the
    notebook-level ``df_new`` dataframe with a ``Quality`` value greater than -1.

    Parameters
    ----------
    las_glob : str, optional
        Glob pattern of the LAS files to consider. Defaults to the OilSandsDB
        ``Logs`` folder used throughout this notebook.
    count_limit : int, optional
        Maximum number of matching files to process (default 2500). When more
        files than this match, a message is printed and the rest are ignored.

    Returns
    -------
    list
        ``[df_w_dict, list_of_failed_wells]``. ``df_w_dict`` maps each UWI string
        to that well's dataframe (the lasio dataframe with its index reset into a
        regular column, plus a constant ``UWI`` column). ``list_of_failed_wells``
        is kept for compatibility with callers; nothing in this function appends
        to it, so it is always empty.

    Notes
    -----
    * Each file is visited exactly once. The previous outer ``while`` loop re-read
      the whole directory until the counter passed ``count_limit`` and never
      terminated when the glob matched no files.
    * The result pair is returned directly instead of an undefined global name.
    * Files whose UWI is not wanted are no longer parsed at all.
    """
    list_of_failed_wells = []
    ### dictionary that holds every well as key:value or "UWI":df pair
    df_w_dict = {}
    for count, las_path in enumerate(glob.glob(las_glob), start=1):
        if count > count_limit:
            print("hit limit of count below file for loop")
            break
        # Rebuild the UWI from the last 19 characters before ".LAS":
        # the first "-" becomes "/", and the "-" before the final digit becomes "/".
        str_uwi = las_path[-23:-4].replace("-", "/", 1)[:17] + las_path[-6:-4].replace("-", "/", 1)
        quality_for_uwi = df_new.loc[df_new['UWI'] == str_uwi, 'Quality']
        if quality_for_uwi.empty:
            # Well is not in the pick dataframe: skip it without parsing the LAS file.
            continue
        if quality_for_uwi.iloc[0] > -1:
            l_df = lasio.read(las_path).df().reset_index()
            print("got to UWI apppend")
            l_df['UWI'] = str_uwi
            # Key by str_uwi directly; it is the value just written to the UWI column,
            # and this also works when the log has zero rows.
            df_w_dict[str_uwi] = l_df
        else:
            # Message text kept unchanged; this branch is reached when the UWI was
            # found but its Quality value is not greater than -1.
            print("could not find UWI match for the well")
    return [df_w_dict, list_of_failed_wells]

In [20]:
# Run the LAS loader.
# NOTE(review): despite its name, this value is indexed with [0] and [1] in the next cell,
# i.e. it is used as a two-element sequence rather than as a dict.
initial_well_dict = loadAndNoFeatures()

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

Header section Parameter regexp=~P was not found.


got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

In [21]:
# The loader result is a two-element sequence:
#   position 0 -> dict of per-well dataframes, position 1 -> list of failed wells
dict_of_well_df, list_of_failed_wells = initial_well_dict

In [22]:
# Show which wells, if any, the loader recorded as failed.
print("list_of_failed_wells",list_of_failed_wells)

list_of_failed_wells []


In [23]:
# Number of entries (wells) in the loaded dict.
print("len = ", len(dict_of_well_df))

len =  1907


In [24]:
# Spot-check one well: print the dataframe stored under this UWI key.
print("check for well 00/11-04-067-03W4/0 = ",dict_of_well_df['00/11-04-067-03W4/0'])

check for well 00/11-04-067-03W4/0 =         DEPT     ILD   DPHI   NPHI       GR     CALI                  UWI
0    330.00   2.982  0.246  0.561  108.181  218.610  00/11-04-067-03W4/0
1    330.25   2.975  0.263  0.559  107.628  218.808  00/11-04-067-03W4/0
2    330.50   2.941  0.274  0.571  101.092  220.736  00/11-04-067-03W4/0
3    330.75   2.929  0.268  0.546  104.447  223.070  00/11-04-067-03W4/0
4    331.00   2.911  0.253  0.540  104.504  225.201  00/11-04-067-03W4/0
5    331.25   2.905  0.251  0.553  103.646  226.214  00/11-04-067-03W4/0
6    331.50   2.909  0.257  0.528  107.062  227.022  00/11-04-067-03W4/0
7    331.75   2.930  0.259  0.526  108.036  226.814  00/11-04-067-03W4/0
8    332.00   2.978  0.267  0.538  103.332  225.791  00/11-04-067-03W4/0
9    332.25   3.005  0.282  0.533  100.764  222.427  00/11-04-067-03W4/0
10   332.50   3.032  0.282  0.519  102.715  219.472  00/11-04-067-03W4/0
11   332.75   3.048  0.265  0.517  111.991  215.905  00/11-04-067-03W4/0
12   333.00  

In [25]:
# Confirm the type of the container holding the wells.
print(type(dict_of_well_df))

<class 'dict'>


In [26]:
# Confirm the type of a single entry in the container.
print(type(dict_of_well_df['00/11-04-067-03W4/0']))

<class 'pandas.core.frame.DataFrame'>


### We now have a dict of Pandas dataframes, where each dataframe is a well, that we will write to a pickle file

In [27]:
#### dumping dict of data frame to pickle file
dict_wells_df_and_Nofeatures_20180707 = dict_of_well_df
# Write through a context manager so the file handle is flushed and closed as soon
# as the dump finishes, instead of being left open for the life of the kernel.
with open("dict_of__wells_df_No_features_class3_20180707.p", "wb") as pickle_file:
    pickle.dump(dict_wells_df_and_Nofeatures_20180707, pickle_file)

-----------------------------

## Next step will be to take this dict of dataframes and turn it into a single dataframe or perhaps a dask data frame. Then cycle or cast to add in columns for the information on nearest neighbors from the nearest neighbors dataframe based on a column for UWI. 

1. Create dataframe from dict 
2. Add material from KNN dataframe to this dataframe based on UWI
3. Go back to original dict of well dataframes and see if a dask dataframe can be created and then add KNN dataframe. 
4. Test steps (1,2) vs. (3) for speed

Open the pickle of the dict of well dataframes

In [28]:
# Reload the dict of per-well dataframes from the pickle file written earlier in this notebook.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you trust.
dictOfWellDf =  pd.read_pickle('dict_of__wells_df_No_features_class3_20180707.p')

In [29]:
# Confirm the type of the reloaded object.
print(type(dictOfWellDf))

<class 'dict'>


In [30]:
def turnDictOfWellDfs_to_SingleDfOfAllWells(dictOfWellDf):
    """
    Takes in a dict of dataframes, where each dataframe is for a well created by LASIO
    and returns a single dataframe of all wells.

    Parameters
    ----------
    dictOfWellDf : dict
        Maps a key to that well's pandas DataFrame. Only the values are used.

    Returns
    -------
    pandas.DataFrame
        The well dataframes stacked row-wise, in the dict's iteration order.
        The columns are the union of the input columns; a column missing from a
        well is NaN for that well's rows. Each well keeps its own row index, so
        index labels repeat across wells. An empty dict gives an empty DataFrame.
    """
    list_of_df = list(dictOfWellDf.values())
    # pd.concat raises ValueError when given nothing to concatenate,
    # so "no wells" is handled explicitly.
    if not list_of_df:
        return pd.DataFrame()
    # sort=True asks pandas to order the combined columns by name. That is the
    # behavior this notebook was run with; stating it explicitly silences the
    # FutureWarning and keeps the column order stable across pandas versions.
    return pd.concat(list_of_df, sort=True)

In [31]:
df_all_wells_basic = turnDictOfWellDfs_to_SingleDfOfAllWells(dictOfWellDf)
print(type(df_all_wells_basic))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




<class 'pandas.core.frame.DataFrame'>


In [32]:
df_all_wells_basic.head()

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.0,,,,,,,0.46,,,,,,,,,,,00/10-32-080-20W4/0
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.55,,,,,,,,,,,00/10-32-080-20W4/0
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.28,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0
4,203.664,,,,150.602,,0.24,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0


In [33]:
df_all_wells_basic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482756 entries, 0 to 880
Data columns (total 33 columns):
CALI      487932 non-null float64
COND      1550 non-null float64
DELT      51769 non-null float64
DENS      1051 non-null float64
DEPT      1480984 non-null float64
DEPTH     1772 non-null float64
DPHI      1343386 non-null float64
DPHI:1    251 non-null float64
DPHI:2    251 non-null float64
DT        15473 non-null float64
GR        1481034 non-null float64
GR:1      251 non-null float64
GR:2      251 non-null float64
IL        3067 non-null float64
ILD       1470301 non-null float64
ILD:1     251 non-null float64
ILD:2     251 non-null float64
ILM       7037 non-null float64
LITH      361 non-null float64
LLD       2272 non-null float64
LLS       823 non-null float64
NPHI      1396128 non-null float64
PHID      3835 non-null float64
PHIN      976 non-null float64
RESD      1526 non-null float64
RHOB      107108 non-null float64
RT        224 non-null float64
SFL       4276 n

In [34]:
# NOTE(review): NaN casts to True under astype(bool), so these per-column totals include
# missing values (COND totals every row below although .info() reports only 1550 non-null
# COND values). They count values that are not zero/empty, not values that are present.
df_all_wells_basic.astype(bool).sum(axis=0)

CALI      1481038
COND      1482756
DELT      1482336
DENS      1482756
DEPT      1482691
DEPTH     1482756
DPHI      1479923
DPHI:1    1482756
DPHI:2    1482756
DT        1482756
GR        1482290
GR:1      1482756
GR:2      1482756
IL        1482756
ILD       1479640
ILD:1     1482756
ILD:2     1482756
ILM       1482756
LITH      1482756
LLD       1482756
LLS       1482756
NPHI      1481934
PHID      1482747
PHIN      1482756
RESD      1482756
RHOB      1438137
RT        1482756
SFL       1482751
SFLU      1482756
SN        1482756
SNP       1482756
SP        1481572
UWI       1482756
dtype: int64

In [35]:
df_all_wells_basic.isnull().sum()

CALI       994824
COND      1481206
DELT      1430987
DENS      1481705
DEPT         1772
DEPTH     1480984
DPHI       139370
DPHI:1    1482505
DPHI:2    1482505
DT        1467283
GR           1722
GR:1      1482505
GR:2      1482505
IL        1479689
ILD         12455
ILD:1     1482505
ILD:2     1482505
ILM       1475719
LITH      1482395
LLD       1480484
LLS       1481933
NPHI        86628
PHID      1478921
PHIN      1481780
RESD      1481230
RHOB      1375648
RT        1482532
SFL       1478480
SFLU      1475731
SN        1481390
SNP       1482124
SP        1467622
UWI             0
dtype: int64

**TODO:** At some point we should find the wells with missing major values like GR and either take them out, or find out whether there is a naming change like GR2 and replace the names so those wells can be used.

In [36]:
wells_df_new_cleaned_plus_nn_wNoNulls.head()

Unnamed: 0,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,UWI,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz
0,102496,14000,13000,561.0,475.0,1,3,54.785907,-110.12932,00/12-08-067-01W4/0,"[{'neighbor': 1, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0575310019555}, {'neighbor': 2, 'UWI': '00/11-08-068-01W4/0', 'distance': 0.0874943032488}, {'neighbor': 3, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.104516690327}, {'neighbor': 4, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.109998964722}, {'neighbor': 5, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.140172829054}, {'neighbor': 6, 'UWI': '00/06-26-068-02W4/0', 'distance': 0.145394974621}, {'neighbor': 7, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.14676425482}]",544.0,630.0,86.0,475.0,13000,475.0,3,14000,561.0,1
1,102497,14000,13000,604.5,515.0,1,3,54.782284,-110.269446,00/07-08-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0473400060837}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0733221383008}, {'neighbor': 3, 'UWI': '00/07-36-067-03W4/0', 'distance': 0.0772558585092}, {'neighbor': 4, 'UWI': '00/10-14-067-03W4/0', 'distance': 0.078467559265}, {'neighbor': 5, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0826737666313}, {'neighbor': 6, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0908970121511}, {'neighbor': 7, 'UWI': '00/06-13-068-03W4/0', 'distance': 0.116811594031}]",529.0,613.0,84.0,520.5,13000,515.0,3,14000,604.5,1
2,102498,14000,13000,564.0,480.0,1,3,54.785892,-110.186851,00/09-11-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/12-08-067-01W4/0', 'distance': 0.0575310019555}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0664937861909}, {'neighbor': 3, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0826737666313}, {'neighbor': 4, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.0872840016555}, {'neighbor': 5, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0934369191754}, {'neighbor': 6, 'UWI': '00/11-08-068-01W4/0', 'distance': 0.1080816692}, {'neighbor': 7, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.120139676315}]",529.0,613.0,84.0,480.0,13000,480.0,3,14000,564.0,1
3,102500,14000,13000,636.5,549.0,1,3,54.829624,-110.269422,00/10-29-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0435570060725}, {'neighbor': 2, 'UWI': '00/06-34-067-02W4/0', 'distance': 0.0458903640103}, {'neighbor': 3, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0473400060837}, {'neighbor': 4, 'UWI': '00/07-36-067-03W4/0', 'distance': 0.0520161525875}, {'neighbor': 5, 'UWI': '00/06-13-068-03W4/0', 'distance': 0.078988404744}, {'neighbor': 6, 'UWI': '00/10-14-067-03W4/0', 'distance': 0.0816950542016}, {'neighbor': 7, 'UWI': '00/11-29-068-02W4/0', 'distance': 0.0875230112656}]",529.0,613.0,84.0,552.5,13000,549.0,3,14000,636.5,1
4,102501,14000,13000,613.0,529.0,1,2,54.840471,-110.224832,00/06-34-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0', 'distance': 0.0458903640103}, {'neighbor': 2, 'UWI': '00/09-11-068-02W4/0', 'distance': 0.0501344694696}, {'neighbor': 3, 'UWI': '00/10-08-068-02W4/0', 'distance': 0.0552825613462}, {'neighbor': 4, 'UWI': '00/09-11-067-02W4/0', 'distance': 0.0664937861909}, {'neighbor': 5, 'UWI': '00/07-08-067-02W4/0', 'distance': 0.0733221383008}, {'neighbor': 6, 'UWI': '00/06-26-068-02W4/0', 'distance': 0.0771552400618}, {'neighbor': 7, 'UWI': '00/11-29-068-02W4/0', 'distance': 0.0918035212614}]",514.0,603.0,89.0,524.0,13000,529.0,2,14000,613.0,1


In [37]:
def combine_DfOfAllWells_with_knnDf(df_all_wells_basic,knn_df):
    """
    Attach the nearest-neighbor (knn) information to the rows of the all-wells dataframe.

    Arguments:
    df_all_wells_basic -- dataframe of all wells with only basic info; must have a UWI column
    knn_df -- dataframe with info on knn neighbor data; must have a UWI column

    Returns a single dataframe that merges the two inputs on the UWI column.
    The join type is the pandas default (inner), so only rows whose UWI is present
    in both inputs are kept.
    """
    return df_all_wells_basic.merge(knn_df, on='UWI')

In [38]:
%%time 
df_all_wells_wKNN = combine_DfOfAllWells_with_knnDf(df_all_wells_basic,wells_df_new_cleaned_plus_nn_wNoNulls)

CPU times: user 693 ms, sys: 390 ms, total: 1.08 s
Wall time: 1.08 s


In [39]:
df_all_wells_wKNN.head()

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.0,,,,,,,0.46,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.55,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.28,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
4,203.664,,,,150.602,,0.24,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1


In [40]:
len(df_all_wells_wKNN)

1482756

In [41]:
len(df_all_wells_wKNN.UWI.unique())

1907

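The dataframe of nearest neighbor information had 1926 rows, but the merged dataframe has only 1907 unique UWIs (see the output above).
Did some of the wells in the import not make it through, or were they kicked out in later steps? Note that the merge is an inner join and the merged row count (1482756) equals the row count of `df_all_wells_basic`, which suggests the missing wells have no rows in the dict of well dataframes rather than being lost in the merge itself.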

Same thing as above but for Dask data frames

In [42]:
def turnDictOfWellDfs_to_SingleDfOfAllWells(dictOfDF):
    """
    Placeholder for the Dask version of the "dict of well dataframes -> single
    dataframe of all wells" step. It is not implemented yet.

    NOTE: this definition reuses the name of the pandas function defined earlier
    in the notebook, so once this cell runs, that name refers to this placeholder.

    Parameters
    ----------
    dictOfDF : dict
        Dict of dataframes, where each dataframe is for a well created by LASIO.
        Currently unused.

    Raises
    ------
    NotImplementedError
        Always. The previous body returned a name that was never assigned, so a
        call failed with a NameError; the failure is now explicit and explained.
    """
    raise NotImplementedError(
        "Dask version of turnDictOfWellDfs_to_SingleDfOfAllWells is not implemented yet"
    )

In [43]:
def combine_DfOfAllWells_with_knnDf(dask_df_all_wells_basic,knn_df):
    """
    Placeholder for the Dask version of the merge between the all-wells dataframe
    and the knn neighbor dataframe on the UWI column. It is not implemented yet.

    NOTE: this definition reuses the name of the pandas function defined earlier
    in the notebook, so once this cell runs, that name refers to this placeholder.

    Parameters
    ----------
    dask_df_all_wells_basic : dataframe of all wells with only basic info. Currently unused.
    knn_df : dataframe with info on knn neighbor data. Currently unused.

    Raises
    ------
    NotImplementedError
        Always. The previous body returned a name that was never assigned, so a
        call failed with a NameError; the failure is now explicit and explained.
    """
    raise NotImplementedError(
        "Dask version of combine_DfOfAllWells_with_knnDf is not implemented yet"
    )

--------------------

## After joining on the nearest neighbor dataframe, we can cast the original columns to floats instead of strings which some but not necessarily all might be. 
When we do this, be careful about variation in depth column name and rename DEPTH and DEPT to DEPTH

In [44]:
columns = list(df_all_wells_wKNN.columns.values)

In [45]:
columns

['CALI',
 'COND',
 'DELT',
 'DENS',
 'DEPT',
 'DEPTH',
 'DPHI',
 'DPHI:1',
 'DPHI:2',
 'DT',
 'GR',
 'GR:1',
 'GR:2',
 'IL',
 'ILD',
 'ILD:1',
 'ILD:2',
 'ILM',
 'LITH',
 'LLD',
 'LLS',
 'NPHI',
 'PHID',
 'PHIN',
 'RESD',
 'RHOB',
 'RT',
 'SFL',
 'SFLU',
 'SN',
 'SNP',
 'SP',
 'UWI',
 'SitID',
 'McMurray_Base_HorID',
 'McMurray_Top_HorID',
 'McMurray_Base_DEPTH',
 'McMurray_Top_DEPTH',
 'McMurray_Base_Qual',
 'McMurray_Top_Qual',
 'lat',
 'lng',
 'Neighbors_Obj',
 'NN1_McMurray_Top_DEPTH',
 'NN1_McMurray_Base_DEPTH',
 'NN1_thickness',
 'MM_Top_Depth_predBy_NN1thick',
 'HorID',
 'Pick',
 'Quality',
 'HorID_paleoz',
 'Pick_paleoz',
 'Quality_paleoz']

In [46]:
# List for turning everything except UWI, SitID, and Neighbors_Obj into a float for easier working with later.
# It is derived from `columns` (the merged dataframe's column names, in order) instead of
# being typed out by hand, so it cannot drift out of sync with the dataframe.
columns_not_to_turn_to_floats = ['UWI', 'SitID', 'Neighbors_Obj']
columns_to_turn_to_floats = [col for col in columns if col not in columns_not_to_turn_to_floats]

In [47]:
%%time
# NOTE(review): astype() returns a new dataframe and the result is not assigned here, so
# df_all_wells_wKNN keeps its original dtypes; this cell only displays a converted copy.
df_all_wells_wKNN[columns_to_turn_to_floats].astype(float)

CPU times: user 1.41 s, sys: 1.32 s, total: 2.73 s
Wall time: 2.74 s


Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.000,,,,,,,0.460,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.550,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.280,,,,,,,0.513,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
4,203.664,,,,150.602,,0.240,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
5,198.617,,,,150.852,,0.242,,,,95.967,,,,8.571,,,,,,,0.470,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
6,194.160,,,,151.102,,0.241,,,,78.239,,,,11.399,,,,,,,0.476,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
7,188.812,,,,151.352,,0.253,,,,64.757,,,,15.404,,,,,,,0.447,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
8,182.164,,,,151.602,,0.268,,,,50.213,,,,20.262,,,,,,,0.403,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0
9,174.410,,,,151.852,,0.277,,,,35.314,,,,25.061,,,,,,,0.366,,,,,,,,,,,14000.0,13000.0,384.66,377.95,1.0,3.0,55.978836,-113.095365,389.0,414.0,25.0,359.66,13000.0,377.95,3.0,14000.0,384.66,1.0


In [48]:
df_all_wells_wKNN

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.000,,,,,,,0.460,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.550,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.280,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
4,203.664,,,,150.602,,0.240,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
5,198.617,,,,150.852,,0.242,,,,95.967,,,,8.571,,,,,,,0.470,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
6,194.160,,,,151.102,,0.241,,,,78.239,,,,11.399,,,,,,,0.476,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
7,188.812,,,,151.352,,0.253,,,,64.757,,,,15.404,,,,,,,0.447,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
8,182.164,,,,151.602,,0.268,,,,50.213,,,,20.262,,,,,,,0.403,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1
9,174.410,,,,151.852,,0.277,,,,35.314,,,,25.061,,,,,,,0.366,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1


In [49]:
df_all_wells_wKNN.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482756 entries, 0 to 1482755
Data columns (total 53 columns):
CALI                            487932 non-null float64
COND                            1550 non-null float64
DELT                            51769 non-null float64
DENS                            1051 non-null float64
DEPT                            1480984 non-null float64
DEPTH                           1772 non-null float64
DPHI                            1343386 non-null float64
DPHI:1                          251 non-null float64
DPHI:2                          251 non-null float64
DT                              15473 non-null float64
GR                              1481034 non-null float64
GR:1                            251 non-null float64
GR:2                            251 non-null float64
IL                              3067 non-null float64
ILD                             1470301 non-null float64
ILD:1                           251 non-null float64
ILD:2       

-----------------

## Now we're going to find some depths! Most of the depths in the wells are from a column called DEPT, but there are a handful of wells that use a column called DEPTH. For convenience's sake, we're going to move the DEPTH values, where not NaN, to the DEPT column so all depths are in the same column. We're also going to try to replace NaNs in GR with GR:1 and GR:2 where data exists.

In [50]:
def useDiffColNamesToFillInNA(dataframeOfWells,colReplaceList):
    """
    Fill NaNs in certain columns with the values from other columns.

    Takes in two arguments,
    Argument one is a dataframe of multiple wells
    Argument two is a list of lists. Where each sub-list is a pair of column names.
    The right col is used to fill in NANs where they exist in left column.
    Pairs are applied in order, so a later pair only fills what earlier pairs left as NaN.
    Example = [[ColA,ColB],[ColF,ColG],[ColZ,ColE]]

    The input dataframe is modified in place and the same object is returned
    (no copy is made), with the NANs in the left columns replaced where the
    right column has a value.
    """
    for each in colReplaceList:
        print("each",each)
        target_col, source_col = each[0], each[1]
        # Assign the filled column back to the frame. Calling fillna(inplace=True) on a
        # column selection is chained assignment: depending on the pandas version it can
        # operate on a temporary copy and leave the dataframe unchanged.
        dataframeOfWells[target_col] = dataframeOfWells[target_col].fillna(dataframeOfWells[source_col])
    return dataframeOfWells

In [51]:
### List of sub-lists. NaNs in the left column of each pair are filled with values from the right column.
colReplaceList = [['DEPT','DEPTH'],['GR','GR:1'],['GR','GR:2']]

In [52]:
#### Fill the NaNs. useDiffColNamesToFillInNA fills the frame it is given and returns that same
#### frame, so df_all_wells_wKNN_DEPTHtoDEPT is df_all_wells_wKNN under a second name, not a copy.
df_all_wells_wKNN_DEPTHtoDEPT = useDiffColNamesToFillInNA(df_all_wells_wKNN,colReplaceList)

each ['DEPT', 'DEPTH']
each ['GR', 'GR:1']
each ['GR', 'GR:2']


In [53]:
#### Look at DEPT to make sure it has gone up, it has!
df_all_wells_wKNN_DEPTHtoDEPT.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1482756 entries, 0 to 1482755
Data columns (total 53 columns):
CALI                            487932 non-null float64
COND                            1550 non-null float64
DELT                            51769 non-null float64
DENS                            1051 non-null float64
DEPT                            1482756 non-null float64
DEPTH                           1772 non-null float64
DPHI                            1343386 non-null float64
DPHI:1                          251 non-null float64
DPHI:2                          251 non-null float64
DT                              15473 non-null float64
GR                              1481285 non-null float64
GR:1                            251 non-null float64
GR:2                            251 non-null float64
IL                              3067 non-null float64
ILD                             1470301 non-null float64
ILD:1                           251 non-null float64
ILD:2       

---------------------

## Create columns for how close a row is (based on depth) from the official pick for that well. 
### We'll be doing this for Top and Base McMurray.

In [54]:
#### Difference (pick minus DEPT) between the top McMurray pick and each row's depth
df_all_wells_wKNN_DEPTHtoDEPT['diff_TMcM_Pick_v_DEPT'] = df_all_wells_wKNN_DEPTHtoDEPT['Pick'].sub(df_all_wells_wKNN_DEPTHtoDEPT['DEPT'])
#### Same difference for the base McMurray / top Paleozoic pick
df_all_wells_wKNN_DEPTHtoDEPT['diff_TPal_Pick_v_DEPT'] = df_all_wells_wKNN_DEPTHtoDEPT['Pick_paleoz'].sub(df_all_wells_wKNN_DEPTHtoDEPT['DEPT'])


In [55]:
#### print a few wells to double check
df_all_wells_wKNN_DEPTHtoDEPT[0:1000]

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,diff_TMcM_Pick_v_DEPT,diff_TPal_Pick_v_DEPT
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.000,,,,,,,0.460,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.348,235.058
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.550,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.098,234.808
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.848,234.558
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.280,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.598,234.308
4,203.664,,,,150.602,,0.240,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.348,234.058
5,198.617,,,,150.852,,0.242,,,,95.967,,,,8.571,,,,,,,0.470,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.098,233.808
6,194.160,,,,151.102,,0.241,,,,78.239,,,,11.399,,,,,,,0.476,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.848,233.558
7,188.812,,,,151.352,,0.253,,,,64.757,,,,15.404,,,,,,,0.447,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.598,233.308
8,182.164,,,,151.602,,0.268,,,,50.213,,,,20.262,,,,,,,0.403,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.348,233.058
9,174.410,,,,151.852,,0.277,,,,35.314,,,,25.061,,,,,,,0.366,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.098,232.808


## IT SHOULD BE NOTED THAT THE 'correct' PICK DEPTHS IN MANY CASES DO NOT PERFECTLY MATCH THE DEPTHS AVAILABLE IN THE LOGS. 
### In other words, the pick might be 105 but there is no row with 105.00 depth, only a 104.98 and a 105.02!
### This matters for what you count as a correct label!

### Create a column for whether a row (based on depth) is exactly at (0.0), within ±0.5, within ±5, or more than 5 from the official pick (encoded as 100, 95, 60, and 0 respectively).

In [56]:
#### Score every row by how close its depth is to the 'real' pick:
####   100 = exactly on the pick, 95 = within +/-0.5, 60 = within +/-5, 0 = anything else
#### Done first for Top McMurray and then for top Paleozoic, which is basically base McMurray
def _pick_proximity_score(pick_minus_depth):
    """Translate a (pick - depth) difference into the 100 / 95 / 60 / 0 category."""
    if pick_minus_depth == 0:
        return 100
    if -0.5 < pick_minus_depth < 0.5:
        return 95
    if -5 < pick_minus_depth < 5:
        return 60
    return 0

df_all_wells_wKNN_DEPTHtoDEPT['cat_isTopMcMrNearby_known'] = (
    df_all_wells_wKNN_DEPTHtoDEPT['diff_TMcM_Pick_v_DEPT'].apply(_pick_proximity_score)
)
#### Top paleozoic version
df_all_wells_wKNN_DEPTHtoDEPT['cat_isTopPalNearby_known'] = (
    df_all_wells_wKNN_DEPTHtoDEPT['diff_TPal_Pick_v_DEPT'].apply(_pick_proximity_score)
)


In [57]:
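#### drop previously created diff column (disabled). NOTE: the name used below, 'diff_Pick_v_DEPT', does not match the frame's actual columns 'diff_TMcM_Pick_v_DEPT' / 'diff_TPal_Pick_v_DEPT'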
#df_all_wells_wKNN_DEPTHtoDEPT.drop(columns=['diff_Pick_v_DEPT'])

In [58]:
#### Spot-check the last five rows, including the two new category columns
df_all_wells_wKNN_DEPTHtoDEPT.tail(n=5)

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,diff_TMcM_Pick_v_DEPT,diff_TPal_Pick_v_DEPT,cat_isTopMcMrNearby_known,cat_isTopPalNearby_known
1482751,,,,,359.0,,0.014,,,,61.724,,,,53.94,,,,,,,0.191,,,,,,,,,,,00/10-35-081-15W4/0,154240,14000,13000,348.0,321.0,1,3,56.066128,-112.234008,"[{'neighbor': 1, 'UWI': '00/10-06-082-14W4/0', 'distance': 0.0542149910726}, {'neighbor': 2, 'UWI': '00/06-25-082-15W4/0', 'distance': 0.0719230322011}, {'neighbor': 3, 'UWI': 'AA/10-29-081-14W4/0', 'distance': 0.0796287068399}, {'neighbor': 4, 'UWI': '00/10-08-082-15W4/0', 'distance': 0.0834816883035}, {'neighbor': 5, 'UWI': 'AA/10-08-081-14W4/0', 'distance': 0.0975490425632}, {'neighbor': 6, 'UWI': '00/10-16-082-14W4/0', 'distance': 0.113194344527}, {'neighbor': 7, 'UWI': '00/10-20-080-14W4/0', 'distance': 0.140246243914}]",300.5,323.5,23.0,325.0,13000,321.0,3,14000,348.0,1,-38.0,-11.0,0,0
1482752,,,,,359.25,,0.014,,,,59.927,,,,63.882,,,,,,,0.167,,,,,,,,,,,00/10-35-081-15W4/0,154240,14000,13000,348.0,321.0,1,3,56.066128,-112.234008,"[{'neighbor': 1, 'UWI': '00/10-06-082-14W4/0', 'distance': 0.0542149910726}, {'neighbor': 2, 'UWI': '00/06-25-082-15W4/0', 'distance': 0.0719230322011}, {'neighbor': 3, 'UWI': 'AA/10-29-081-14W4/0', 'distance': 0.0796287068399}, {'neighbor': 4, 'UWI': '00/10-08-082-15W4/0', 'distance': 0.0834816883035}, {'neighbor': 5, 'UWI': 'AA/10-08-081-14W4/0', 'distance': 0.0975490425632}, {'neighbor': 6, 'UWI': '00/10-16-082-14W4/0', 'distance': 0.113194344527}, {'neighbor': 7, 'UWI': '00/10-20-080-14W4/0', 'distance': 0.140246243914}]",300.5,323.5,23.0,325.0,13000,321.0,3,14000,348.0,1,-38.25,-11.25,0,0
1482753,,,,,359.5,,0.011,,,,58.729,,,,74.245,,,,,,,0.155,,,,,,,,,,,00/10-35-081-15W4/0,154240,14000,13000,348.0,321.0,1,3,56.066128,-112.234008,"[{'neighbor': 1, 'UWI': '00/10-06-082-14W4/0', 'distance': 0.0542149910726}, {'neighbor': 2, 'UWI': '00/06-25-082-15W4/0', 'distance': 0.0719230322011}, {'neighbor': 3, 'UWI': 'AA/10-29-081-14W4/0', 'distance': 0.0796287068399}, {'neighbor': 4, 'UWI': '00/10-08-082-15W4/0', 'distance': 0.0834816883035}, {'neighbor': 5, 'UWI': 'AA/10-08-081-14W4/0', 'distance': 0.0975490425632}, {'neighbor': 6, 'UWI': '00/10-16-082-14W4/0', 'distance': 0.113194344527}, {'neighbor': 7, 'UWI': '00/10-20-080-14W4/0', 'distance': 0.140246243914}]",300.5,323.5,23.0,325.0,13000,321.0,3,14000,348.0,1,-38.5,-11.5,0,0
1482754,,,,,359.75,,0.007,,,,57.529,,,,93.046,,,,,,,0.148,,,,,,,,,,,00/10-35-081-15W4/0,154240,14000,13000,348.0,321.0,1,3,56.066128,-112.234008,"[{'neighbor': 1, 'UWI': '00/10-06-082-14W4/0', 'distance': 0.0542149910726}, {'neighbor': 2, 'UWI': '00/06-25-082-15W4/0', 'distance': 0.0719230322011}, {'neighbor': 3, 'UWI': 'AA/10-29-081-14W4/0', 'distance': 0.0796287068399}, {'neighbor': 4, 'UWI': '00/10-08-082-15W4/0', 'distance': 0.0834816883035}, {'neighbor': 5, 'UWI': 'AA/10-08-081-14W4/0', 'distance': 0.0975490425632}, {'neighbor': 6, 'UWI': '00/10-16-082-14W4/0', 'distance': 0.113194344527}, {'neighbor': 7, 'UWI': '00/10-20-080-14W4/0', 'distance': 0.140246243914}]",300.5,323.5,23.0,325.0,13000,321.0,3,14000,348.0,1,-38.75,-11.75,0,0
1482755,,,,,360.0,,0.006,,,,56.926,,,,138.167,,,,,,,0.14,,,,,,,,,,,00/10-35-081-15W4/0,154240,14000,13000,348.0,321.0,1,3,56.066128,-112.234008,"[{'neighbor': 1, 'UWI': '00/10-06-082-14W4/0', 'distance': 0.0542149910726}, {'neighbor': 2, 'UWI': '00/06-25-082-15W4/0', 'distance': 0.0719230322011}, {'neighbor': 3, 'UWI': 'AA/10-29-081-14W4/0', 'distance': 0.0796287068399}, {'neighbor': 4, 'UWI': '00/10-08-082-15W4/0', 'distance': 0.0834816883035}, {'neighbor': 5, 'UWI': 'AA/10-08-081-14W4/0', 'distance': 0.0975490425632}, {'neighbor': 6, 'UWI': '00/10-16-082-14W4/0', 'distance': 0.113194344527}, {'neighbor': 7, 'UWI': '00/10-20-080-14W4/0', 'distance': 0.140246243914}]",300.5,323.5,23.0,325.0,13000,321.0,3,14000,348.0,1,-39.0,-12.0,0,0


------------------------------

## Use the thickness from the nearest neighbor and this well's base to predict the top, and add that prediction as a feature

In [59]:
# l_df['new_pick']=l_df['Pick']-l_df['DEPT']

In [60]:
# df_all_wells_wKNN['diff_Pick_v_DEPT'] = df_all_wells_wKNN['Pick'] - df_all_wells_wKNN['DEPT']

In [61]:
#### Look at the first value of the neighbor-thickness-based top prediction
df_all_wells_wKNN_DEPTHtoDEPT['MM_Top_Depth_predBy_NN1thick'].iloc[:1]

0    359.66
Name: MM_Top_Depth_predBy_NN1thick, dtype: float64

In [62]:
#### Absolute distance between each row's depth and the neighbor-based predicted top depth
def NN1_TopMcMDepth_Abs(df,MM_Top_Depth_predBy_NN1thick):
    """
    Add a 'DistFrom_NN1_TopDepth_Abs' column to `df` and return `df`.

    df: DataFrame containing a 'DEPT' column plus the column named by the
        second argument.
    MM_Top_Depth_predBy_NN1thick: name of the column holding the predicted
        top depth.

    The new column is |predicted top depth - DEPT| for every row. The frame
    passed in is modified in place; the returned object is that same frame.
    """
    signed_offset = df[MM_Top_Depth_predBy_NN1thick] - df['DEPT']
    df['DistFrom_NN1_TopDepth_Abs'] = signed_offset.abs()
    return df

In [63]:
#### Add the absolute distance-from-predicted-top feature
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM = NN1_TopMcMDepth_Abs(
    df=df_all_wells_wKNN_DEPTHtoDEPT,
    MM_Top_Depth_predBy_NN1thick='MM_Top_Depth_predBy_NN1thick',
)

In [64]:
#### Show only the first rows: the frame has well over a million rows, and with the
#### display options set at the top of the notebook a bare display renders a very large output
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.head(10)

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,diff_TMcM_Pick_v_DEPT,diff_TPal_Pick_v_DEPT,cat_isTopMcMrNearby_known,cat_isTopPalNearby_known,DistFrom_NN1_TopDepth_Abs
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.000,,,,,,,0.460,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.348,235.058,0,0,210.058
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.550,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.098,234.808,0,0,209.808
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.848,234.558,0,0,209.558
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.280,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.598,234.308,0,0,209.308
4,203.664,,,,150.602,,0.240,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.348,234.058,0,0,209.058
5,198.617,,,,150.852,,0.242,,,,95.967,,,,8.571,,,,,,,0.470,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.098,233.808,0,0,208.808
6,194.160,,,,151.102,,0.241,,,,78.239,,,,11.399,,,,,,,0.476,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.848,233.558,0,0,208.558
7,188.812,,,,151.352,,0.253,,,,64.757,,,,15.404,,,,,,,0.447,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.598,233.308,0,0,208.308
8,182.164,,,,151.602,,0.268,,,,50.213,,,,20.262,,,,,,,0.403,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.348,233.058,0,0,208.058
9,174.410,,,,151.852,,0.277,,,,35.314,,,,25.061,,,,,,,0.366,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.098,232.808,0,0,207.808


----------------

## Finally, we'll create a variety of calculated features based on well log numbers at, above, below, and around each depth point.

#### The difficult thing about creating features based on windows within a well when you have multiple wells stacked in a dataframe is that sometimes that window from one well goes into the next well.

#### To get around that, we're going to create a column that says the distance from the top of the well and another column that says the distance from the bottom of the well. When a row's distance from top or bottom is greater than 1/2 the max window size, we'll just proceed as normal. When the distance between that row's depth and top or bottom is less than 1/2 the max window size, we'll ... (TODO: describe how windows near the top or bottom of a well are handled)

In [65]:
#### Flag the first and last row of each well: a row starts a well when the previous
#### row's UWI differs, and ends a well when the next row's UWI differs
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['NewWell'] = (
    df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI']
    .ne(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI'].shift(1))
)
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['LastBitWell'] = (
    df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI']
    .ne(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI'].shift(-1))
)


In [66]:
#### Inspect the first 1000 rows, including the new NewWell / LastBitWell flags
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.head(1000)

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,diff_TMcM_Pick_v_DEPT,diff_TPal_Pick_v_DEPT,cat_isTopMcMrNearby_known,cat_isTopPalNearby_known,DistFrom_NN1_TopDepth_Abs,NewWell,LastBitWell
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.000,,,,,,,0.460,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.348,235.058,0,0,210.058,True,False
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.550,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.098,234.808,0,0,209.808,False,False
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.848,234.558,0,0,209.558,False,False
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.280,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.598,234.308,0,0,209.308,False,False
4,203.664,,,,150.602,,0.240,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.348,234.058,0,0,209.058,False,False
5,198.617,,,,150.852,,0.242,,,,95.967,,,,8.571,,,,,,,0.470,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.098,233.808,0,0,208.808,False,False
6,194.160,,,,151.102,,0.241,,,,78.239,,,,11.399,,,,,,,0.476,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.848,233.558,0,0,208.558,False,False
7,188.812,,,,151.352,,0.253,,,,64.757,,,,15.404,,,,,,,0.447,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.598,233.308,0,0,208.308,False,False
8,182.164,,,,151.602,,0.268,,,,50.213,,,,20.262,,,,,,,0.403,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.348,233.058,0,0,208.058,False,False
9,174.410,,,,151.852,,0.277,,,,35.314,,,,25.061,,,,,,,0.366,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,226.098,232.808,0,0,207.808,False,False


In [67]:
# df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['TopOfWell'] = np.where(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM[NewWell] == True,,


# df['elderly'] = np.where(df['age']>=50, 'yes', 'no')

In [68]:
#### Pull out the rows flagged as the first / last row of each well
TopOfWellRowsOnly = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.loc[
    df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['NewWell']
]
BottomOfWellRowsOnly = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.loc[
    df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['LastBitWell']
]

In [69]:
#### Keep only UWI + depth for the top-of-well rows and rename DEPT to TopWellDept.
#### rename() returns a new frame, so nothing is assigned to (or dropped in place from) a
#### slice of another frame, which is what raised SettingWithCopyWarning before.
TopOfWellRowsOnly = TopOfWellRowsOnly[['UWI','DEPT']].rename(columns={'DEPT': 'TopWellDept'})
#### same thing for bottom
BottomOfWellRowsOnly = BottomOfWellRowsOnly[['UWI','DEPT']].rename(columns={'DEPT': 'BotWellDept'})
#### merge these two small dataframes
TopAndBottomOfWellRowsOnly = pd.merge(TopOfWellRowsOnly, BottomOfWellRowsOnly, on='UWI')
#### merge with larger dataframe
#### NOTE: pd.merge defaults to an inner join, so a well missing either its top row or its bottom row drops out here
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM = pd.merge(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM, TopAndBottomOfWellRowsOnly, on='UWI')

In [70]:
# Preview the first rows to confirm the TopWellDept / BotWellDept columns were merged in
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.head()

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,diff_TMcM_Pick_v_DEPT,diff_TPal_Pick_v_DEPT,cat_isTopMcMrNearby_known,cat_isTopPalNearby_known,DistFrom_NN1_TopDepth_Abs,NewWell,LastBitWell,TopWellDept,BotWellDept
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.0,,,,,,,0.46,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.348,235.058,0,0,210.058,True,False,149.602,396.102
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.55,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.098,234.808,0,0,209.808,False,False,149.602,396.102
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.848,234.558,0,0,209.558,False,False,149.602,396.102
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.28,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.598,234.308,0,0,209.308,False,False,149.602,396.102
4,203.664,,,,150.602,,0.24,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.348,234.058,0,0,209.058,False,False,149.602,396.102


In [71]:
#### Per-row distance to the top and to the bottom of the well, plus the well's total logged thickness
_wells = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM
_rowDepth, _topDepth, _botDepth = _wells['DEPT'], _wells['TopWellDept'], _wells['BotWellDept']

#### distance from row to top of well
_wells['FromTopWell'] = _rowDepth - _topDepth
#### distance from row to bottom of well
_wells['FromBotWell'] = _botDepth - _rowDepth
#### total thickness measured in the well
_wells['WellThickness'] = _botDepth - _topDepth
del _wells, _rowDepth, _topDepth, _botDepth


In [73]:
# TopOfWellRowsOnly['TopWellDept'] = TopOfWellRowsOnly['DEPT']
# TopOfWellRowsOnly

In [74]:
# TopOfWellRowsOnly.drop(['DEPT'],axis=1, inplace=True)
# TopOfWellRowsOnly

#### This adds a column that says whether a row is closer to the bottom or the top of the well
#### This is useful when creating rolling-window features, where you want to avoid reaching into another well stacked above.

In [75]:
#### Label every row with the name of the nearer well edge: 'FromTopWell' when the row is at least
#### as close to the top as to the bottom, otherwise 'FromBotWell'.
#### Rolling-window features can use this to avoid reaching into another well stacked above.
_wells = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM
_nearerTop = _wells['FromTopWell'] <= _wells['FromBotWell']
_wells['closerToBotOrTop'] = np.where(_nearerTop, 'FromTopWell', 'FromBotWell')
del _wells, _nearerTop

In [345]:
# Preview the first rows after adding FromTopWell, FromBotWell, WellThickness and closerToBotOrTop
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.head()

Unnamed: 0,CALI,COND,DELT,DENS,DEPT,DEPTH,DPHI,DPHI:1,DPHI:2,DT,GR,GR:1,GR:2,IL,ILD,ILD:1,ILD:2,ILM,LITH,LLD,LLS,NPHI,PHID,PHIN,RESD,RHOB,RT,SFL,SFLU,SN,SNP,SP,UWI,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,diff_Pick_v_DEPT,cat_isTopMcMrNearby_known,diff_TMcM_Pick_v_DEPT,diff_TPal_Pick_v_DEPT,cat_isTopPalNearby_known,DistFrom_NN1_TopDepth_Abs,NewWell,LastBitWell,TopWellDept,BotWellDept,FromTopWell,FromBotWell,WellThickness,closerToBotOrTop
0,167.003,,,,149.602,,0.227,,,,102.473,,,,0.0,,,,,,,0.46,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.348,0,228.348,235.058,0,210.058,True,False,149.602,396.102,0.0,246.5,246.5,FromTopWell
1,199.159,,,,149.852,,0.263,,,,122.589,,,,4.202,,,,,,,0.55,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,228.098,0,228.098,234.808,0,209.808,False,False,149.602,396.102,0.25,246.25,246.5,FromTopWell
2,200.496,,,,150.102,,0.252,,,,120.196,,,,4.643,,,,,,,0.537,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.848,0,227.848,234.558,0,209.558,False,False,149.602,396.102,0.5,246.0,246.5,FromTopWell
3,203.933,,,,150.352,,0.244,,,,115.975,,,,5.28,,,,,,,0.513,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.598,0,227.598,234.308,0,209.308,False,False,149.602,396.102,0.75,245.75,246.5,FromTopWell
4,203.664,,,,150.602,,0.24,,,,109.271,,,,6.592,,,,,,,0.487,,,,,,,,,,,00/10-32-080-20W4/0,112385,14000,13000,384.66,377.95,1,3,55.978836,-113.095365,"[{'neighbor': 1, 'UWI': '00/10-18-080-20W4/0', 'distance': 0.0507719894529}, {'neighbor': 2, 'UWI': '00/10-18-081-20W4/0', 'distance': 0.0509221450648}, {'neighbor': 3, 'UWI': '00/10-35-080-20W4/0', 'distance': 0.0783130001021}, {'neighbor': 4, 'UWI': '00/07-32-081-20W4/0', 'distance': 0.0836550005977}, {'neighbor': 5, 'UWI': 'AA/11-14-081-20W4/0', 'distance': 0.0841140089937}, {'neighbor': 6, 'UWI': '00/10-23-081-20W4/0', 'distance': 0.0975473690265}, {'neighbor': 7, 'UWI': '00/07-36-079-21W4/0', 'distance': 0.104812341931}]",389.0,414.0,25.0,359.66,13000,377.95,3,14000,384.66,1,227.348,0,227.348,234.058,0,209.058,False,False,149.602,396.102,1.0,245.5,246.5,FromTopWell


In [346]:
def thicknessDivQuarter(thickness, rowSpacing=0.25):
    """
    Convert a depth thickness into a whole number of log rows.

    Parameters
    ----------
    thickness : float
        Depth interval, in the same units as `rowSpacing`.
    rowSpacing : float, optional
        Depth between two consecutive rows. Defaults to 0.25, the spacing this
        function originally assumed, so existing one-argument calls are unchanged.

    Returns
    -------
    int
        Number of rows spanned by `thickness`. A fractional count is rounded down
        so as to avoid going into another well stacked above it.

    Raises
    ------
    ValueError
        If `rowSpacing` is not positive.
    """
    if rowSpacing <= 0:
        raise ValueError("rowSpacing must be positive")
    return math.floor(thickness / rowSpacing)

In [None]:
def isDistTopOrBottonLarger():
    """
    Placeholder that was never implemented.

    The original cell held only the bare `def` line with no colon or body, which is a
    SyntaxError. The intended logic is not recorded anywhere in this notebook, so the
    stub fails loudly instead of guessing at it.
    """
    raise NotImplementedError("isDistTopOrBottonLarger was never implemented")

In [368]:
####  df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
def addColWindowMean(df,col,windowSize,centered):
    """
    Add a rolling-window mean of `col` that never reaches into a neighbouring well.

    Windows are computed separately for every well (rows grouped by 'UWI') with
    `min_periods=1`, so a window next to the top or bottom of a well shrinks to the
    rows that exist inside that well instead of borrowing rows from the well stacked
    above or below it.

    This replaces an np.where(...) formulation that could not run: it combined two
    Series with `and`, indexed the frame with the string values of 'closerToBotOrTop',
    and asked `rolling` for a per-row window size, which it does not support.

    Parameters
    ----------
    df : DataFrame with a 'UWI' column and the curve `col`; rows of each well must be
        in depth order.
    col : name of the curve column to average.
    windowSize : window length in rows.
    centered : "around" (window centred on the row), "above" (row plus the rows
        before it) or "below" (row plus the rows after it). Any other value leaves
        `df` untouched.

    Returns
    -------
    The same `df` object, with the column
    `<col>_mean_<windowSize>winSize_dir<centered>` added in place.
    """
    featureName = col+"_mean_"+str(windowSize)+"winSize_"+"dir"+centered
    if centered not in ("around", "above", "below"):
        return df
    centerWindow = (centered == "around")
    lookBelow = (centered == "below")

    def _wellWindowMean(wellCurve):
        # Reversing turns the trailing window into a look-ahead window; the result is
        # flipped back so it lines up row-for-row with the well it came from.
        ordered = wellCurve.iloc[::-1] if lookBelow else wellCurve
        means = ordered.rolling(window=windowSize, center=centerWindow, min_periods=1).mean()
        return means.iloc[::-1] if lookBelow else means

    df[featureName] = df.groupby('UWI', sort=False)[col].transform(_wellWindowMean)
    return df

In [364]:
# Work on a copy so trial feature columns do not end up on the main dataframe
test_df = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.copy() 

In [None]:
# Trial run: 5-row mean of GR in the "above" direction on the copy (the returned frame is displayed in full)
addColWindowMean(test_df,'GR',5,'above')

In [None]:
####  df['elderly'] = np.where(df['age']>=50, 'yes', 'no')
def addColWindowMean(df,col,windowSize,centered):
    """
    Add a rolling-window mean of `col` to `df` and return `df`.

    Parameters
    ----------
    df : DataFrame holding the curve `col`. For "below" the index must be unique,
        because the result is aligned back onto `df` by index label.
    col : name of the curve column to average.
    windowSize : window length in rows.
    centered : "around" (window centred on the row), "above" (trailing window in
        index order) or "below" (trailing window over the index-reversed curve).
        Any other value leaves `df` untouched.

    Returns
    -------
    The same `df` object, with `<col>_mean_<windowSize>winSize_dir<centered>` added
    in place. Rows without a full window get NaN.
    """
    featureName = col+"_mean_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).mean()
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).mean()
    elif(centered == "below"):
        # Roll over the index-reversed curve so each window covers the rows below.
        # Assigning the Series back realigns it on the index, so the column lands on the
        # caller's frame. Previously it was written to a sorted copy, and callers that
        # kept using their own frame silently lost every "below" feature.
        reversedCurve = df[col].sort_index(ascending=False)
        df[featureName] = reversedCurve.rolling(center=False,window=windowSize).mean()
    return df

In [None]:
####
def addColWindowMax(df,col,windowSize,centered):
    """
    Add a rolling-window maximum of `col` to `df` and return `df`.

    Parameters
    ----------
    df : DataFrame holding the curve `col`. For "below" the index must be unique,
        because the result is aligned back onto `df` by index label.
    col : name of the curve column.
    windowSize : window length in rows.
    centered : "around" (window centred on the row), "above" (trailing window in
        index order) or "below" (trailing window over the index-reversed curve).
        Any other value leaves `df` untouched.

    Returns
    -------
    The same `df` object, with `<col>_max_<windowSize>winSize_dir<centered>` added
    in place. Rows without a full window get NaN.
    """
    featureName = col+"_max_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).max()
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).max()
    elif(centered == "below"):
        # Roll over the index-reversed curve so each window covers the rows below.
        # Assigning the Series back realigns it on the index, so the column lands on the
        # caller's frame. Previously it was written to a sorted copy, and callers that
        # kept using their own frame silently lost every "below" feature.
        reversedCurve = df[col].sort_index(ascending=False)
        df[featureName] = reversedCurve.rolling(center=False,window=windowSize).max()
    return df

In [None]:
### Returns a column with the min values of a window centered 
def addColWindowMin(df,col,windowSize,centered):
    """
    Add a rolling-window minimum of `col` to `df` and return `df`.

    Parameters
    ----------
    df : DataFrame holding the curve `col`. For "below" the index must be unique,
        because the result is aligned back onto `df` by index label.
    col : name of the curve column.
    windowSize : window length in rows.
    centered : "around" (window centred on the row), "above" (trailing window in
        index order) or "below" (trailing window over the index-reversed curve).
        Any other value leaves `df` untouched.

    Returns
    -------
    The same `df` object, with `<col>_min_<windowSize>winSize_dir<centered>` added
    in place. Rows without a full window get NaN.
    """
    featureName = col+"_min_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).min()
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).min()
    elif(centered == "below"):
        # Roll over the index-reversed curve so each window covers the rows below.
        # Assigning the Series back realigns it on the index, so the column lands on the
        # caller's frame. Previously it was written to a sorted copy, and callers that
        # kept using their own frame silently lost every "below" feature.
        reversedCurve = df[col].sort_index(ascending=False)
        df[featureName] = reversedCurve.rolling(center=False,window=windowSize).min()
    return df

In [None]:
#### helper function that takes in array and an integer for the number of highest values to find the mean of 
#### example: for an array = [1,3,6,44,33,22,452] and nValues = 2, the answer would be 44+452 / 2
def nLargest(array,nValues):
    """
    Return the mean of the `nValues` largest entries of `array`.

    Parameters
    ----------
    array : list, numpy array or pandas Series of numbers. It is converted with
        np.asarray first, so plain lists work and a Series is read by position
        rather than by index label.
    nValues : how many of the largest values to average. If it exceeds the length
        of `array`, every value is averaged.

    Returns
    -------
    The mean as a numpy float. A NaN in `array` sorts last and therefore counts as
    one of the "largest" values, making the result NaN.

    Raises
    ------
    ValueError
        If `nValues` is smaller than 1 (a slice of `[-0:]` would silently average
        the whole array instead).
    """
    if nValues < 1:
        raise ValueError("nValues must be at least 1")
    sortedValues = np.sort(np.asarray(array))
    return np.mean(sortedValues[-nValues:])

In [None]:
#### Returns a column with the average of the N largest values of a window 
def addColWindowAvgMaxNvalues(df,col,windowSize,centered,nValues):
    """
    Add a column holding, for each rolling window of `col`, the mean of its
    `nValues` largest values (computed by `nLargest`), and return `df`.

    Parameters
    ----------
    df : DataFrame holding the curve `col`. For "below" the index must be unique,
        because the result is aligned back onto `df` by index label.
    col : name of the curve column.
    windowSize : window length in rows.
    centered : "around" (window centred on the row), "above" (trailing window in
        index order) or "below" (trailing window over the index-reversed curve).
        Any other value leaves `df` untouched.
    nValues : how many of the largest values in each window are averaged.

    Returns
    -------
    The same `df` object, with `<col>_min_<windowSize>winSize_dir<centered>_n<nValues>`
    added in place. The "_min_" in that name is historical (the feature is an average
    of maxima); it is kept because feature lists later in the notebook use these names.
    """
    featureName = col+"_min_"+str(windowSize)+"winSize_"+"dir"+centered+"_n"+str(nValues)
    # raw=True hands nLargest a plain ndarray, so its positional indexing cannot be
    # misread as index-label lookups on a Series.
    meanOfLargest = lambda windowValues: nLargest(windowValues,nValues)
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).apply(meanOfLargest, raw=True)
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).apply(meanOfLargest, raw=True)
    elif(centered == "below"):
        # Roll over the index-reversed curve so each window covers the rows below.
        # Assigning the Series back realigns it on the index, so the column lands on the
        # caller's frame. Previously it was written to a sorted copy, and callers that
        # kept using their own frame silently lost every "below" feature.
        reversedCurve = df[col].sort_index(ascending=False)
        df[featureName] = reversedCurve.rolling(center=False,window=windowSize).apply(meanOfLargest, raw=True)
    return df

In [None]:
####
# Candidate settings for the rolling-window features, grouped by the option they vary
winVars = dict(
    RangeOfCurves=['GR'],
    RangeOfWindows=[5, 11, 29],
    RangeOfWindowsCaution=[5],
    RangeOfDirection=['above', 'below', 'around'],
    MinOrMaxRange=['min', 'max'],
    NumbPtsRange=[1, 5],
)

In [None]:
# Curves, window sizes (in rows) and window directions combined to build the window features
curves, windows, directions = (
    ['GR', 'ILD'],
    [5, 7, 11, 21],
    ["around", "below", "above"],
)

In [None]:
def createCurvFeat(dataframe,curves,windows,directions):
    """
    Add mean / max / min / mean-of-3-largest window features for every combination
    of curve, window size and direction.

    The function used to ignore `dataframe`, read a global `l_df` instead, throw away
    every intermediate result and return nothing. It now works on the frame it is
    given, threads each result into the next call, and returns the final frame.

    Parameters
    ----------
    dataframe : DataFrame holding the curve columns.
    curves : iterable of curve column names.
    windows : iterable of window sizes in rows.
    directions : iterable of direction names passed to the addColWindow* functions.

    Returns
    -------
    The frame returned by the last feature function, carrying all added columns.
    """
    featureFrame = dataframe
    for curve, window, direction in itertools.product(curves, windows, directions):
        featureFrame = addColWindowMean(featureFrame, curve, window, direction)
        featureFrame = addColWindowMax(featureFrame, curve, window, direction)
        featureFrame = addColWindowMin(featureFrame, curve, window, direction)
        featureFrame = addColWindowAvgMaxNvalues(featureFrame, curve, window, direction, 3)
    return featureFrame

In [None]:
def createCurvFeat(dataframe,curves,windows,directions):
    """
    Best-effort version of the window-feature builder: add mean / max / min /
    mean-of-3-largest window features for every curve, window size and direction,
    skipping whatever cannot be computed.

    The function used to ignore `dataframe`, read a global `l_df` instead, discard
    every result, return nothing, and hide all errors behind bare `except: pass`.
    It now works on the frame it is given, returns it, skips absent curves
    explicitly and reports any other failure instead of swallowing it.

    Parameters
    ----------
    dataframe : DataFrame holding the curve columns.
    curves : iterable of curve column names; curves missing from the frame are skipped.
    windows : iterable of window sizes in rows.
    directions : iterable of direction names passed to the addColWindow* functions.

    Returns
    -------
    The frame carrying every feature column that could be computed.
    """
    # (feature function, extra positional arguments after the direction)
    featureBuilders = (
        (addColWindowMean, ()),
        (addColWindowMax, ()),
        (addColWindowMin, ()),
        (addColWindowAvgMaxNvalues, (3,)),
    )
    featureFrame = dataframe
    for curve in curves:
        if curve not in featureFrame.columns:
            print("curve ",curve," is not in the dataframe, skipping its window features")
            continue
        for window, direction in itertools.product(windows, directions):
            for addFeature, extraArgs in featureBuilders:
                try:
                    featureFrame = addFeature(featureFrame, curve, window, direction, *extraArgs)
                except Exception as err:
                    # keep going with the remaining features, but say what was lost and why
                    print("skipped ",addFeature.__name__,(curve, window, direction),": ",repr(err))
    return featureFrame

In [None]:
                                    # NOTE(review): this cell looks like a scratch copy of the window-feature loop inside
                                    # loadAndAddFeatures. It keeps that function's nesting indent, so it cannot run as a
                                    # top-level cell, and it reads `l_df`, which is only assigned inside that function.
                                    curves = ['GR','ILD']
                                    windows = [5,7,11,21]
                                    directions = ["around","below","above"]
                                    comboArg_A = [curves,windows,directions]
                                    all_comboArgs_A = list(itertools.product(*comboArg_A))
                                    for eachArgList in all_comboArgs_A:
                                        try:
                                            l_df_new = addColWindowMean(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                                        except:
                                            pass
                                        try:
                                            l_df_new = addColWindowMax(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                                        except:
                                            pass
                                        try:
                                            l_df_new = addColWindowMin(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                                        except:
                                            pass
                                        try:
                                            l_df_new = addColWindowAvgMaxNvalues(l_df,eachArgList[0],eachArgList[1],eachArgList[2],3)
                                        except:
                                            pass
                                    # Figure out a way to add size of well as a category

## Read in well logs from LAS files and put into Dictionary of Dataframes. As Reading-in, add features

In [None]:
def loadAndAddFeatures():
    """
    Read the LAS logs, attach pick information from `df_new`, and build the
    per-well feature dataframes.

    For every `*.LAS` file in the OilSandsDB Logs directory (at most `count_limit`
    files) the log is loaded with lasio and its UWI is rebuilt from the file name.
    A well that has a row in `df_new` with Quality > -1 then gets:
      * the pick columns of that row copied onto every log row,
      * `new_pick` / `new_pick2` and `new_pick_paleoz` / `new_pick2_paleoz`, which
        compare each row's DEPT with the two picks,
      * the NN1 features and the rolling-window features for GR and ILD.
    A finished well is stored under its UWI when its deepest DEPT is below 600.

    Changes from the earlier version:
      * The files are walked once. The old `while` around the `for` re-read files
        until the counter reached the limit and never ended when no file matched.
      * A log whose depth column is DEPTH is converted properly. Attribute
        assignment (`l_df.DEPT = ...`) never created the column, and
        `l_df.drop([DEPTH])` raised NameError, so those wells were dropped silently.
      * Each feature result is threaded into the next call, so the "below" window
        features are no longer thrown away.
      * Failures are reported and recorded in the failed-wells list instead of
        disappearing behind bare `except: pass`.

    Relies on names defined elsewhere in the notebook: `df_new`, `NN1_TopMcMDepth`,
    `NN1_TopMcMDepth_Abs`, `addColWindowMean`, `addColWindowMax`, `addColWindowMin`
    and `addColWindowAvgMaxNvalues`.

    Returns
    -------
    list
        `[df_w_dict, list_of_failed_wells]`: the dict of per-well dataframes keyed
        by UWI, and the UWIs of wells that could not be processed.
    """
    count_limit = 1663
    list_of_failed_wells = []
    ### dictionary that holds every well as key:value or "UWI":df pair
    df_w_dict = {}
    # columns copied from the well's df_new row onto every row of its log, in this order
    pickColumns = ['SitID', 'Pick', 'HorID', 'Quality',
                   'Pick_paleoz', 'HorID_paleoz', 'Quality_paleoz',
                   'MM_Top_Depth_predBy_NN1thick', 'NN1_thickness']
    curves = ['GR','ILD']
    windows = [5,7,11,21]
    directions = ["around","below","above"]

    def _tryFeature(label, featureCall, frame):
        """Run one feature step; on failure report it and keep the frame as it was."""
        try:
            result = featureCall(frame)
        except Exception as err:
            print("feature ",label," skipped: ",repr(err))
            return frame
        # a step that only mutates in place may return None
        return frame if result is None else result

    lasFiles = glob.glob('../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS')
    for count, file in enumerate(lasFiles, start=1):
        if count > count_limit:
            print("hit limit of count below file for loop")
            break

        l_df = lasio.read(file).df()
        str_uwi= file[-23:-4].replace("-", "/",1)[:17]+file[-6:-4].replace("-", "/",1)

        pickRows = df_new[df_new['UWI'] == str_uwi]
        if pickRows.empty:
            # no pick information for this well
            continue
        if not (pickRows['Quality'].iloc[0] > -1):
            print("skipping ",str_uwi," because its pick Quality is not above -1")
            continue

        l_df = l_df.reset_index()
        l_df['UWI'] = str_uwi
        print("UWI added is ",str_uwi," and type is ",type(str_uwi))
        for pickCol in pickColumns:
            l_df[pickCol] = pickRows[pickCol].iloc[0]
        print("got to end of col append & pick is ",l_df.Pick.unique()[0])

        try:
            print("in first try statement, count = ",count)
            float(l_df.Pick.unique()[0])
            l_df['Pick'] = l_df['Pick'].astype(float)
        except ValueError as e:
            print("e = ",e)
            print ('Error picking')
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(e).__name__, e.args)
            print("message = ",message)
            print("file = ",file)
            print("Got except, UWI added is ",str_uwi," and type is ",type(str_uwi))
            list_of_failed_wells.append(str_uwi)
            continue

        print("str_uwi = ",str_uwi)
        if (('DEPT' not in l_df.columns) and ('DEPTH' not in l_df.columns)):
            print("str_uwi = ",str_uwi, " did not progress as 'DEPT' or 'DEPTH was not a column'")
            list_of_failed_wells.append(str_uwi)
            continue

        # Build a float DEPT column, preferring DEPT and falling back to DEPTH.
        depthSource = None
        for depthCol in ('DEPT', 'DEPTH'):
            if depthCol not in l_df.columns:
                continue
            try:
                l_df['DEPT'] = l_df[depthCol].astype(float)
            except (TypeError, ValueError):
                continue
            depthSource = depthCol
            break
        if depthSource is None:
            print("DEPT or DEPTH is in ",str_uwi," but can't be changed to float type???")
            list_of_failed_wells.append(str_uwi)
            continue
        if depthSource == 'DEPTH':
            l_df = l_df.drop(columns=['DEPTH'])

        try:
            l_df['new_pick']=l_df['Pick']-l_df['DEPT']
            # 10 on the pick itself, 5 when less than 5 depth units away from it, else 0
            l_df['new_pick2']=l_df['new_pick'].apply(lambda x: 10 if x==0 else ( 5 if (-5 < x and x <5) else 0))
            #### doing the same but for BASE mcMurray or Paleozoic surface pick
            float(l_df.Pick_paleoz.unique()[0])
            l_df['Pick_paleoz'] = l_df['Pick_paleoz'].astype(float)
            l_df['new_pick_paleoz']=l_df['Pick_paleoz']-l_df['DEPT']
            l_df['new_pick2_paleoz']=l_df['new_pick_paleoz'].apply(lambda x: 1 if(x==0) else 0)
        except Exception as err:
            print("pick features failed for ",str_uwi,": ",repr(err))
            list_of_failed_wells.append(str_uwi)
            continue

        #### new as of 2018-02
        l_df = _tryFeature('NN1_TopMcMDepth',
                           lambda frame: NN1_TopMcMDepth(frame,'MM_Top_Depth_predBy_NN1thick'), l_df)
        l_df = _tryFeature('NN1_TopMcMDepth_Abs',
                           lambda frame: NN1_TopMcMDepth_Abs(frame,'MM_Top_Depth_predBy_NN1thick'), l_df)

        ##### Features are built per well, before wells are combined, so a window can never
        ##### grab data from the next well up.
        for curve in curves:
            if curve not in l_df.columns:
                print("str_uwi = ",str_uwi," has no ",curve," curve, skipping its window features")
                continue
            for window, direction in itertools.product(windows, directions):
                comboLabel = (curve, window, direction)
                l_df = _tryFeature(('mean',) + comboLabel,
                                   lambda frame: addColWindowMean(frame,curve,window,direction), l_df)
                l_df = _tryFeature(('max',) + comboLabel,
                                   lambda frame: addColWindowMax(frame,curve,window,direction), l_df)
                l_df = _tryFeature(('min',) + comboLabel,
                                   lambda frame: addColWindowMin(frame,curve,window,direction), l_df)
                l_df = _tryFeature(('avgMaxN',) + comboLabel,
                                   lambda frame: addColWindowAvgMaxNvalues(frame,curve,window,direction,3), l_df)

        #### add resultant dataframe to dictionary (only wells whose deepest DEPT is below 600 are kept)
        if l_df['DEPT'].max() < 600:
            df_w_dict[str_uwi] = l_df

    answer = [df_w_dict,list_of_failed_wells]
    return answer

In [None]:
## %timeit
# Runs the full LAS load + feature build; the result is [dict of per-well dataframes, list of failed UWIs]
answer = loadAndAddFeatures()

In [None]:
# `answer` is the two-item list [df_w_dict, list_of_failed_wells]; unpack both parts
df_w_dict, list_of_failed_wells = answer[0], answer[1]

In [None]:
# Display the whole dict of per-well dataframes (every well is printed)
df_w_dict

In [None]:
# Check the container type of the loader's first return item
type(df_w_dict)

In [None]:
# UWIs the loader recorded as failed
list_of_failed_wells

In [None]:
#### dumping dict of data frame to pickle file
wells_and_features_20180704 = df_w_dict
# `with` closes the file handle (flushing the pickle to disk) even if dump() raises;
# the bare open(...) used before was never closed
with open("dict_of_df_wells_and_features_class3_20180704.p", "wb") as pickleFile:
    pickle.dump(wells_and_features_20180704, pickleFile)

## Optional SKIPPING EVERYTHING ABOVE AND STARTING HERE

In [None]:
# Reload the per-well dict written by the pickle.dump cell above, so the slow LAS load can be skipped.
# NOTE(review): unpickling can execute arbitrary code; only load a pickle file you produced yourself.
df_w_dict =  pd.read_pickle('dict_of_df_wells_and_features_class3_20180704.p')

### NOTE: not all well logs were read-in successfully, need to go back and find out why

In [None]:
#print(df_w_dict)

In [None]:
# NOTE(review): list_of_failed_wells is only assigned by the loader cells above and is not part of the
# pickle, so this raises NameError when the notebook is started from the "SKIPPING EVERYTHING ABOVE" point.
print("list_of_failed_wells",list_of_failed_wells)

In [None]:
# Display the feature dataframe of one well, looked up by its UWI (KeyError if that well was not loaded)
df_w_dict['00/04-13-077-05W4/0']

In [None]:
## testing one dataframe of one well in dictionary of all that were successfully read in
## .shape gives (number of rows, number of columns) for that well
df_w_dict['00/01-03-085-15W4/0'].shape

In [None]:
# Number of wells held in the dict
print(len(df_w_dict))

## Turn dictionary of dataframes into single dataframe

In [None]:
def turnDictofDFtoDF(dict_of_df):
    """
    Stack the dataframes held in a dict into one dataframe.

    Parameters
    ----------
    dict_of_df : dict whose values are dataframes (here: one per well, keyed by UWI).

    Returns
    -------
    DataFrame
        The values concatenated row-wise in the dict's iteration order. Each frame
        keeps its own index labels, so labels repeat across wells. An empty dict
        gives an empty DataFrame (pd.concat alone would raise ValueError).
    """
    list_of_df = list(dict_of_df.values())
    if not list_of_df:
        return pd.DataFrame()
    return pd.concat(list_of_df)
        

In [None]:
# Stack every well's dataframe into one frame and show its (rows, columns)
data_df = turnDictofDFtoDF(df_w_dict)
data_df.shape

In [None]:
# Confirm the combined result is a single DataFrame
type(data_df)

In [None]:
#### dumping data frame to pickle file
df_wells_and_features_20180210 = data_df
# `with` closes the file handle (flushing the pickle to disk) even if dump() raises;
# the bare open(...) used before was never closed
with open("df_wells_and_features_20180704_qual_all.p", "wb") as pickleFile:
    pickle.dump(df_wells_and_features_20180210, pickleFile)

Also going to save it to HDF5 file store

In [None]:
import numpy as np
from pandas import HDFStore  # create (or open) an hdf5 file and opens in append mode
# NOTE(review): the store is opened here and no hdf.close() is visible in this part of the notebook;
# confirm it gets closed once the data has been written.
hdf =HDFStore('dataframeOfWellsPlusFeatInOneGo_vA.h5')

In [None]:
# Store the combined dataframe under key 'd1'. format='table' writes the
# queryable table layout and data_columns=True makes every column usable in
# on-disk queries.
hdf.put('d1', data_df, format='table', data_columns=True)

In [None]:
# Spot-check a 200-row slice (1800 up to 2000) of the 'new_pick_paleoz' column.
paleozoic_pick_test = data_df['new_pick_paleoz'][1800:2000]
paleozoic_pick_test

## Key variables that hold data frame column names

In [None]:
# Candidate column names (presumably well-log curve mnemonics — TODO confirm);
# not referenced by the cells shown in this part of the notebook.
keys = ['ILD','DPHI','GR','NPHI','CALI','COND','DELT','RHOB','PHIN','DT','ILM','SP','SFLU','IL','DEPTH','DEPH','MD']

In [None]:
# Shorter subset of the names in `keys`; not referenced by the cells shown in this part of the notebook.
keys2 = ['ILD','DPHI','GR','NPHI','CALI','RHOB']

Adding derivative features

In [None]:
# list() over a DataFrame yields its column labels; use one well as the example.
all_col_names = list(df_w_dict['00/04-13-077-05W4/0'])
all_col_names

In [None]:
# Column names for the first, whole-dataset model.
# 'DEPT' and 'DistFrom_NN1_TopDepth_Abs' are intentionally left out of features2.
features2original = ['CALI','DEPT','DPHI','GR','ILD','NPHI', 'SitID','CALIder','DPHIder','GRder','ILDder']
features2 = [
    'DPHI',
    'NPHI',
    'GR',
    'ILD',
    'SitID',
    'DistFrom_NN1_TopDepth',
    'NN1_thickness',
    'new_pick_paleoz',
] + [
    # Rolling-window statistics, ordered curve -> window size -> direction -> statistic
    name
    for curve in ('GR', 'ILD')
    for win_size in (5, 7, 11, 21)
    for direction in ('around', 'above')
    for name in (
        '%s_mean_%dwinSize_dir%s' % (curve, win_size, direction),
        '%s_max_%dwinSize_dir%s' % (curve, win_size, direction),
        '%s_min_%dwinSize_dir%s' % (curve, win_size, direction),
        '%s_min_%dwinSize_dir%s_n3' % (curve, win_size, direction),
    )
]
# Target column the classifiers are trained to predict
label = 'new_pick2'
# Feature matrix and target taken from ALL rows of data_df (no train/test split here).
# NOTE: train_y is reassigned later from train_df once the split has been made.
train_X2 = data_df[features2]
train_y = data_df[label]

In [None]:
# (rows, columns) of the whole-dataset feature matrix.
train_X2.shape

In [None]:
# from xgboost.sklearn import XGBRegressor

# model2 = XGBRegressor()
# model2.fit(train_X2, train_y)
# result2= model2.predict(train_X2)
# result2

In [None]:
from xgboost.sklearn import XGBClassifier

# Baseline: a default-parameter classifier fitted and then scored on the same
# rows, so result2 holds in-sample predictions only.
model2 = XGBClassifier().fit(train_X2, train_y)
result2 = model2.predict(train_X2)
result2

In [None]:
# Work on a copy so the train/test split below does not touch data_df.
well_data=data_df.copy()

In [None]:
# (rows, columns) of the copy; should match data_df.shape.
well_data.shape

In [None]:
# Split by SitID value rather than by row, so every row sharing a SitID lands
# on the same side of the train/test boundary.
id_array = well_data['SitID'].unique()
# Fixed seed (same value as the `seed` cell further down) so the split is
# reproducible across kernel restarts.
id_array_permutation = np.random.RandomState(123).permutation(id_array)
n_train_ids = int(len(id_array)*.8)
train_index = id_array_permutation[:n_train_ids]
# Start the test slice at n_train_ids (not n_train_ids+1) so that no SitID is
# silently dropped from both sets.
test_index = id_array_permutation[n_train_ids:]
train_df = well_data.loc[well_data['SitID'].isin(train_index)]
test_df = well_data.loc[well_data['SitID'].isin(test_index)]

In [None]:
# Column names for the train/test model.
# 'DEPT' and 'DistFrom_NN1_TopDepth_Abs' are intentionally left out of features.
# NOTE(review): this list uses 'new_pick2_paleoz' whereas features2 above uses
# 'new_pick_paleoz' — confirm both columns exist and which one is intended.
features_originalB = ['CALI','DEPT','DPHI','GR','ILD','NPHI']
features = [
    'DPHI',
    'NPHI',
    'GR',
    'ILD',
    'SitID',
    'DistFrom_NN1_TopDepth',
    'NN1_thickness',
    'new_pick2_paleoz',
] + [
    # Rolling-window statistics, ordered curve -> window size -> direction -> statistic
    name
    for curve in ('GR', 'ILD')
    for win_size in (5, 7, 11, 21)
    for direction in ('around', 'above')
    for name in (
        '%s_mean_%dwinSize_dir%s' % (curve, win_size, direction),
        '%s_max_%dwinSize_dir%s' % (curve, win_size, direction),
        '%s_min_%dwinSize_dir%s' % (curve, win_size, direction),
        '%s_min_%dwinSize_dir%s_n3' % (curve, win_size, direction),
    )
]

# Target column the classifier is trained to predict
label = 'new_pick2'

In [None]:
# Random seed for the analysis.
# NOTE(review): `seed` is not referenced by name in any of the cells shown in this part of the notebook.
seed = 123

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
from xgboost.sklearn import XGBClassifier
# from xgboost.sklearn import XGBRegressor
# Earlier hyper-parameter draft, kept for reference only (not valid Python as written):
#params_final = (
#    gamma=0,
#    alpha=0.2,
#    maxdepth=3,
#    subsample=0.8,
#    colsamplebytree= 0.8,
#    n_estimators= 100,
#    learningrate= 0.1,
#    minchildweight= 1
#)
# Feature matrix / target pairs for the training and held-out SitID groups
train_X, train_y = train_df[features], train_df[label]
test_X, test_y = test_df[features], test_df[label]

In [None]:
# Tuned classifier: fit on the training rows, then predict the class of every
# row in the held-out test set.
model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.03,
    max_depth=3,
    min_child_weight=3,
    gamma=0,
    reg_alpha=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
).fit(train_X, train_y)
result = model.predict(test_X)
result

In [None]:
# Copy of the test rows with the model's predicted class attached as 'Pick_pred'
test_df_pred = test_df.assign(Pick_pred=result)
test_df_pred.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Overall accuracy on the held-out rows: fraction of rows whose predicted
# class ('Pick_pred') equals the true class ('new_pick2').
# NOTE(review): train_test_split is imported but not used in the cells shown here.
accuracy = accuracy_score(test_df_pred['new_pick2'], test_df_pred['Pick_pred'])

In [None]:
# Display the accuracy computed in the previous cell.
accuracy

In [None]:
# Keep only the test rows whose true label ('new_pick2') is class 5
test_df_pred_onlyTopMCM_class5 = test_df_pred[test_df_pred['new_pick2'].eq(5)]

In [None]:
# Number of test rows whose true label is class 5.
print(len(test_df_pred_onlyTopMCM_class5))

In [None]:
# Accuracy restricted to rows whose true label is class 5. Because every true
# label in this subset is 5, this equals the fraction of class-5 rows that the
# model also predicted as 5.
# Stored under its own name (matching accuracy_class10 below) so the overall
# `accuracy` computed earlier is not overwritten.
accuracy_class5 = accuracy_score(test_df_pred_onlyTopMCM_class5['new_pick2'], test_df_pred_onlyTopMCM_class5['Pick_pred'])
accuracy_class5

In [None]:
import pdvega
import vega

In [None]:
# Histogram (100 bins) of 'DistFrom_NN1_TopDepth' for the true class-5 rows.
# `.vgplot` is the plotting accessor registered by the pdvega import above.
# Alternative column noted by the author: 'MM_Top_Depth_predBy_NN1thick'
test_df_pred2_TopScratch2 = test_df_pred_onlyTopMCM_class5[['DistFrom_NN1_TopDepth']]
test_df_pred2_TopScratch2.vgplot.hist(bins=100, alpha=0.5)

In [None]:
# Keep only the test rows whose true label ('new_pick2') is class 10
test_df_pred_onlyTopMCM_class10 = test_df_pred[test_df_pred['new_pick2'].eq(10)]

In [None]:
# Number of test rows whose true label is class 10.
print(len(test_df_pred_onlyTopMCM_class10))

In [None]:
# Accuracy restricted to rows whose true label is class 10, i.e. the fraction
# of class-10 rows that the model also predicted as 10.
accuracy_class10 = accuracy_score(test_df_pred_onlyTopMCM_class10['new_pick2'], test_df_pred_onlyTopMCM_class10['Pick_pred'])
accuracy_class10

In [None]:
# Histogram (100 bins) of 'DistFrom_NN1_TopDepth' for the true class-10 rows.
# NOTE: this reuses (overwrites) the test_df_pred2_TopScratch2 name from the class-5 histogram cell.
# Alternative column noted by the author: 'MM_Top_Depth_predBy_NN1thick'
test_df_pred2_TopScratch2 = test_df_pred_onlyTopMCM_class10[['DistFrom_NN1_TopDepth']]
test_df_pred2_TopScratch2.vgplot.hist(bins=100, alpha=0.5)

In [None]:
# Predicted class vs. 'DEPT' for the true class-10 rows, drawn as red dots.
plt.plot(test_df_pred_onlyTopMCM_class10['DEPT'],test_df_pred_onlyTopMCM_class10['Pick_pred'], 'ro')

In [None]:
# 'DistFrom_NN1_TopDepth' vs. 'DEPT' for the true class-10 rows, drawn as red dots.
plt.plot(test_df_pred_onlyTopMCM_class10['DEPT'],test_df_pred_onlyTopMCM_class10['DistFrom_NN1_TopDepth'], 'ro')

In [None]:
# Scatter of 'Pick' against 'DEPT' for the true class-5 rows, coloured by 'NN1_thickness'.
# NOTE(review): this plots the 'Pick' column, not the 'new_pick2' label used for training — confirm intended.
#c='McMurray_Base_DEPTH'
test_df_pred_onlyTopMCM_class5.vgplot(kind='scatter', x='DEPT', y='Pick',c='NN1_thickness')

In [None]:
# Scatter of 'Pick' against 'DEPT' for the true class-10 rows, coloured by 'NN1_thickness'.
#c='McMurray_Base_DEPTH'
test_df_pred_onlyTopMCM_class10.vgplot(kind='scatter', x='DEPT', y='Pick',c='NN1_thickness')

In [None]:
# Scatter of the predicted class against 'DEPT' for the true class-5 rows, coloured by 'NN1_thickness'.
test_df_pred_onlyTopMCM_class5.vgplot(kind='scatter', x='DEPT', y='Pick_pred',c='NN1_thickness')

In [None]:
# Rows the model PREDICTED as class 10 (regardless of the true label), and how many there are
test_df_pred_onlyTopMCM_class10pred = test_df_pred[test_df_pred['Pick_pred'].eq(10)]
len(test_df_pred_onlyTopMCM_class10pred)

In [None]:
# Peek at the first 20 rows of the test set with predictions attached.
test_df_pred[0:20]

In [None]:
# Number of distinct 'UWI' values present in the test set
len(test_df_pred['UWI'].unique())

In [None]:
# For each SitID keep only the row(s) whose predicted class equals the highest
# predicted class within that SitID; ties keep every tied row.
idx = test_df_pred['Pick_pred'] == test_df_pred.groupby(['SitID'])['Pick_pred'].transform('max')
test_df_pred3 = test_df_pred.loc[idx]
        
        


In [None]:
# Number of rows kept after the per-SitID maximum-prediction filter.
len(test_df_pred3)

In [None]:
# Scatter of 'Pick' against 'DEPT' for the per-SitID top-prediction rows, coloured by 'NN1_thickness'.
#c='McMurray_Base_DEPTH'
test_df_pred3.vgplot(kind='scatter', x='DEPT', y='Pick',c='NN1_thickness')