## Feature Creation notebook
### Goal is to start with dict of dataframes of wells and a few other pieces and create a single dataframe with all the necessary features for all used wells
#### This work is similar to what has been done before but data loading & feature creation is separate and dask is used to speed feature creation
##### by Justin Gosses 2018-07-07

#### Inputs used during this notebook are:
    1. Dict of dataframes of used well created by notebooks in `/loadLAS` directory
    2. A dataframe of nearest neighbor information from a notebook found in the `WellsKNN/` directory
    3. picks_dic a data dictionary for the pick list below = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
    4. pick list = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
    5. well list = pd.read_csv('../../SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
    6. lattitude and longitude for eahc well = pd.read_csv('../../well_lat_lng.csv')

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import welly
from welly import Well
import lasio
import glob
from sklearn import neighbors
import pickle
import math
import dask
import dask.dataframe as dd
from dask.distributed import Client
# import pdvega
# import vega
import dask.dataframe as dd
from dask.distributed import Client
welly.__version__

'0.3.5'

In [2]:
print(dask.__version__)
print(pd.__version__)

0.17.5
0.23.0


In [3]:
%%timeit
import os
env = %env


75.7 µs ± 4.7 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [4]:
from IPython.display import display

In [5]:
#### Test results Part 1
#### Had to change display options to get this to print in full!
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.max_colwidth = 100000

In [6]:
knn_dir = "../WellsKNN/"
load_dir = "../loadLAS"

## We're going to load a pickle file of a previously created dataframe

### The dataframe merges:
1. picks_dic = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
2. picks = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
3. wells = pd.read_csv('../../SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
4. gis = pd.read_csv('../../well_lat_lng.csv')

### It also excludes any wells that have nulls or zeros for Top McMurray or Base McMurray picks
This was done in notebooks: 
1. notebooks_2018/mapmaking/Map_Exploration_v2-KDtree.ipynb
2. notebooks_2018/Test_RUN_2018_02/DataCleaningPrepof_KNN_neighborPickDepth_df_creation_vA_20180210

### Let's load in the pickle file of the dataframe from previous notebook mentioned above

In [8]:
wells_df_new_cleaned_plus_nn_wNoNulls =  pd.read_pickle(knn_dir+'/'+'wells_df__NB_KDtreePost062018_vA__NoMcTopLeak_v2.p')

In [9]:
wells_df_new_cleaned_plus_nn_wNoNulls.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1926 entries, 0 to 2192
Data columns (total 15 columns):
SitID                           1926 non-null int64
McMurray_Base_HorID             1926 non-null int64
McMurray_Top_HorID              1926 non-null int64
McMurray_Base_DEPTH             1926 non-null float64
McMurray_Top_DEPTH              1926 non-null float64
McMurray_Base_Qual              1926 non-null int64
McMurray_Top_Qual               1926 non-null int64
lat                             1926 non-null float64
lng                             1926 non-null float64
UWI                             1926 non-null object
Neighbors_Obj                   1926 non-null object
NN1_McMurray_Top_DEPTH          1921 non-null float64
NN1_McMurray_Base_DEPTH         1926 non-null float64
NN1_thickness                   1921 non-null float64
MM_Top_Depth_predBy_NN1thick    1921 non-null float64
dtypes: float64(8), int64(5), object(2)
memory usage: 240.8+ KB


In [10]:
wells_df_new_cleaned_plus_nn_wNoNulls.head()

Unnamed: 0,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,UWI,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick
0,102496,14000,13000,561.0,475.0,1,3,54.785907,-110.12932,00/12-08-067-01W4/0,"[{'neighbor': 1, 'UWI': '00/09-11-067-02W4/0',...",544.0,630.0,86.0,475.0
1,102497,14000,13000,604.5,515.0,1,3,54.782284,-110.269446,00/07-08-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0',...",529.0,613.0,84.0,520.5
2,102498,14000,13000,564.0,480.0,1,3,54.785892,-110.186851,00/09-11-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/12-08-067-01W4/0',...",529.0,613.0,84.0,480.0
3,102500,14000,13000,636.5,549.0,1,3,54.829624,-110.269422,00/10-29-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-08-068-02W4/0',...",529.0,613.0,84.0,552.5
4,102501,14000,13000,613.0,529.0,1,2,54.840471,-110.224832,00/06-34-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0',...",514.0,603.0,89.0,524.0


In [11]:
print(len(wells_df_new_cleaned_plus_nn_wNoNulls))

1926


In [12]:
wells_df_new_cleaned_plus_nn_wNoNulls.head()

Unnamed: 0,SitID,McMurray_Base_HorID,McMurray_Top_HorID,McMurray_Base_DEPTH,McMurray_Top_DEPTH,McMurray_Base_Qual,McMurray_Top_Qual,lat,lng,UWI,Neighbors_Obj,NN1_McMurray_Top_DEPTH,NN1_McMurray_Base_DEPTH,NN1_thickness,MM_Top_Depth_predBy_NN1thick
0,102496,14000,13000,561.0,475.0,1,3,54.785907,-110.12932,00/12-08-067-01W4/0,"[{'neighbor': 1, 'UWI': '00/09-11-067-02W4/0',...",544.0,630.0,86.0,475.0
1,102497,14000,13000,604.5,515.0,1,3,54.782284,-110.269446,00/07-08-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0',...",529.0,613.0,84.0,520.5
2,102498,14000,13000,564.0,480.0,1,3,54.785892,-110.186851,00/09-11-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/12-08-067-01W4/0',...",529.0,613.0,84.0,480.0
3,102500,14000,13000,636.5,549.0,1,3,54.829624,-110.269422,00/10-29-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-08-068-02W4/0',...",529.0,613.0,84.0,552.5
4,102501,14000,13000,613.0,529.0,1,2,54.840471,-110.224832,00/06-34-067-02W4/0,"[{'neighbor': 1, 'UWI': '00/10-29-067-02W4/0',...",514.0,603.0,89.0,524.0


In [13]:
df_new = wells_df_new_cleaned_plus_nn_wNoNulls
# df_new["UWI (AGS)"] = df_new["UWI (AGS)_x"]
df_new["UWI"] = df_new["UWI"]
df_new["HorID"] = df_new["McMurray_Top_HorID"]
df_new["Pick"] = df_new["McMurray_Top_DEPTH"]
df_new["Quality"] = df_new["McMurray_Top_Qual"]
df_new["HorID_paleoz"] = df_new["McMurray_Base_HorID"]
df_new["Pick_paleoz"] = df_new["McMurray_Base_DEPTH"]
df_new["Quality_paleoz"] = df_new["McMurray_Base_Qual"]
df_new = df_new[["SitID","UWI","HorID","Pick","Quality","HorID_paleoz","Pick_paleoz","Quality_paleoz",'lat','lng','MM_Top_Depth_predBy_NN1thick','NN1_thickness']]
df_new

Unnamed: 0,SitID,UWI,HorID,Pick,Quality,HorID_paleoz,Pick_paleoz,Quality_paleoz,lat,lng,MM_Top_Depth_predBy_NN1thick,NN1_thickness
0,102496,00/12-08-067-01W4/0,13000,475.00,3,14000,561.00,1,54.785907,-110.129320,475.00,86.00
1,102497,00/07-08-067-02W4/0,13000,515.00,3,14000,604.50,1,54.782284,-110.269446,520.50,84.00
2,102498,00/09-11-067-02W4/0,13000,480.00,3,14000,564.00,1,54.785892,-110.186851,480.00,84.00
3,102500,00/10-29-067-02W4/0,13000,549.00,3,14000,636.50,1,54.829624,-110.269422,552.50,84.00
4,102501,00/06-34-067-02W4/0,13000,529.00,2,14000,613.00,1,54.840471,-110.224832,524.00,89.00
5,102503,00/11-04-067-03W4/0,13000,488.50,2,14000,553.50,1,54.771449,-110.402983,489.00,64.50
6,102505,00/10-08-067-03W4/0,13000,501.50,2,14000,572.50,1,54.785901,-110.422131,508.00,64.50
7,102507,00/10-14-067-03W4/0,13000,553.50,2,14000,606.50,1,54.800533,-110.345762,542.00,64.50
8,102514,00/10-28-067-03W4/0,13000,493.50,3,14000,558.00,1,54.829633,-110.396621,487.00,71.00
9,102517,00/07-36-067-03W4/0,13000,536.50,2,14000,615.00,1,54.840441,-110.320301,562.00,53.00


In [14]:
print(len(df_new))

1926


In [15]:
#### Number of unique wells based on UWI
len(df_new.UWI.unique())

1926

In [16]:
df_new_test = df_new[['UWI']]

In [17]:
print(any(df_new_test.UWI == '00/11-04-067-03W4/0'))

True


## Brainstorm 1 : various features to extract
1. Find average values of [each curve] in [different] length windows [above, around, below] a depth
2. Find average for [different number of] [max, min] values in [different] length windows [above, around, below] a depth for [each curve]
3. Find gradient within [window length] around a depth point 
4. Find [min, avg, max] gradient of various [smaller window length] within a larger [larger window length]
5. Find number of gradient changes (negative to positive) of various [smaller window length] within a larger [larger window length]
6. Find difference between two windows on either side of a given depth. For each window, average for [different number of] [max, min] values in [different] length windows above & below a given depth for [each curve].

In [18]:
####
def addColWindowMean(df,col,windowSize,centered):
    featureName = col+"_mean_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).mean() 
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).mean() 
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        df[featureName] = df[col].rolling(center=False,window=windowSize).mean() 
        #### unreverse
        df = df.sort_index(ascending=True)
    return df

In [19]:
####
def addColWindowMax(df,col,windowSize,centered):
    featureName = col+"_max_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).max() 
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).max() 
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        df[featureName] = df[col].rolling(center=False,window=windowSize).max() 
        #### unreverse
        df = df.sort_index(ascending=True)
    return df

In [20]:
#### Returns a column with the min values of a window centered 
def addColWindowMin(df,col,windowSize,centered):
    featureName = col+"_min_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).min() 
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).min() 
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        df[featureName] = df[col].rolling(center=False,window=windowSize).min() 
        #### unreverse
        
        df = df.sort_index(ascending=True)
    return df

In [21]:
#### helper function that takes in array and an integer for the number of highest values to find the mean of 
#### example: for an array = [1,3,6,44,33,22,452] and nValues = 2, the answer would be 44+452 / 2
def nLargest(array,nValues):
    answer = np.mean(array[np.argsort(array)[-nValues:]])  
    return answer

In [22]:
#### Returns a column with the average of the N largest values of a window 
def addColWindowAvgMaxNvalues(df,col,windowSize,centered,nValues):
    #df[featureName] = df[col].rolling(center=True,window=windowSize).nlargest(nValues).mean() 
    #return df
    featureName = col+"_min_"+str(windowSize)+"winSize_"+"dir"+centered+"_n"+str(nValues)
    if(centered == "around"):
        #df[featureName] = df[col].rolling(center=True,window=windowSize).nlargest(nValues).mean() 
        df[featureName] = df[col].rolling(center=True,window=windowSize).apply(lambda x: nLargest(x,nValues))
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).apply(lambda x: nLargest(x,nValues))
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        #   # df['new_column'] = df.apply(lambda x: your_function(x['High'],x['Low'],x['Close']), axis=1)
        df[featureName] = df[col].rolling(center=False,window=windowSize).apply(lambda x: nLargest(x,nValues))
        #df[featureName] = df[col].rolling(center=False,window=windowSize).nlargest(nValues).mean() 
        #### unreverse
        df = df.sort_index(ascending=True)
    return df

In [23]:
#### Takes MM_Top_Depth_predBy_NN1thick and subtracts depth at that point, returns value
def NN1_TopMcMDepth(df,MM_Top_Depth_predBy_NN1thick):
    df['DistFrom_NN1_TopDepth'] = df[MM_Top_Depth_predBy_NN1thick] - df['DEPT'] 
    return df


In [24]:
#### Takes MM_Top_Depth_predBy_NN1thick and subtracts depth at that point, returns *absolute* value
def NN1_TopMcMDepth_Abs(df,MM_Top_Depth_predBy_NN1thick):
    df['DistFrom_NN1_TopDepth_Abs'] = math.fabs(df[MM_Top_Depth_predBy_NN1thick] - df['DEPT'])
    return df

In [25]:
####
winVars = {"RangeOfCurves":['GR'],
                   "RangeOfWindows":[5,11,29],
                   "RangeOfWindowsCaution":[5],
                   "RangeOfDirection":['above','below','around'],
                   "MinOrMaxRange":['min','max'],
                   "NumbPtsRange":[1,5]}

## TRIAL : make dictionary simple first

In [35]:
def loadAndNoFeatures():
    count=0
    data_df=[]
    count_limit =1663
    list_of_failed_wells = []
    ### dictionary that holds every well as key:value or "UWI":df pair
    df_w_dict ={}
    while count < count_limit:
        for file in glob.glob('../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS'):
            #### NOTE: limiting wells being read-in to 101 here !!!!!!!!!!!!!!!!
            count+=1
            if count > count_limit:
                print("hit limit of count below file for loop")
                answer = [df_w_dict,list_of_failed_wells]
                return answer
            else:
                l_df = lasio.read(file).df()
                
                str_uwi= file[-23:-4].replace("-", "/",1)[:17]+file[-6:-4].replace("-", "/",1)
                #l_df.DEPT = l_df.DEPT.astype(float)
                ##bottom_well_depth = l_df['DEPT'].max()
                if any(df_new.UWI == str_uwi):
                    if df_new[df_new['UWI']==str_uwi]['Quality'].iloc[0] > -1:
                        l_df = l_df.reset_index()
#                         print(l_df['DEPT'])
                        print("got to UWI apppend")
                        l_df['UWI'] = str_uwi
                        df_w_dict[l_df['UWI'][0]]= l_df
#                         print("UWI added is ",str_uwi," and type is ",type(str_uwi))
#                         l_df['SitID']=df_new[df_new['UWI']==str_uwi]['SitID'].iloc[0]
# #                         l_df['UWI (AGS)']=df_new[df_new['UWI']==str_uwi]['UWI (AGS)'].iloc[0]
#                         l_df['Pick']=df_new[df_new['UWI']==str_uwi]['Pick'].iloc[0] 
#                         l_df['HorID']=df_new[df_new['UWI']==str_uwi]['HorID'].iloc[0]
#                         l_df['Quality']=df_new[df_new['UWI']==str_uwi]['Quality'].iloc[0]
#                         #### adding in paleozoic surface pick
#                         l_df['Pick_paleoz']=df_new[df_new['UWI']==str_uwi]['Pick_paleoz'].iloc[0] 
#                         l_df['HorID_paleoz']=df_new[df_new['UWI']==str_uwi]['HorID_paleoz'].iloc[0]
#                         l_df['Quality_paleoz']=df_new[df_new['UWI']==str_uwi]['Quality_paleoz'].iloc[0]
                        
#                         #### new as of 2018-02
#                         l_df['MM_Top_Depth_predBy_NN1thick']=df_new[df_new['UWI']==str_uwi]['MM_Top_Depth_predBy_NN1thick'].iloc[0]
#                         l_df['NN1_thickness']=df_new[df_new['UWI']==str_uwi]['NN1_thickness'].iloc[0]
                        
#                         print(l_df)
#                         print("got to end of col append & pick is ",l_df.Pick.unique()[0])  
#                         try:
#                             print("in first try statement, count = ",count)
#                             float(l_df.Pick.unique()[0])
#                             l_df.Pick = l_df.Pick.astype(float)
#                             print("str_uwi = ",str_uwi)
#                             if (('DEPT' not in l_df.columns) and ('DEPTH' not in l_df.columns)):
#                                 print("str_uwi = ",str_uwi, " did not progress as 'DEPT' or 'DEPTH was not a column'")
#                                 list_of_failed_wells.append(str_uwi)
#                             else:
#                                 try:
#                                     l_df.DEPT = l_df.DEPT.astype(float)
#                                 except:
#                                     try:
#                                         l_df.DEPT = l_df.DEPTH.astype(float)
#                                         l_df.drop([DEPTH])
#                                     except:
#                                         print("DEPT or DEPTH is in ",str_uwi," but can't be changed to float type???")
#                                 try:
#                                     l_df['new_pick']=l_df['Pick']-l_df['DEPT']
#         #                             l_df['new_pick2']=l_df['new_pick'].apply(lambda x: 1 if(x==0) else 0)
#                                     l_df['new_pick2']=l_df['new_pick'].apply(lambda x: 10 if x==0 else ( 5 if (-5 < x and x <5) else 0))
#         #                             lambda x: x*10 if x<2 else (x**2 if x<4 else x+10)
#                                     #### doing the same as below but for BASE mcMurray or Paleozoic surface pick
#                                     float(l_df.Pick_paleoz.unique()[0])
#                                     l_df.Pick_paleoz = l_df.Pick_paleoz.astype(float)
#                                     #l_df.DEPT = l_df.DEPT.astype(float)
#                                     l_df['new_pick_paleoz']=l_df['Pick_paleoz']-l_df['DEPT']
#                                     l_df['new_pick2_paleoz']=l_df['new_pick_paleoz'].apply(lambda x: 1 if(x==0) else 0)

#                                     #### new as of 2018-02
#                                     try:
#                                         l_df_new = NN1_TopMcMDepth(l_df,'MM_Top_Depth_predBy_NN1thick')
#                                     except:
#                                             pass
#                                     try:
#                                         l_df_new = NN1_TopMcMDepth_Abs(l_df,'MM_Top_Depth_predBy_NN1thick')
#                                     except:
#                                             pass

#                                     print("got to below astype part")
#                                     #### instead of concat into a single dataframe, run functions & then add to dictionary   
#                                     ##### run functions to create features on array basis for each well in separate dataframe
#                                     ##### this makes certain things easier, compared to everything in a single dataframe, like making sure you don't accidentally grab data from next well up
#                                     ##### and will make it easier to write data back to LAS if we run into memory limitations later
#                                     curves = ['GR','ILD']
#                                     windows = [5,7,11,21]
#                                     directions = ["around","below","above"]
#                                     comboArg_A = [curves,windows,directions]
#                                     all_comboArgs_A = list(itertools.product(*comboArg_A))
#                                     for eachArgList in all_comboArgs_A:
#                                         try:
#                                             l_df_new = addColWindowMean(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
#                                         except:
#                                             pass
#                                         try:
#                                             l_df_new = addColWindowMax(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
#                                         except:
#                                             pass
#                                         try:
#                                             l_df_new = addColWindowMin(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
#                                         except:
#                                             pass
#                                         try:
#                                             l_df_new = addColWindowAvgMaxNvalues(l_df,eachArgList[0],eachArgList[1],eachArgList[2],3)
#                                         except:
#                                             pass
#                                     #### add resultant dataframe to dictionary
#                                     if l_df['DEPT'].max() < 600:
#                                         df_w_dict[l_df_new['UWI'][0]]= l_df_new
#                                 except:
#                                     pass
#                         except ValueError as e:
#                             print("e = ",e)
#                             print ('Error picking')
#                             template = "An exception of type {0} occurred. Arguments:\n{1!r}"
#                             message = template.format(type(e).__name__, e.args)
#                             print("message = ",message)
#                             print("file = ",file)
#                             print("Got except, UWI added is ",str_uwi," and type is ",type(str_uwi))
#                             list_of_failed_wells.append(str_uwi)
#                             #continue;
                    else:
                        print("could not find UWI match for the well")
                        pass
                else:
                    pass
            #print("result = ",df_w_dict)
    #else: 
    #    return df_w_dict, list_of_failed_wells
    answer = [df_w_dict,list_of_failed_wells]
    
    return initial_well_dict

In [36]:
initial_well_dict = loadAndNoFeatures()

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

Header section Parameter regexp=~P was not found.


got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI apppend
got to UWI a

In [37]:
#answer=[df_w_dict,list_of_failed_wells]
dict_of_well_df = initial_well_dict[0]
list_of_failed_wells = initial_well_dict[1]

In [38]:
print("list_of_failed_wells",list_of_failed_wells)
dict_of_well_df

list_of_failed_wells []


{'00/10-21-078-26W4/0':        DEPT     DELT       GR     ILD                  UWI
 0    411.48  310.798   59.316  22.461  00/10-21-078-26W4/0
 1    411.73  289.773   53.073  30.512  00/10-21-078-26W4/0
 2    411.98  252.559   48.435  34.455  00/10-21-078-26W4/0
 3    412.23  260.422   45.294  32.269  00/10-21-078-26W4/0
 4    412.48  287.479   40.856  27.848  00/10-21-078-26W4/0
 5    412.73  308.897   41.623  21.463  00/10-21-078-26W4/0
 6    412.98  299.268   47.515  17.121  00/10-21-078-26W4/0
 7    413.23  298.019   54.158  15.136  00/10-21-078-26W4/0
 8    413.48  301.013   60.626  12.843  00/10-21-078-26W4/0
 9    413.73  298.447   65.089  11.869  00/10-21-078-26W4/0
 10   413.98  329.645   64.518  11.195  00/10-21-078-26W4/0
 11   414.23  342.848   62.973  10.961  00/10-21-078-26W4/0
 12   414.48  356.874   64.885  11.454  00/10-21-078-26W4/0
 13   414.73  361.827   67.310  12.515  00/10-21-078-26W4/0
 14   414.98  362.851   67.211  13.665  00/10-21-078-26W4/0
 15   415.23  369

In [41]:
#### dumping dict of data frame to pickle file
dict_wells_df_and_Nofeatures_20180707 = dict_of_well_df
pickle.dump(dict_wells_df_and_Nofeatures_20180707, open( "dict_of__wells_df_No_features_class3_20180707.p", "wb" ) )

## Read in well logs from LAS files and put into Dictionary of Dataframes. As Reading-in, add features

In [None]:
def loadAndAddFeatures():
    count=0
    data_df=[]
    count_limit =1663
    list_of_failed_wells = []
    ### dictionary that holds every well as key:value or "UWI":df pair
    df_w_dict ={}
    while count < count_limit:
        for file in glob.glob('../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS'):
            #### NOTE: limiting wells being read-in to 101 here !!!!!!!!!!!!!!!!
            count+=1
            if count > count_limit:
                print("hit limit of count below file for loop")
                answer = [df_w_dict,list_of_failed_wells]
                return answer
            else:
                l_df = lasio.read(file).df()
                
                str_uwi= file[-23:-4].replace("-", "/",1)[:17]+file[-6:-4].replace("-", "/",1)
                #l_df.DEPT = l_df.DEPT.astype(float)
                ##bottom_well_depth = l_df['DEPT'].max()
                if any(df_new.UWI == str_uwi):
                    if df_new[df_new['UWI']==str_uwi]['Quality'].iloc[0] > -1:
                        l_df = l_df.reset_index()
#                         print(l_df['DEPT'])
                        print("got to UWI apppend")
                        l_df['UWI'] = str_uwi
                        print("UWI added is ",str_uwi," and type is ",type(str_uwi))
                        l_df['SitID']=df_new[df_new['UWI']==str_uwi]['SitID'].iloc[0]
#                         l_df['UWI (AGS)']=df_new[df_new['UWI']==str_uwi]['UWI (AGS)'].iloc[0]
                        l_df['Pick']=df_new[df_new['UWI']==str_uwi]['Pick'].iloc[0] 
                        l_df['HorID']=df_new[df_new['UWI']==str_uwi]['HorID'].iloc[0]
                        l_df['Quality']=df_new[df_new['UWI']==str_uwi]['Quality'].iloc[0]
                        #### adding in paleozoic surface pick
                        l_df['Pick_paleoz']=df_new[df_new['UWI']==str_uwi]['Pick_paleoz'].iloc[0] 
                        l_df['HorID_paleoz']=df_new[df_new['UWI']==str_uwi]['HorID_paleoz'].iloc[0]
                        l_df['Quality_paleoz']=df_new[df_new['UWI']==str_uwi]['Quality_paleoz'].iloc[0]
                        
                        #### new as of 2018-02
                        l_df['MM_Top_Depth_predBy_NN1thick']=df_new[df_new['UWI']==str_uwi]['MM_Top_Depth_predBy_NN1thick'].iloc[0]
                        l_df['NN1_thickness']=df_new[df_new['UWI']==str_uwi]['NN1_thickness'].iloc[0]
                        
                        print(l_df)
                        print("got to end of col append & pick is ",l_df.Pick.unique()[0])  
                        try:
                            print("in first try statement, count = ",count)
                            float(l_df.Pick.unique()[0])
                            l_df.Pick = l_df.Pick.astype(float)
                            print("str_uwi = ",str_uwi)
                            if (('DEPT' not in l_df.columns) and ('DEPTH' not in l_df.columns)):
                                print("str_uwi = ",str_uwi, " did not progress as 'DEPT' or 'DEPTH was not a column'")
                                list_of_failed_wells.append(str_uwi)
                            else:
                                try:
                                    l_df.DEPT = l_df.DEPT.astype(float)
                                except:
                                    try:
                                        l_df.DEPT = l_df.DEPTH.astype(float)
                                        l_df.drop([DEPTH])
                                    except:
                                        print("DEPT or DEPTH is in ",str_uwi," but can't be changed to float type???")
                                try:
                                    l_df['new_pick']=l_df['Pick']-l_df['DEPT']
        #                             l_df['new_pick2']=l_df['new_pick'].apply(lambda x: 1 if(x==0) else 0)
                                    l_df['new_pick2']=l_df['new_pick'].apply(lambda x: 10 if x==0 else ( 5 if (-5 < x and x <5) else 0))
        #                             lambda x: x*10 if x<2 else (x**2 if x<4 else x+10)
                                    #### doing the same as below but for BASE mcMurray or Paleozoic surface pick
                                    float(l_df.Pick_paleoz.unique()[0])
                                    l_df.Pick_paleoz = l_df.Pick_paleoz.astype(float)
                                    #l_df.DEPT = l_df.DEPT.astype(float)
                                    l_df['new_pick_paleoz']=l_df['Pick_paleoz']-l_df['DEPT']
                                    l_df['new_pick2_paleoz']=l_df['new_pick_paleoz'].apply(lambda x: 1 if(x==0) else 0)

                                    #### new as of 2018-02
                                    try:
                                        l_df_new = NN1_TopMcMDepth(l_df,'MM_Top_Depth_predBy_NN1thick')
                                    except:
                                            pass
                                    try:
                                        l_df_new = NN1_TopMcMDepth_Abs(l_df,'MM_Top_Depth_predBy_NN1thick')
                                    except:
                                            pass

                                    print("got to below astype part")
                                    #### instead of concat into a single dataframe, run functions & then add to dictionary   
                                    ##### run functions to create features on array basis for each well in separate dataframe
                                    ##### this makes certain things easier, compared to everything in a single dataframe, like making sure you don't accidentally grab data from next well up
                                    ##### and will make it easier to write data back to LAS if we run into memory limitations later
                                    curves = ['GR','ILD']
                                    windows = [5,7,11,21]
                                    directions = ["around","below","above"]
                                    comboArg_A = [curves,windows,directions]
                                    all_comboArgs_A = list(itertools.product(*comboArg_A))
                                    for eachArgList in all_comboArgs_A:
                                        try:
                                            l_df_new = addColWindowMean(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                                        except:
                                            pass
                                        try:
                                            l_df_new = addColWindowMax(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                                        except:
                                            pass
                                        try:
                                            l_df_new = addColWindowMin(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                                        except:
                                            pass
                                        try:
                                            l_df_new = addColWindowAvgMaxNvalues(l_df,eachArgList[0],eachArgList[1],eachArgList[2],3)
                                        except:
                                            pass
                                    #### add resultant dataframe to dictionary
                                    if l_df['DEPT'].max() < 600:
                                        df_w_dict[l_df_new['UWI'][0]]= l_df_new
                                except:
                                    pass
                        except ValueError as e:
                            print("e = ",e)
                            print ('Error picking')
                            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                            message = template.format(type(e).__name__, e.args)
                            print("message = ",message)
                            print("file = ",file)
                            print("Got except, UWI added is ",str_uwi," and type is ",type(str_uwi))
                            list_of_failed_wells.append(str_uwi)
                            #continue;
                    else:
                        print("could not find UWI match for the well")
                        pass
                else:
                    pass
            #print("result = ",df_w_dict)
    #else: 
    #    return df_w_dict, list_of_failed_wells
    answer = [df_w_dict,list_of_failed_wells]
    
    return answer

In [None]:
## %timeit
answer = loadAndAddFeatures()

In [None]:
#answer=[df_w_dict,list_of_failed_wells]
df_w_dict = answer[0]
list_of_failed_wells = answer[1]

In [None]:
df_w_dict

In [None]:
type(df_w_dict)

In [None]:
list_of_failed_wells

In [None]:
#### dumping dict of data frame to pickle file
wells_and_features_20180704 = df_w_dict
pickle.dump(wells_and_features_20180704, open( "dict_of_df_wells_and_features_class3_20180704.p", "wb" ) )

## Optional SKIPPING EVERYTHING ABOVE AND STARTING HERE

In [None]:
df_w_dict =  pd.read_pickle('dict_of_df_wells_and_features_class3_20180704.p')

### NOTE: not all well logs were read-in successfully, need to go back and find out why

In [None]:
#print(df_w_dict)

In [None]:
print("list_of_failed_wells",list_of_failed_wells)

In [None]:
df_w_dict['00/04-13-077-05W4/0']

In [None]:
## testing one dataframe of one well in dictionary of all that were successfully read in
df_w_dict['00/01-03-085-15W4/0'].shape

In [None]:
print(len(df_w_dict))

## Turn dictionary of dataframes into single dataframe

In [None]:
def turnDictofDFtoDF(dict_of_df):
    data_df = pd.DataFrame()
    list_of_df = []
    values = dict_of_df.values()
    for each in values:
        list_of_df.append(each)
    data_df = pd.concat(list_of_df)
    return data_df
        

In [None]:
data_df = turnDictofDFtoDF(df_w_dict)
data_df.shape

In [None]:
type(data_df)

In [None]:
#### dumpingdata frame to pickle file
df_wells_and_features_20180210 = data_df
pickle.dump(df_wells_and_features_20180210, open( "df_wells_and_features_20180704_qual_all.p", "wb" ) )

Also going to save it to HDF5 file store

In [None]:
import numpy as np
from pandas import HDFStore  # create (or open) an hdf5 file and opens in append mode
hdf =HDFStore('dataframeOfWellsPlusFeatInOneGo_vA.h5')

In [None]:
hdf.put('d1', data_df, format='table', data_columns=True)

In [None]:
paleozoic_pick_test = data_df['new_pick_paleoz'][1800:2000]
paleozoic_pick_test

## Key variables that hold data frame column names

In [None]:
keys = ['ILD','DPHI','GR','NPHI','CALI','COND','DELT','RHOB','PHIN','DT','ILM','SP','SFLU','IL','DEPTH','DEPH','MD']

In [None]:
keys2 = ['ILD','DPHI','GR','NPHI','CALI','RHOB']

Adding deritive features

In [None]:
all_col_names = list(df_w_dict['00/04-13-077-05W4/0'])
all_col_names

In [None]:
# DistFrom_NN1_TopDepth_Abs
features2original = ['CALI','DEPT','DPHI','GR','ILD','NPHI', 'SitID','CALIder','DPHIder','GRder','ILDder']
features2 = [
    #'DEPT',
 'DPHI',
 'NPHI',
 'GR',
 'ILD',
 'SitID',
#  'DistFrom_NN1_TopDepth_Abs',
 'DistFrom_NN1_TopDepth',
 'NN1_thickness',
 'new_pick_paleoz',
 'GR_mean_5winSize_diraround',
 'GR_max_5winSize_diraround',
 'GR_min_5winSize_diraround',
 'GR_min_5winSize_diraround_n3',
 'GR_mean_5winSize_dirabove',
 'GR_max_5winSize_dirabove',
 'GR_min_5winSize_dirabove',
 'GR_min_5winSize_dirabove_n3',
 'GR_mean_7winSize_diraround',
 'GR_max_7winSize_diraround',
 'GR_min_7winSize_diraround',
 'GR_min_7winSize_diraround_n3',
 'GR_mean_7winSize_dirabove',
 'GR_max_7winSize_dirabove',
 'GR_min_7winSize_dirabove',
 'GR_min_7winSize_dirabove_n3',
 'GR_mean_11winSize_diraround',
 'GR_max_11winSize_diraround',
 'GR_min_11winSize_diraround',
 'GR_min_11winSize_diraround_n3',
 'GR_mean_11winSize_dirabove',
 'GR_max_11winSize_dirabove',
 'GR_min_11winSize_dirabove',
 'GR_min_11winSize_dirabove_n3',
 'GR_mean_21winSize_diraround',
 'GR_max_21winSize_diraround',
 'GR_min_21winSize_diraround',
 'GR_min_21winSize_diraround_n3',
 'GR_mean_21winSize_dirabove',
 'GR_max_21winSize_dirabove',
 'GR_min_21winSize_dirabove',
 'GR_min_21winSize_dirabove_n3',
 'ILD_mean_5winSize_diraround',
 'ILD_max_5winSize_diraround',
 'ILD_min_5winSize_diraround',
 'ILD_min_5winSize_diraround_n3',
 'ILD_mean_5winSize_dirabove',
 'ILD_max_5winSize_dirabove',
 'ILD_min_5winSize_dirabove',
 'ILD_min_5winSize_dirabove_n3',
 'ILD_mean_7winSize_diraround',
 'ILD_max_7winSize_diraround',
 'ILD_min_7winSize_diraround',
 'ILD_min_7winSize_diraround_n3',
 'ILD_mean_7winSize_dirabove',
 'ILD_max_7winSize_dirabove',
 'ILD_min_7winSize_dirabove',
 'ILD_min_7winSize_dirabove_n3',
 'ILD_mean_11winSize_diraround',
 'ILD_max_11winSize_diraround',
 'ILD_min_11winSize_diraround',
 'ILD_min_11winSize_diraround_n3',
 'ILD_mean_11winSize_dirabove',
 'ILD_max_11winSize_dirabove',
 'ILD_min_11winSize_dirabove',
 'ILD_min_11winSize_dirabove_n3',
 'ILD_mean_21winSize_diraround',
 'ILD_max_21winSize_diraround',
 'ILD_min_21winSize_diraround',
 'ILD_min_21winSize_diraround_n3',
 'ILD_mean_21winSize_dirabove',
 'ILD_max_21winSize_dirabove',
 'ILD_min_21winSize_dirabove',
 'ILD_min_21winSize_dirabove_n3']
label = 'new_pick2'
train_X2 = data_df[features2]
train_y = data_df[label]

In [None]:
train_X2.shape

In [None]:
# from xgboost.sklearn import XGBRegressor

# model2 = XGBRegressor()
# model2.fit(train_X2, train_y)
# result2= model2.predict(train_X2)
# result2

In [None]:
from xgboost.sklearn import XGBClassifier

model2 = XGBClassifier()
model2.fit(train_X2, train_y)
result2= model2.predict(train_X2)
result2

In [None]:
well_data=data_df.copy()

In [None]:
well_data.shape

In [None]:
id_array = well_data['SitID'].unique()
id_array_permutation = np.random.permutation(id_array)
train_index = id_array_permutation[:int(len(id_array)*.8)]
test_index = id_array_permutation[int(len(id_array)*.8)+1:]
train_df = well_data.loc[well_data['SitID'].isin(train_index)]
test_df = well_data.loc[well_data['SitID'].isin(test_index)]

In [None]:
features_originalB = ['CALI','DEPT','DPHI','GR','ILD','NPHI']
features = [
    #'DEPT',
 'DPHI',
 'NPHI',
 'GR',
 'ILD',
 'SitID',
#  'DistFrom_NN1_TopDepth_Abs',
 'DistFrom_NN1_TopDepth',
 'NN1_thickness',
 'new_pick2_paleoz',
 'GR_mean_5winSize_diraround',
 'GR_max_5winSize_diraround',
 'GR_min_5winSize_diraround',
 'GR_min_5winSize_diraround_n3',
 'GR_mean_5winSize_dirabove',
 'GR_max_5winSize_dirabove',
 'GR_min_5winSize_dirabove',
 'GR_min_5winSize_dirabove_n3',
 'GR_mean_7winSize_diraround',
 'GR_max_7winSize_diraround',
 'GR_min_7winSize_diraround',
 'GR_min_7winSize_diraround_n3',
 'GR_mean_7winSize_dirabove',
 'GR_max_7winSize_dirabove',
 'GR_min_7winSize_dirabove',
 'GR_min_7winSize_dirabove_n3',
 'GR_mean_11winSize_diraround',
 'GR_max_11winSize_diraround',
 'GR_min_11winSize_diraround',
 'GR_min_11winSize_diraround_n3',
 'GR_mean_11winSize_dirabove',
 'GR_max_11winSize_dirabove',
 'GR_min_11winSize_dirabove',
 'GR_min_11winSize_dirabove_n3',
 'GR_mean_21winSize_diraround',
 'GR_max_21winSize_diraround',
 'GR_min_21winSize_diraround',
 'GR_min_21winSize_diraround_n3',
 'GR_mean_21winSize_dirabove',
 'GR_max_21winSize_dirabove',
 'GR_min_21winSize_dirabove',
 'GR_min_21winSize_dirabove_n3',
 'ILD_mean_5winSize_diraround',
 'ILD_max_5winSize_diraround',
 'ILD_min_5winSize_diraround',
 'ILD_min_5winSize_diraround_n3',
 'ILD_mean_5winSize_dirabove',
 'ILD_max_5winSize_dirabove',
 'ILD_min_5winSize_dirabove',
 'ILD_min_5winSize_dirabove_n3',
 'ILD_mean_7winSize_diraround',
 'ILD_max_7winSize_diraround',
 'ILD_min_7winSize_diraround',
 'ILD_min_7winSize_diraround_n3',
 'ILD_mean_7winSize_dirabove',
 'ILD_max_7winSize_dirabove',
 'ILD_min_7winSize_dirabove',
 'ILD_min_7winSize_dirabove_n3',
 'ILD_mean_11winSize_diraround',
 'ILD_max_11winSize_diraround',
 'ILD_min_11winSize_diraround',
 'ILD_min_11winSize_diraround_n3',
 'ILD_mean_11winSize_dirabove',
 'ILD_max_11winSize_dirabove',
 'ILD_min_11winSize_dirabove',
 'ILD_min_11winSize_dirabove_n3',
 'ILD_mean_21winSize_diraround',
 'ILD_max_21winSize_diraround',
 'ILD_min_21winSize_diraround',
 'ILD_min_21winSize_diraround_n3',
 'ILD_mean_21winSize_dirabove',
 'ILD_max_21winSize_dirabove',
 'ILD_min_21winSize_dirabove',
 'ILD_min_21winSize_dirabove_n3']

label = 'new_pick2'

In [None]:
seed = 123

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
from xgboost.sklearn import XGBClassifier
# from xgboost.sklearn import XGBRegressor
#params_final = (
#    gamma=0, 
#    alpha=0.2, 
#    maxdepth=3, 
#    subsample=0.8, 
#    colsamplebytree= 0.8, 
#    n_estimators= 100, 
#    learningrate= 0.1, 
#    minchildweight= 1
#)
train_X = train_df[features]
train_y = train_df[label]
test_X = test_df[features]
test_y = test_df[label]

In [None]:
model = XGBClassifier(
    gamma=0, 
    reg_alpha=0.2, 
    max_depth=3, 
    subsample=0.8, 
    colsample_bytree= 0.8, 
    n_estimators= 300, 
    learning_rate= 0.03, 
    min_child_weight= 3)
model.fit(train_X,train_y)
result = model.predict(test_X)
result

In [None]:
test_df_pred = test_df.copy()
test_df_pred['Pick_pred'] = result
test_df_pred.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# make predictions for test data
# y_pred = model.predict(X_test)
# predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(test_df_pred['new_pick2'], test_df_pred['Pick_pred'])

In [None]:
accuracy

In [None]:
test_df_pred_onlyTopMCM_class5 = test_df_pred.loc[test_df_pred['new_pick2'] == 5]

In [None]:
print(len(test_df_pred_onlyTopMCM_class5))
#test_df_pred_onlyTopMCM

In [None]:
accuracy = accuracy_score(test_df_pred_onlyTopMCM_class5['new_pick2'], test_df_pred_onlyTopMCM_class5['Pick_pred'])
accuracy

In [None]:
import pdvega
import vega

In [None]:
# 'MM_Top_Depth_predBy_NN1thick'
test_df_pred2_TopScratch2 = test_df_pred_onlyTopMCM_class5[['DistFrom_NN1_TopDepth']]
test_df_pred2_TopScratch2.vgplot.hist(bins=100, alpha=0.5)

In [None]:
test_df_pred_onlyTopMCM_class10 = test_df_pred.loc[test_df_pred['new_pick2'] == 10]

In [None]:
print(len(test_df_pred_onlyTopMCM_class10))

In [None]:
accuracy_class10 = accuracy_score(test_df_pred_onlyTopMCM_class10['new_pick2'], test_df_pred_onlyTopMCM_class10['Pick_pred'])
accuracy_class10

In [None]:
# 'MM_Top_Depth_predBy_NN1thick'
test_df_pred2_TopScratch2 = test_df_pred_onlyTopMCM_class10[['DistFrom_NN1_TopDepth']]
test_df_pred2_TopScratch2.vgplot.hist(bins=100, alpha=0.5)

In [None]:
plt.plot(test_df_pred_onlyTopMCM_class10['DEPT'],test_df_pred_onlyTopMCM_class10['Pick_pred'], 'ro')

In [None]:
plt.plot(test_df_pred_onlyTopMCM_class10['DEPT'],test_df_pred_onlyTopMCM_class10['DistFrom_NN1_TopDepth'], 'ro')

In [None]:
#c='McMurray_Base_DEPTH'
test_df_pred_onlyTopMCM_class5.vgplot(kind='scatter', x='DEPT', y='Pick',c='NN1_thickness')

In [None]:
#c='McMurray_Base_DEPTH'
test_df_pred_onlyTopMCM_class10.vgplot(kind='scatter', x='DEPT', y='Pick',c='NN1_thickness')

In [None]:
test_df_pred_onlyTopMCM_class5.vgplot(kind='scatter', x='DEPT', y='Pick_pred',c='NN1_thickness')

In [None]:
test_df_pred_onlyTopMCM_class10pred = test_df_pred.loc[test_df_pred['Pick_pred'] == 10]
len(test_df_pred_onlyTopMCM_class10pred)

In [None]:
test_df_pred[0:20]

In [None]:
len(test_df_pred.UWI.unique())

In [None]:
idx = test_df_pred.groupby(['SitID'])['Pick_pred'].transform(max) == test_df_pred['Pick_pred']
test_df_pred3=test_df_pred[idx]
        
        


In [None]:
len(test_df_pred3)

In [None]:
#c='McMurray_Base_DEPTH'
test_df_pred3.vgplot(kind='scatter', x='DEPT', y='Pick',c='NN1_thickness')