## This is the first notebook done after the hackathon
### I'm using a different approach to calculating features (more array and vector based and less row and loop based) which is speeding up the calculation. 
### The thought process is the same
#### - Justin

In [92]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import welly
import lasio
import glob
welly.__version__

'0.3.0'

In [91]:
%%timeit
import os
env = %env


10000 loops, best of 3: 93.1 µs per loop


In [9]:
from IPython.display import display

In [10]:
pd.set_option('display.max_rows', 2000)
## pd.set_option('display.height', 2000)

## Notes

In [2]:
## for finding similar geographic points
## scipy spatial kd tree - similar to quad tree ... CKD tree is a little faster as its the C version
## scipy.spatial
## spatial 
## asdfds
##  
## tree.query

## Read In Some Data

In [4]:

picks_dic = pd.read_csv('./SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
picks = pd.read_csv('./SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
wells = pd.read_csv('./SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
picks_new=picks[picks['HorID']==13000]
df_new=pd.merge(wells, picks_new, on='SitID')
df_new.head()

Unnamed: 0,SitID,UWI (AGS),UWI,HorID,Pick,Quality
0,102496,674010812000,00/12-08-067-01W4/0,13000,475,3
1,102497,674020807000,00/07-08-067-02W4/0,13000,515,3
2,102498,674021109000,00/09-11-067-02W4/0,13000,480,3
3,102500,674022910000,00/10-29-067-02W4/0,13000,549,3
4,102501,674023406000,00/06-34-067-02W4/0,13000,529,2


In [11]:
#### reading in an example well for testing
#  w_test_df = lasio.read(well_path+"00-01-04-075-23W4-0.LAS").df()
# w_test_df.df()


## Brainstorm 1 : various features to extract
1. Find average values of [each curve] in [different] length windows [above, around, below] a depth
2. Find average for [different number of] [max, min] values in [different] length windows [above, around, below] a depth for [each curve]
3. Find gradient within [window length] around a depth point 
4. Find [min, avg, max] gradient of various [smaller window length] within a larger [larger window length]
5. Find number of gradient changes (negative to positive) of various [smaller window length] within a larger [larger window length]
6. Find difference between two windows on either side of a given depth. For each window, average for [different number of] [max, min] values in [different] length windows above & below a given depth for [each curve].

In [46]:
####
def addColWindowMean(df,col,windowSize,centered):
    featureName = col+"_mean_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).mean() 
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).mean() 
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        df[featureName] = df[col].rolling(center=False,window=windowSize).mean() 
        #### unreverse
        df = df.sort_index(ascending=True)
    return df

In [47]:
####
def addColWindowMax(df,col,windowSize,centered):
    featureName = col+"_max_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).max() 
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).max() 
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        df[featureName] = df[col].rolling(center=False,window=windowSize).max() 
        #### unreverse
        df = df.sort_index(ascending=True)
    return df

In [48]:
#### Returns a column with the min values of a window centered 
def addColWindowMin(df,col,windowSize,centered):
    featureName = col+"_min_"+str(windowSize)+"winSize_"+"dir"+centered
    if(centered == "around"):
        df[featureName] = df[col].rolling(center=True,window=windowSize).min() 
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).min() 
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        df[featureName] = df[col].rolling(center=False,window=windowSize).min() 
        #### unreverse
        
        df = df.sort_index(ascending=True)
    return df

In [73]:
#### helper function that takes in array and an integer for the number of highest values to find the mean of 
#### example: for an array = [1,3,6,44,33,22,452] and nValues = 2, the answer would be 44+452 / 2
def nLargest(array,nValues):
    answer = np.mean(array[np.argsort(array)[-nValues:]])  
    return answer

In [80]:
#### Returns a column with the average of the N largest values of a window 
def addColWindowAvgMaxNvalues(df,col,windowSize,centered,nValues):
    #df[featureName] = df[col].rolling(center=True,window=windowSize).nlargest(nValues).mean() 
    #return df
    featureName = col+"_min_"+str(windowSize)+"winSize_"+"dir"+centered+"_n"+str(nValues)
    if(centered == "around"):
        #df[featureName] = df[col].rolling(center=True,window=windowSize).nlargest(nValues).mean() 
        df[featureName] = df[col].rolling(center=True,window=windowSize).apply(lambda x: nLargest(x,nValues))
    elif(centered == "above"):
        df[featureName] = df[col].rolling(center=False,window=windowSize).apply(lambda x: nLargest(x,nValues))
    elif(centered == "below"):
        #### reverse data frame
        #df = df.iloc[::-1]
        df = df.sort_index(ascending=False)
        #   # df['new_column'] = df.apply(lambda x: your_function(x['High'],x['Low'],x['Close']), axis=1)
        df[featureName] = df[col].rolling(center=False,window=windowSize).apply(lambda x: nLargest(x,nValues))
        #df[featureName] = df[col].rolling(center=False,window=windowSize).nlargest(nValues).mean() 
        #### unreverse
        df = df.sort_index(ascending=True)
    return df

In [70]:
####
winVars = {"RangeOfCurves":['GR'],
                   "RangeOfWindows":[5,11,29],
                   "RangeOfWindowsCaution":[5],
                   "RangeOfDirection":['above','below','around'],
                   "MinOrMaxRange":['min','max'],
                   "NumbPtsRange":[1,5]}

## Read in well logs from LAS files and put into Dictionary of Dataframes. As Reading-in, add features

In [97]:
def loadAndAddFeatures():
    count=0
    data_df=[]
    ### dictionary that holds every well as key:value or "UWI":df pair
    df_w_dict ={}
    for file in glob.glob('./SPE_006_originalData/OilSandsDB/Logs/*.LAS'):
        #### NOTE: limiting wells being read-in to 101 here !!!!!!!!!!!!!!!!
        if count >101:
            break
        count+=1  
        l_df = lasio.read(file).df()
        str_uwi= file[-23:-4].replace("-", "/",1)[:17]+file[-6:-4].replace("-", "/",1)
        l_df = l_df.reset_index()
        l_df['UWI'] = str_uwi
        l_df['SitID']=df_new[df_new['UWI']==str_uwi]['SitID'].iloc[0]
        l_df['UWI (AGS)']=df_new[df_new['UWI']==str_uwi]['UWI (AGS)'].iloc[0]
        l_df['Pick']=df_new[df_new['UWI']==str_uwi]['Pick'].iloc[0] 
        l_df['HorID']=df_new[df_new['UWI']==str_uwi]['HorID'].iloc[0]
        l_df['Quality']=df_new[df_new['UWI']==str_uwi]['Quality'].iloc[0]
        print(l_df.Pick.unique()[0])  
        try:
            #print("count = ",count)
            float(l_df.Pick.unique()[0])
            l_df.Pick = l_df.Pick.astype(float)
            l_df.DEPT = l_df.DEPT.astype(float)
            l_df['new_pick']=l_df['Pick']-l_df['DEPT']
            l_df['new_pick2']=l_df['new_pick'].apply(lambda x: 1 if(x==0) else 0)
            #print("got to above count >= 2")
            #if count == 1:
             #   data_df=l_df
            if count >= 2:
                #print("got inside count >= 2")
                #### instead of concat into a single dataframe, run functions & then add to dictionary   
                ##### run functions to create features on array basis for each well in separate dataframe
                ##### this makes certain things easier, compared to everything in a single dataframe, like making sure you don't accidentally grab data from next well up
                ##### and will make it easier to write data back to LAS if we run into memory limitations later
                curves = ['ILD','GR']
                windows = [5,7,11,21]
                directions = ["around","below","above"]
                comboArg_A = [curves,windows,directions]
                all_comboArgs_A = list(itertools.product(*comboArg_A))
                for eachArgList in all_comboArgs_A:
                    l_df_new = addColWindowMean(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                    l_df_new = addColWindowMax(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                    l_df_new = addColWindowMin(l_df,eachArgList[0],eachArgList[1],eachArgList[2])
                    l_df_new = addColWindowAvgMaxNvalues(l_df,eachArgList[0],eachArgList[1],eachArgList[2],3)
                #print("type(l_df) = ",type(l_df_new))
                #print("l_df['avg_GR_windowCenter5'][0] = ",l_df_new['avg_GR_windowCenter5'][0])
                #### add resultant dataframe to dictionary
                df_w_dict[l_df_new['UWI'][0]]= l_df_new
        except ValueError as e:
            print("e = ",e)
            print ('Error picking')
            #continue;

In [98]:
%prun loadAndAddFeatures()

607
        
e =  could not convert string to float: 
Error picking
243
631
253
411
441
535
429
454
263
602
220.5
435
441
421.5
327.96
        
e =  could not convert string to float: 
Error picking
416
        
e =  could not convert string to float: 
Error picking
591
456
808.02
        
e =  could not convert string to float: 
Error picking
806.5
596
519
568
362.5
        
e =  could not convert string to float: 
Error picking
309.5
477
593.5
        
e =  could not convert string to float: 
Error picking
372
480
580
368.5
474.5
493
455
450
329
588.87
220.5
        
e =  could not convert string to float: 
Error picking
327
475
283
422
471
642
531
        
e =  could not convert string to float: 
Error picking
473
596.5
149.96
355.5
323.5
294
425
541
180.5
396.24
465
474
410
231.5
556.5
        
e =  could not convert string to float: 
Error picking
        
e =  could not convert string to float: 
Error picking
449
636
487
468
365.46
        
e =  could not convert string to float:

### NOTE: not all well logs were read-in successfully, need to go back and find out why

In [99]:
df_w_dict['00/01-03-085-15W4/0']

Unnamed: 0,DEPT,DPHI,NPHI,GR,ILD,UWI,SitID,UWI (AGS),Pick,HorID,...,GR_min_11winSize_dirabove,GR_min_11winSize_dirabove_n3,GR_mean_21winSize_diraround,GR_max_21winSize_diraround,GR_min_21winSize_diraround,GR_min_21winSize_diraround_n3,GR_mean_21winSize_dirabove,GR_max_21winSize_dirabove,GR_min_21winSize_dirabove,GR_min_21winSize_dirabove_n3
0,97.000,0.326,0.412,91.436,16.597,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
1,97.300,0.321,0.384,103.410,15.074,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
2,97.600,0.289,0.374,116.013,13.957,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
3,97.900,0.258,0.398,116.635,13.174,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
4,98.200,0.274,0.407,83.204,14.785,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
5,98.500,0.303,0.416,69.322,16.916,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
6,98.800,0.318,0.416,61.116,18.624,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
7,99.100,0.326,0.387,54.800,18.624,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
8,99.400,0.322,0.350,69.295,16.915,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,
9,99.700,0.292,0.351,91.358,12.432,00/01-03-085-15W4/0,115476,0854150301000,243.0,13000,...,,,,,,,,,,


In [100]:
## testing one dataframe of one well in dictionary of all that were successfully read in
df_w_dict['00/01-03-085-15W4/0'].shape

(518, 77)

## Key variables that hold data frame column names

In [9]:
keys = ['ILD','DPHI','GR','NPHI','CALI','COND','DELT','RHOB','PHIN','DT','ILM','SP','SFLU','IL','DEPTH','DEPH','MD']

In [10]:
keys2 = ['ILD','DPHI','GR','NPHI','CALI','RHOB']

In [12]:
type(w_test_df)

pandas.core.frame.DataFrame