# Feature Creation notebook
### Goal is to start with dict of dataframes of wells and a few other pieces and create a single dataframe with all the necessary features for all used wells
#### This work is similar to what has been done before but data loading & feature creation is separate and dask is used to speed feature creation
##### by Justin Gosses 2018-07-07

#### Inputs used during this notebook are:
    1. Dict of dataframes of used well created by notebooks in `/loadLAS` directory
    2. A dataframe of nearest neighbor information from a notebook found in the `WellsKNN/` directory
    3. picks_dic a data dictionary for the pick list below = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
    4. pick list = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
    5. well list = pd.read_csv('../../SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
    6. latitude and longitude for each well = pd.read_csv('../../well_lat_lng.csv')

In [3]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import welly
from welly import Well
import lasio
import glob
from sklearn import neighbors
import pickle
import math
import dask
import dask.dataframe as dd
from dask.distributed import Client
# import pdvega
# import vega
import dask.dataframe as dd
from dask.distributed import Client
welly.__version__

'0.3.5'

In [4]:
print(dask.__version__)
print(pd.__version__)

0.18.1
0.23.1


In [5]:
%%timeit
import os
env = %env


76.7 µs ± 1.78 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [6]:
from IPython.display import display

In [7]:
#### Test results Part 1
#### Had to change display options to get this to print in full!
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.max_colwidth = 100000

In [8]:
knn_dir = "../WellsKNN/"
load_dir = "../loadLAS"

## If you open this notebook fresh and jump to a point below where a pick file is read in, you still need to load everything above! 

-------------

-------

--------------------

## We're going to load a pickle file of a previously created dataframe

### That dataframe merges:
1. picks_dic = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
2. picks = pd.read_csv('../../SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
3. wells = pd.read_csv('../../SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
4. gis = pd.read_csv('../../well_lat_lng.csv')

### It also excludes any wells that have nulls or zeros for Top McMurray or Base McMurray picks
This was done in notebooks: 
1. notebooks_2018/mapmaking/Map_Exploration_v2-KDtree.ipynb
2. notebooks_2018/Test_RUN_2018_02/DataCleaningPrepof_KNN_neighborPickDepth_df_creation_vA_20180210

In [None]:
wells_df_new_cleaned_plus_nn_wNoNulls =  pd.read_pickle(knn_dir+'/'+'wells_df__NB_KDtreePost062018_vA__NoMcTopLeak_v2.p')

In [None]:
wells_df_new_cleaned_plus_nn_wNoNulls.info()

In [None]:
wells_df_new_cleaned_plus_nn_wNoNulls.head()

In [None]:
print(len(wells_df_new_cleaned_plus_nn_wNoNulls))

In [None]:
wells_df_new_cleaned_plus_nn_wNoNulls.head()

### This renames the columns of the dataframe above to match previous versions of the feature creation code

In [None]:
# `df_new` starts out as an alias (not a copy) of the KNN dataframe, so the short-named
# columns added below are ALSO added to `wells_df_new_cleaned_plus_nn_wNoNulls`.
# Later cells rely on that: the merge with the KNN dataframe is what carries
# 'Pick', 'Pick_paleoz', etc. into the all-wells dataframe.
df_new = wells_df_new_cleaned_plus_nn_wNoNulls
# (short name used by the feature creation code, source column in the KNN dataframe)
short_and_source_columns = [
    ("HorID", "McMurray_Top_HorID"),
    ("Pick", "McMurray_Top_DEPTH"),
    ("Quality", "McMurray_Top_Qual"),
    ("HorID_paleoz", "McMurray_Base_HorID"),
    ("Pick_paleoz", "McMurray_Base_DEPTH"),
    ("Quality_paleoz", "McMurray_Base_Qual"),
]
for short_name, source_column in short_and_source_columns:
    df_new[short_name] = df_new[source_column]
# From here on `df_new` is a narrower dataframe holding only the columns listed below
df_new = df_new[["SitID","UWI","HorID","Pick","Quality","HorID_paleoz","Pick_paleoz","Quality_paleoz",'lat','lng','MM_Top_Depth_predBy_NN1thick','NN1_thickness']]
df_new

In [None]:
df_new.info()

In [None]:
unique_quality_str = df_new.Quality.unique()
print(unique_quality_str)

In [None]:
print(len(df_new))

In [None]:
#### Number of unique wells based on UWI
len(df_new.UWI.unique())

In [None]:
df_new_test = df_new[['UWI']]

In [None]:
print(any(df_new_test.UWI == '00/11-04-067-03W4/0'))

---------------

## We're now going to load in all the las files but exclude any that aren't in the dataframe shown above. Additionally, we'll ignore any placed in the `excluded_problem_wells` folder

In [None]:
def loadAndNoFeatures():
    """
    Reads the LAS files in the hard-coded OilSandsDB Logs directory and keeps the usable wells.

    A well is kept when the UWI derived from its file name is present in the notebook-level
    dataframe `df_new` AND the first matching row of `df_new` has Quality > -1.
    Each kept well is stored as a dataframe with its index reset into a column and a 'UWI' column added.
    At most `count_limit` files are looked at.

    Returns a two-item list:
        [dict of {UWI string: well dataframe}, list of LAS file paths that could not be read]
    """
    count_limit = 2500
    list_of_failed_wells = []
    ### dictionary that holds every well as key:value or "UWI":df pair
    df_w_dict = {}
    # Sorted so the load order (and which files fall under count_limit) is the same on every run.
    las_files = sorted(glob.glob('../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS'))
    # One pass over the files. Wrapping this loop in `while count < count_limit` re-read every
    # file when fewer than count_limit existed and never finished when no file was found.
    for count, file in enumerate(las_files, start=1):
        if count > count_limit:
            print("hit limit of count below file for loop")
            break
        try:
            l_df = lasio.read(file).df()
        except Exception as read_error:
            # Record the unreadable file for the caller instead of aborting the whole load.
            print("could not read LAS file", file, read_error)
            list_of_failed_wells.append(file)
            continue
        # Turns a file name such as 00-11-04-067-03W4-0.LAS into the UWI 00/11-04-067-03W4/0
        str_uwi= file[-23:-4].replace("-", "/",1)[:17]+file[-6:-4].replace("-", "/",1)
        quality_for_uwi = df_new.loc[df_new['UWI'] == str_uwi, 'Quality']
        if quality_for_uwi.empty:
            # this well is not in the pick / nearest neighbor dataframe
            continue
        if quality_for_uwi.iloc[0] > -1:
            l_df = l_df.reset_index()
            print("got to UWI apppend")
            l_df['UWI'] = str_uwi
            # keyed by the UWI string directly so a log with zero rows cannot raise a KeyError
            df_w_dict[str_uwi] = l_df
        else:
            print("UWI matched but pick Quality is not > -1, skipping well")
    return [df_w_dict, list_of_failed_wells]

In [None]:
initial_well_dict = loadAndNoFeatures()

In [None]:
#answer=[df_w_dict,list_of_failed_wells]
dict_of_well_df = initial_well_dict[0]
list_of_failed_wells = initial_well_dict[1]

In [None]:
print("list_of_failed_wells",list_of_failed_wells)

In [None]:
print("len = ", len(dict_of_well_df))

In [None]:
print("check for well 00/11-04-067-03W4/0 = ",dict_of_well_df['00/11-04-067-03W4/0'])

In [None]:
print(type(dict_of_well_df))

In [None]:
print(type(dict_of_well_df['00/11-04-067-03W4/0']))

### We now have a dict of Pandas dataframes, where each dataframe is a well, that we will write to a pickle file

In [None]:
#### dumping dict of data frame to pickle file
dict_wells_df_and_Nofeatures_20180707 = dict_of_well_df
# `with` closes the file handle as soon as the dump finishes, so the pickle is
# completely written to disk before the next cell reads it back.
with open("dict_of__wells_df_No_features_class3_20180707.p", "wb") as pickle_file:
    pickle.dump(dict_wells_df_and_Nofeatures_20180707, pickle_file)

-----------------------------

## Next step will be to take this dict of dataframes and turn it into a single dataframe or perhaps a dask data frame. Then cycle or cast to add in columns for the information on nearest neighbors from the nearest neighbors dataframe based on a column for UWI. 

1. Create dataframe from dict 
2. Add material from KNN dataframe to this dataframe based on UWI
3. Go back to original dict of well dataframes and see if a dask dataframe can be created and then add KNN dataframe. 
4. Test steps (1,2) vs. (3) for speed

Open pickle of dict of well dataframes

In [None]:
dictOfWellDf =  pd.read_pickle('dict_of__wells_df_No_features_class3_20180707.p')

In [None]:
print(type(dictOfWellDf))

In [None]:
def turnDictOfWellDfs_to_SingleDfOfAllWells(dictOfWellDf):
    """
    Stacks every per-well dataframe held in a dict into one dataframe.

    dictOfWellDf = dict whose values are the per-well dataframes (the keys are not used)
    Returns one dataframe built by concatenating the values row-wise, in the dict's
    iteration order. Each well keeps its own index values.
    """
    per_well_frames = [well_df for well_df in dictOfWellDf.values()]
    return pd.concat(per_well_frames)

In [None]:
df_all_wells_basic = turnDictOfWellDfs_to_SingleDfOfAllWells(dictOfWellDf)
print(type(df_all_wells_basic))

In [None]:
df_all_wells_basic.head()

In [None]:
df_all_wells_basic.info()

In [None]:
df_all_wells_basic.astype(bool).sum(axis=0)

In [None]:
df_all_wells_basic.isnull().sum()

should probably at some point find the wells with missing major values like GR and either take them out or find out if there is a naming change like GR2 and replace the names so those wells can be used.

In [None]:
wells_df_new_cleaned_plus_nn_wNoNulls.head()

In [None]:
def combine_DfOfAllWells_with_knnDf(df_all_wells_basic,knn_df):
    """
    Joins the nearest neighbor information onto the log rows of every well.

    df_all_wells_basic = dataframe of all wells with only basic info
    knn_df = dataframe with the knn neighbor data
    Returns a single dataframe that merges the two on the 'UWI' column. The merge uses the
    default (inner) join, so rows whose UWI is missing from either input are dropped.
    """
    return df_all_wells_basic.merge(knn_df, on='UWI')

In [None]:
%%time 
df_all_wells_wKNN = combine_DfOfAllWells_with_knnDf(df_all_wells_basic,wells_df_new_cleaned_plus_nn_wNoNulls)

In [None]:
df_all_wells_wKNN.head()

In [None]:
len(df_all_wells_wKNN)

In [None]:
len(df_all_wells_wKNN.UWI.unique())

The dataframe of nearest neighbor information had 1926 rows; this now has 1920 unique UWIs.
Did some of the wells not make it through the import, or were they kicked out in later steps?

Same thing as above but for Dask data frames

In [None]:
def turnDictOfWellDfs_to_SingleDfOfAllWells(dictOfDF, npartitions=50):
    """
    Dask version: takes in a dict of dataframes, where each dataframe is for a well created by LASIO
    and returns a single Dask dataframe of all wells.

    dictOfDF = dict whose values are the per-well pandas dataframes (the keys are not used)
    npartitions = number of partitions handed to dd.from_pandas (default 50)

    NOTE: this definition reuses the name of the pandas function defined earlier in the
    notebook, so it replaces that function once this cell has been run.
    """
    # ignore_index gives the stacked frame one increasing index. dd.from_pandas sorts by
    # index by default, and the per-well frames each carry their own index values, so
    # keeping those would let rows of different wells be interleaved.
    all_wells_pandas = pd.concat(list(dictOfDF.values()), ignore_index=True)
    dask_df_all_wells_basic = dd.from_pandas(all_wells_pandas, npartitions=npartitions)
    return dask_df_all_wells_basic

In [None]:
def combine_DfOfAllWells_with_knnDf(dask_df_all_wells_basic,knn_df):
    """
    Takes in 2 arguments, a dataframe of all wells with only basic info
    & the dataframe with info on knn neighbor data
    and returns a single dataframe that merges the two input dataframes based on UWI column

    The merge goes through the `.merge` method of the first argument, so the result is the
    same kind of dataframe as `dask_df_all_wells_basic` (a pandas dataframe works here as well).

    NOTE: this definition reuses the name of the pandas function defined earlier in the
    notebook, so it replaces that function once this cell has been run.
    """
    dask_df_all_wells_wKNN = dask_df_all_wells_basic.merge(knn_df, on='UWI')
    return dask_df_all_wells_wKNN

--------------------

## After joining on the nearest neighbor dataframe, we can cast the original columns to floats instead of strings which some but not necessarily all might be. 
When we do this, be careful about variation in depth column name and rename DEPTH and DEPT to DEPTH

In [None]:
columns = list(df_all_wells_wKNN.columns.values)

In [None]:
columns

In [None]:
# List for turning everything except UWI, SiteID, and Neighbors obj into a float for easier working with later
columns_to_turn_to_floats = ['CALI',
 'COND',
 'DELT',
 'DENS',
 'DEPT',
 'DEPTH',
 'DPHI',
 'DPHI:1',
 'DPHI:2',
 'DT',
 'GR',
 'GR:1',
 'GR:2',
 'IL',
 'ILD',
 'ILD:1',
 'ILD:2',
 'ILM',
 'LITH',
 'LLD',
 'LLS',
 'NPHI',
 'PHID',
 'PHIN',
 'RESD',
 'RHOB',
 'RT',
 'SFL',
 'SFLU',
 'SN',
 'SNP',
 'SP',
 'McMurray_Base_HorID',
 'McMurray_Top_HorID',
 'McMurray_Base_DEPTH',
 'McMurray_Top_DEPTH',
 'McMurray_Base_Qual',
 'McMurray_Top_Qual',
 'lat',
 'lng',
 'NN1_McMurray_Top_DEPTH',
 'NN1_McMurray_Base_DEPTH',
 'NN1_thickness',
 'MM_Top_Depth_predBy_NN1thick',
 'HorID',
 'Pick',
 'Quality',
 'HorID_paleoz',
 'Pick_paleoz',
 'Quality_paleoz']

In [None]:
%%time
df_all_wells_wKNN[columns_to_turn_to_floats].astype(float)

In [None]:
df_all_wells_wKNN

In [None]:
df_all_wells_wKNN.info()

-----------------

## Now we're going to find some depths! Most of the depths in the wells are from a column called DEPT but there are a handful of wells that use a column called DEPTH. For convenience's sake, we're going to copy the DEPTH values into the DEPT column wherever DEPT is NaN so all depths are in the same column. We're also going to try to replace NaNs in GR with GR:1 and GR:2 where data exists.

In [None]:
def useDiffColNamesToFillInNA(dataframeOfWells,colReplaceList):
    """
    Takes in two arguments,
    Argument one is a dataframe of multiple wells
    Argument two is a list of lists. Where each sub-list is a pair of column names.
    The right col is used to fill in NANs where they exist in left column.
    Pairs are applied in list order, so a later pair only fills what earlier pairs left as NaN.
    The function returns the same dataframe object (it is modified, not copied) with the NANs
    in the left columns replaced based on input arguments.
    Example = [[ColA,ColB],[ColF,ColG],[ColZ,ColE]]
    """
    for each in colReplaceList:
        print("each",each)
        colToFill, colWithFallback = each[0], each[1]
        # Assign the filled column back explicitly. `df[col].fillna(..., inplace=True)` works on
        # whatever object `df[col]` hands back, which is not guaranteed to write through to the
        # dataframe (it does not under pandas Copy-on-Write).
        dataframeOfWells[colToFill] = dataframeOfWells[colToFill].fillna(dataframeOfWells[colWithFallback])
    return dataframeOfWells

In [None]:
### list of sub-lists. Items in the left column are replaced with values from the right column if the left column has a NaN
colReplaceList = [['DEPT','DEPTH'],['GR','GR:1'],['GR','GR:2']]

In [None]:
#### Create new dataframe
df_all_wells_wKNN_DEPTHtoDEPT = useDiffColNamesToFillInNA(df_all_wells_wKNN,colReplaceList)

In [None]:
#### Look at DEPT to make sure it has gone up, it has!
df_all_wells_wKNN_DEPTHtoDEPT.info()

---------------------

## Create columns for how close a row is (based on depth) from the official pick for that well. 
### We'll be doing this for Top and Base McMurray.

In [None]:
#### for top McMurray
df_all_wells_wKNN_DEPTHtoDEPT['diff_TMcM_Pick_v_DEPT'] = df_all_wells_wKNN_DEPTHtoDEPT['Pick'] - df_all_wells_wKNN_DEPTHtoDEPT['DEPT']
#### for base McMurray or Top Paleozoic
df_all_wells_wKNN_DEPTHtoDEPT['diff_TPal_Pick_v_DEPT'] = df_all_wells_wKNN_DEPTHtoDEPT['Pick_paleoz'] - df_all_wells_wKNN_DEPTHtoDEPT['DEPT']


In [None]:
#### print a few wells to double check
df_all_wells_wKNN_DEPTHtoDEPT[0:1000]

## IT SHOULD BE NOTED THAT THE 'correct' PICK DEPTHS IN MANY CASES DO NOT PERFECTLY MATCH THE DEPTHS AVAILABLE IN THE LOGS. 
### In other words, the pick might be 105 but there is no row with 105.00 depth, only a 104.98 and a 105.02!
### This matters for what you count as a correct label!

### Create column for whether a row (based on depth) is within 0.0, +- 5, or >5 from the official pick.

In [None]:
#### Score how close a row's depth is to the 'real' pick:
#### 100 = exactly on the pick, 95 = within 0.5, 60 = within 5, 0 = anything else (a NaN difference also scores 0)
def _pickProximityScore(depthDiff):
    if depthDiff == 0:
        return 100
    if -0.5 < depthDiff < 0.5:
        return 95
    if -5 < depthDiff < 5:
        return 60
    return 0
#### Top McMurray first
df_all_wells_wKNN_DEPTHtoDEPT['cat_isTopMcMrNearby_known'] = df_all_wells_wKNN_DEPTHtoDEPT['diff_TMcM_Pick_v_DEPT'].apply(_pickProximityScore)
#### then top Paleozoic, which is basically base McMurray
df_all_wells_wKNN_DEPTHtoDEPT['cat_isTopPalNearby_known'] = df_all_wells_wKNN_DEPTHtoDEPT['diff_TPal_Pick_v_DEPT'].apply(_pickProximityScore)


In [None]:
#### drop previously created diff_TMcM_Pick_v_DEPT
#df_all_wells_wKNN_DEPTHtoDEPT.drop(columns=['diff_Pick_v_DEPT'])

In [None]:
#### print a few wells to double check
df_all_wells_wKNN_DEPTHtoDEPT.tail()

------------------------------

## Use thickness from neighbor and base to predict top just with that, add as feature

In [None]:
# l_df['new_pick']=l_df['Pick']-l_df['DEPT']

In [None]:
# df_all_wells_wKNN['diff_Pick_v_DEPT'] = df_all_wells_wKNN['Pick'] - df_all_wells_wKNN['DEPT']

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT['MM_Top_Depth_predBy_NN1thick'][0:1]

In [None]:
#### Takes MM_Top_Depth_predBy_NN1thick and subtracts depth at that point, returns *absolute* value
def NN1_TopMcMDepth_Abs(df,MM_Top_Depth_predBy_NN1thick):
    """
    Adds the column 'DistFrom_NN1_TopDepth_Abs' to `df`.

    df = dataframe that has a 'DEPT' column and the column named by the second argument
    MM_Top_Depth_predBy_NN1thick = NAME of the column that holds the predicted top depth
    The new column is the absolute difference between that column and 'DEPT' on every row.
    Returns the same dataframe object, which is modified in place.
    """
    predictedMinusDepth = df[MM_Top_Depth_predBy_NN1thick] - df['DEPT']
    df['DistFrom_NN1_TopDepth_Abs'] = predictedMinusDepth.abs()
    return df

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM = NN1_TopMcMDepth_Abs(df_all_wells_wKNN_DEPTHtoDEPT,'MM_Top_Depth_predBy_NN1thick')

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM 

----------------

## Finally, we'll create a variety of calculated features based on well log numbers at, above, below, and around each depth point.

#### The difficult thing about creating features based on windows within a well when you have multiple wells stacked in a dataframe is that sometimes that window from one well goes into the next well.

#### To get around that, we're going to create a column that says the distance from the top of the well and another column that says the distance from the bottom of the well. When a row's distance from top or bottom is greater than 1/2 the max window size, we'll just proceed as normal. When the distance between that row's depth and top or bottom is less than 1/2 the max window size, we'll fall back to that row's own curve value instead of a windowed statistic.

In [None]:
#
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['NewWell'] = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI'].shift(1) != df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI']
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['LastBitWell'] = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI'].shift(-1) != df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['UWI']


In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM[0:1000]

In [None]:
# df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['TopOfWell'] = np.where(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM[NewWell] == True,,


# df['elderly'] = np.where(df['age']>=50, 'yes', 'no')

In [None]:
TopOfWellRowsOnly = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.loc[df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['NewWell'] == True]
BottomOfWellRowsOnly = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.loc[df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['LastBitWell'] == True]

In [None]:
#rename depth to top and bottom depths , delete all other columns
# `rename` on the two-column selection builds a fresh dataframe, which avoids adding and
# dropping columns in place on a slice of the big dataframe (SettingWithCopyWarning).
TopOfWellRowsOnly = TopOfWellRowsOnly[['UWI','DEPT']].rename(columns={'DEPT': 'TopWellDept'})
#### same thing for bottom
BottomOfWellRowsOnly = BottomOfWellRowsOnly[['UWI','DEPT']].rename(columns={'DEPT': 'BotWellDept'})
#### merge these two small dataframes
TopAndBottomOfWellRowsOnly = pd.merge(TopOfWellRowsOnly, BottomOfWellRowsOnly, on='UWI')
#### merge with larger dataframe
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM = pd.merge(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM, TopAndBottomOfWellRowsOnly, on='UWI')

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM.head()

In [None]:
#### Create a col for distance from row to top of well
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromTopWell'] = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['DEPT'] - df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['TopWellDept']

#### Create a col for distance from row to bottom of well
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromBotWell'] = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['BotWellDept'] - df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['DEPT']

#### Create col for well total thickness measured
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['WellThickness'] = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['BotWellDept'] - df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['TopWellDept']


#### This adds a column that says whether a row is closer to the bottom or the top of the well
#### This is useful when creating rolling-window features where you want to avoid going into another well stacked above.

In [None]:
#### This adds a column that says whether a row is closer to the bottom or the top of the well
#### This is useful when creating rolling-window features where you want to avoid going into another well stacked above.
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['closerToBotOrTop'] = np.where(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromTopWell']<=df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromBotWell'], 'FromTopWell', 'FromBotWell')

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['closTopBotDist'] = np.where(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromTopWell']<=df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromBotWell'], df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromTopWell'], df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['FromBotWell'])

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['rowsToEdge'] = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['closTopBotDist']/0.25
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['rowsToEdge'] = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM['rowsToEdge'].astype(int)

----------------

## Writing dataframe to pickle file before doing main feature creation step that uses curves

In [None]:
#### dumping the dataframe to pickle file
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724 = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM
# `with` closes the file handle as soon as the dump finishes, so the pickle is
# completely written to disk before the next cell reads it back.
with open("df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724.p", "wb") as pickle_file:
    pickle.dump(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724, pickle_file)

## Reading in the same dataframe from pickle file

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_NearTop =  pd.read_pickle('df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724.p')

---------------------

## The following is a rewrite of the basic features calculated from the curves
### It runs faster than the previous Pandas version and could be made faster still, specifically by using `apply` less
### It also calculates things in a window around a point, above a point, but not below a point. I have to go back and re-write that code. Sorting is expensive task in Dask, so I don't want to reverse order twice for each feature like I did previously in Pandas. 

-------------------

#### The next two lines bring up a Dask client dashboard that will open as a new tab. It provides great insight into what functions are being run by dask, how they run, and which ones are slowest.

In [None]:
client = Client()

In [None]:
client

The next bit is only creating features based on curve data within a given well, so we'll read and write pickle files at the start and end of this section.

In [None]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_NearTop =  pd.read_pickle('df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724.p')

In [None]:
%time
test_5 = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_NearTop.copy()
test_5 = dd.from_pandas(test_5, npartitions=50)

### In the line below, we pick the curves and windows to run

In [None]:
curves = ['GR','ILD']
windows = [5,7,11,21]

### The function nLargest is used via `apply`, I should probably re-write this to use Dask's Nlargest API but didn't here as the docs imply it might behave slightly differently.
### A quick look at the status dashboard in the Dask Client suggests the use of apply takes up maybe 1/4-1/2 of total compute time currently!

In [None]:
def nLargest(array,nValues):
    """
    Returns the mean of the `nValues` largest entries of `array`.

    array = numpy array of values (the rolling windows hand one in per row)
    nValues = how many of the largest values to average; when the array holds fewer
              entries than that, all of them are averaged
    """
    ascendingPositions = np.argsort(array)
    largestValues = array[ascendingPositions[-nValues:]]
    return np.mean(largestValues)

In [None]:
def thoughts_seperateRollingAndConditionalIntoTwoDaskProcesses(dd,curves,windows):
    """
    Adds rolling-window features for every combination of curve and window size.

    Arguments:
    dd = dataframe (a Dask dataframe in this notebook) holding the curve columns and a
         'closTopBotDist' column. NOTE: inside this function the parameter name hides the
         module-level `dd` alias for dask.dataframe.
    curves = list of curve column names, for example ['GR','ILD']
    windows = list of rolling window sizes, for example [5,7,11,21]

    For each combination eight columns are added: Min, Max, Mean and nLarge (mean of the
    5 largest values in the window), each for two directions:
        "Around" = window centered on the row
        "Above"  = window that ends on the row
    "Below" is not implemented: sort_index is expensive in dask, maybe use shift.

    Rows too close to the edge of their well keep the single row value of the original
    curve instead of the windowed value, so a window never mixes in a neighboring well:
        Around features need closTopBotDist > half the window size
        Above features need closTopBotDist > the window size
    NOTE(review): 'closTopBotDist' looks like a depth distance while the window sizes count
    rows; the notebook also builds a 'rowsToEdge' column - confirm which one the guard should use.

    Every column name starts with <curve>_min_<window>winSize_ whatever the statistic is;
    that prefix is kept so existing column names do not change.

    Returns the same dataframe object with the feature columns added.
    """
    nValues = 5
    rollingStats = (
        ('Min', lambda rolled: rolled.min()),
        ('Max', lambda rolled: rolled.max()),
        ('Mean', lambda rolled: rolled.mean()),
    )
    for col, windowSize in itertools.product(curves, windows):
        featureName = col+"_min_"+str(windowSize)+"winSize_"
        half_window = int(windowSize/2)
        # (direction label, centered window?, rows far enough from the well edge to trust the window)
        around = ('Around', True, dd['closTopBotDist'] > half_window)
        above = ('Above', False, dd['closTopBotDist'] > windowSize)

        #### MIN, MAX, MEAN
        for statName, computeStat in rollingStats:
            for direction, centered, farFromEdge in (around, above):
                windowed = computeStat(dd[col].rolling(windowSize,center=centered))
                dd[featureName+'dir'+direction+statName] = windowed.where(cond=farFromEdge, other=dd[col])

        ## nLargest
        # Above is added before Around to keep the established column order. The centered
        # version uses the half-window guard like every other Around feature (it used the
        # full window size, which is the guard meant for the Above direction).
        for direction, centered, farFromEdge in (above, around):
            windowed = dd[col].rolling(windowSize,center=centered).apply(lambda x: nLargest(x,nValues),raw=True)
            dd[featureName+'dir'+direction+'nLarge'] = windowed.where(cond=farFromEdge, other=dd[col])

    return dd

In [None]:
%time
ddf_test5 = thoughts_seperateRollingAndConditionalIntoTwoDaskProcesses(test_5,curves,windows)

In [None]:
%time
test5result = ddf_test5.compute()


In [None]:
%time
test5result

In [None]:
type(test5result)

In [None]:
len(test5result.columns)

--------------------

## Writing pandas dataframe to pickle

In [None]:
#### dumping the dataframe (now including the curve features) to pickle file
# NOTE(review): this writes to the same "..._20180724.p" file name that the pre-feature
# dataframe was dumped to and that the curve-feature section reads back at its start,
# so that earlier pickle is replaced by this one. Confirm that is intended.
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM__NearTop_CurveF_20180726 = test5result
# `with` closes the file handle as soon as the dump finishes, so the pickle is
# completely written to disk before the next cell reads it back.
with open("df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724.p", "wb") as pickle_file:
    pickle.dump(df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM__NearTop_CurveF_20180726, pickle_file)

In [9]:
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_NearTop_2 =  pd.read_pickle("df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724.p")

## Write pandas dataframe to hdf5 
#### Dropping [Neighbors_Obj] col as it is object and can't be written to HDF5

In [18]:
# Write hdf5 to current directory
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_NearTop_3 = df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_NearTop_2.drop(['Neighbors_Obj'], axis=1)
filename = "df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724"
ending = ".h5"
groupkey = "a"
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_NearTop_3.to_hdf(filename+ending, key='/' + groupkey, format='table')

-----------------------

## Explore adding features based on map position using widget to draw polygons which are then one-hot encoded?

## Any other features?

In [16]:
!ls

createFeat_20180707_vA-Copy1.ipynb
createFeat_20180707_vA.ipynb
createFeat_20180707_vB.ipynb
createFeat_20180707_vC.ipynb
createFeat_20180725_vD.ipynb
[34mdask-worker-space[m[m
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724
df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM_20180724.p
dict_of__wells_df_No_features_class3_20180707.p
