In [1]:
# Import from the public ``IPython.display`` module: importing ``display``
# from ``IPython.core.display`` is deprecated and fails on recent IPython.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook's ``.container`` element uses the full
# browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import functools
import os
import pickle
import re
import sys

In [3]:
def noDatum(func):
    """Decorator that makes ``func`` return ``''`` instead of raising.

    It is applied to ``LAS._datum`` so that a file without a usable datum
    line still parses; the empty string is the "no value" sentinel.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long run can be interrupted.
    """
    @functools.wraps(func)  # keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # Deliberate best-effort: any parsing problem means "no value".
            return ''
    return wrapper


class LAS:
    """Parse every file of an input directory and pickle the extracted fields.

    For each file, ``process`` extracts the ``strt``, ``stop``, ``lat``,
    ``lon`` and (optional) ``datum`` header values, the curve definitions
    found between ``~curve information`` and ``~params``, and the numeric
    rows that follow the ``~a...`` header line. The result is written as
    ``<pickle_dir>/<filename>.pickle``.

    Attributes:
        dir: Input directory.
        las_files: Sorted names of the regular files found in ``dir``.
        pickle_dir: Output directory (created if missing).
    """

    # Signed decimal number. The sign must be captured, otherwise negative
    # coordinates (e.g. western longitudes) silently become positive.
    _NUMBER = re.compile(r'[-+]?[0-9]+\.[0-9]+')

    # Header line that introduces the data rows, e.g. "~ascii -----...".
    _ASCII_HEADER = re.compile(r'^[ \t]*~a[^\n]*', re.IGNORECASE | re.MULTILINE)

    # Everything between the curve-section marker and the params marker.
    _CURVE_SECTION = re.compile(r'(?<=~curve information)(.*?)(?=~params)',
                                re.DOTALL)

    def __init__(self, input_dir, pickle_dir):
        """Record the directories, list the input files, ensure output dir.

        Args:
            input_dir: Directory containing the files to parse.
            pickle_dir: Directory where pickles are written.

        Raises:
            OSError: If ``input_dir`` cannot be listed or ``pickle_dir``
                cannot be created.
        """
        self.dir = input_dir
        # Keep only regular files: a sub-directory can never be parsed and
        # would only show up as a spurious failure. Sorted for a
        # deterministic processing order.
        self.las_files = sorted(
            name for name in os.listdir(self.dir)
            if os.path.isfile(os.path.join(self.dir, name))
        )
        self.pickle_dir = pickle_dir
        self._check_for_output_dir(pickle_dir)

    def __repr__(self):
        resp = '''
        Input Directory: {idir}
        Output Directory: {odir}
        Number of LAS Files: {num}
        '''.format(idir=self.dir, odir=self.pickle_dir, num=len(self.las_files))
        return resp

    def _check_for_output_dir(self, dir_):
        """Create ``dir_`` (and any missing parents) if it does not exist."""
        os.makedirs(dir_, exist_ok=True)

    def _concat_input(self, file):
        """Return the path of ``file`` inside the input directory."""
        return os.path.join(self.dir, file)

    def _concat_output(self, file):
        """Return the pickle path for ``file`` inside the output directory."""
        return os.path.join(self.pickle_dir, file + r'.pickle')

    def _open(self, file):
        """Return the full text content of input file ``file``."""
        with open(self._concat_input(file), 'r') as fh:
            return fh.read()

    def _header_float(self, file, mnemonic):
        """Return the first signed decimal on the first ``mnemonic`` line.

        The line is the first one in ``file`` that contains ``mnemonic``
        followed by a space.

        Raises:
            ValueError: If no such line exists or it holds no decimal number.
        """
        line = re.search(r'.*' + re.escape(mnemonic) + r' .*', file)
        if line is None:
            raise ValueError('no {!r} line found'.format(mnemonic))
        number = self._NUMBER.search(line.group())
        if number is None:
            raise ValueError('no decimal value on {!r} line: {!r}'.format(
                mnemonic, line.group()))
        return float(number.group())

    def _lat(self, file):
        """Return the value of the ``lat`` header line as a float."""
        return self._header_float(file, 'lat')

    def _long(self, file):
        """Return the value of the ``lon`` header line as a float."""
        return self._header_float(file, 'lon')

    def _start(self, file):
        """Return the value of the ``strt`` header line as a float."""
        return self._header_float(file, 'strt')

    def _stop(self, file):
        """Return the value of the ``stop`` header line as a float."""
        return self._header_float(file, 'stop')

    @noDatum
    def _datum(self, file):
        """Return the ``datum`` value as a float, or ``''`` if unavailable."""
        return self._header_float(file, 'datum')

    def _get_curve_info(self, file):
        """Return the raw lines between ``~curve information`` and ``~params``.

        The first element (rest of the marker line) and the last element
        (text right before ``~params``) are dropped.

        Raises:
            ValueError: If the two markers are not found in ``file``.
        """
        section = self._CURVE_SECTION.search(file)
        if section is None:
            raise ValueError('no curve information section found')
        return section.group().split('\n')[1:-1]

    def _curve_info(self, file):
        """Return ``(curves, units, description)`` lists for the curve section.

        Each curve line is read as ``name.unit ... : description``. Spaces
        left of the first colon are removed, and the name/unit boundary is
        the FIRST dot, so units that contain dots (``ohm.m``) are kept
        whole. Blank lines and ``#`` comment lines are skipped.

        Raises:
            ValueError: If a curve line has no colon or no dot.
        """
        curves = []
        units = []
        description = []

        for line in self._get_curve_info(file):
            stripped = line.strip()
            if not stripped or stripped.startswith('#'):
                continue

            parts = line.split(':')
            if len(parts) < 2:
                raise ValueError('curve line without ":": {!r}'.format(line))

            name, dot, unit = parts[0].replace(' ', '').partition('.')
            if not dot:
                raise ValueError('curve line without ".": {!r}'.format(line))

            curves.append(name)
            units.append(unit)
            description.append(parts[1].strip())

        return curves, units, description

    def _data(self, file):
        """Return the numeric rows that follow the ``~a...`` header line.

        Values may be separated by whitespace and/or commas. Blank lines and
        ``#`` comment lines are skipped, and the last row is kept whether or
        not the file ends with a newline.

        Raises:
            ValueError: If there is no ``~a...`` header line or a value
                cannot be converted to ``float``.
        """
        header = self._ASCII_HEADER.search(file)
        if header is None:
            raise ValueError('no ~ascii data section found')

        rows = []
        for line in file[header.end():].split('\n'):
            fields = [tok for tok in re.split(r'[\s,]+', line) if tok]
            if not fields or fields[0].startswith('#'):
                continue
            rows.append([float(val) for val in fields])
        return rows

    def _pickle_it(self, data_dict, filename):
        """Pickle ``data_dict`` to the output path derived from ``filename``."""
        out_filename = self._concat_output(filename)

        with open(out_filename, 'wb') as file:
            pickle.dump(data_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

    def process(self):
        """Parse every file in ``las_files`` and write one pickle per file.

        The file text is lower-cased before parsing. A file that cannot be
        parsed does not stop the run: it is reported on stdout together with
        the reason, and no pickle is written for it.
        """
        for filename in self.las_files:
            try:
                file = self._open(filename).lower()

                curves, units, description = self._curve_info(file)

                data_dict = {'start': self._start(file),
                             'stop': self._stop(file),
                             'lat': self._lat(file),
                             'long': self._long(file),
                             'datum': self._datum(file),
                             'curves': curves,
                             'units': units,
                             'description': description,
                             'data': self._data(file)
                            }

                self._pickle_it(data_dict, filename)
            except Exception as exc:
                # Keep going, but say why the file failed.
                print('Failed: ', filename, '-', repr(exc))

In [4]:
# Root directory of the LAS files to parse. The default is the original
# machine-specific path; set the LAS_INPUT_DIR environment variable to run
# the notebook elsewhere without editing this cell.
INPUT_DIR = os.environ.get('LAS_INPUT_DIR',
                           r'C:/Users/simskel/Desktop/MLChallenge/las')

# Directory where the parsed data is saved as pickle files. Override with the
# LAS_OUTPUT_DIR environment variable.
OUTPUT_DIR = os.environ.get('LAS_OUTPUT_DIR',
                            r'C:/Users/simskel/Desktop/MLChallenge/pickles')

In [5]:
# Build the parser: this lists the input directory and creates the output
# directory if it does not exist yet.
las = LAS(
    input_dir=INPUT_DIR,
    pickle_dir=OUTPUT_DIR,
)

In [6]:
# Parse every listed input file and write one pickle per file into the output
# directory; files that cannot be parsed are reported with a 'Failed: ' line
# instead of stopping the run.
las.process()