In [1]:
import os
import numpy as np
import pandas as pd
import gensim

import nltk
from Litho.nlp_funcs import *
from Litho.similarity import (check_similarity, match_lithcode, jaccard_similarity, 
                              calc_similarity_score, print_sim_compare, merge_similar_words)

stopwords = nltk.corpus.stopwords.words('english')
stopw2 = ['redish', 'reddish', 'red', 'black', 'blackish', 'brown', 'brownish',
          'blue', 'blueish', 'orange', 'orangeish', 'gray', 'grey', 'grayish',
          'greyish', 'white', 'whiteish', 'purple', 'purpleish', 'yellow',
          'yellowish', 'green', 'greenish', 'light', 'very', 'pink','coarse',
          'fine', 'medium', 'hard', 'soft', 'coloured', 'multicoloured',
          'weathered', 'fractured', 'dark', 'color', 'colour', 'clean', 'cleaner']

stopwords.extend(stopw2)



In [2]:
path = os.getcwd()+'/'
file = 'boresTa.csv'
lith_data = pd.read_csv(path+file)

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
# Using output from `criteriaClassification.ipynb` to influence our list of known classes
classified = pd.read_csv(os.path.join(path, 'classification_criteria.csv'), usecols=[9])

known_classes = set([x[0] for x in classified.values.tolist()])

# manually add missing class for now
known_classes.add('quartzite')  

# Sort from longest to shortest (yes, the order matters!)
# e.g. we want to capture 'siltstone' before matching 'silt'
known_classes = sorted(known_classes, key=len, reverse=True)

In [6]:
len(known_classes)

93

In [7]:
# common mistakes found in dataset and corrected spelling
corrections = {
    'caly': 'clay',
    'claye': 'clay',
    'clayeyy': 'clay',
    'gravelly': 'gravel',
    'grvl': 'gravel',
    'silts': 'silt',
    'silty': 'silt',
    'siltston': 'siltstone',
    'comapcted': 'compacted',
    'cchoesive': 'cohesive',
    'conglomerte': 'conglomerate',
    'conglomate': 'conglomerate',
    'comglomerate': 'conglomerate',
    'comapcted': 'compacted',
    'tospoil': 'topsoil',
    'toposil': 'topsoil',
    'bolders': 'boulder',
    'bolder': 'boulder',
    'bsalt': 'basalt',
    # the below are needed as it gets missed when it appears next to another word
    'colluvial': 'colluvial',
    'lithology': 'lithology',
    'sidertire': 'siderite'
}

In [8]:
# words the spell checker gets wrong
known_correct_words = ['coring', 'colluvial', 'silt', 'contaminents', 'aplite', 'concretion', 'igenous', 'quartzite']

In [9]:
def separate_words(mixed_str, target_word):
    return mixed_str.replace(target_word, ' {} '.format(target_word)).replace('  ', ' ').strip()

def remove_extra_whitespace(sentence):
    wordlist = sentence.split()
    return ' '.join([w.strip() for w in wordlist])

In [10]:
def fix_typo(word, class_list, corrections, known_correct_words, matches=None):
    """Recursively clean up sentences by identifying known words and corrections.
    
    :param word: str, string to separate
    :param class_list: list[str], of known lithological classes
    :param corrections: dict, a mapping of typos and corrections
    :param matches: list[str], identified classes
    
    :returns: list[str], corrected string
    
    Example: 
    >>> class_list = ['sandstone', 'siltstone', 'slate', 'basalt', 'rock', 'caly']
    >>> corrections = {'caly': 'clay'}
    >>> split_words(''sandstonesiltstoneslaterockbasaltcaly', class_list, matches=[])
    ['sandstone', 'siltstone', 'slate', 'rock', 'basalt', 'clay']
    
    """
    orig_word = word
    word = remove_extra_whitespace(word)
    word = ''.join([w for w in word if not w.isdigit()])  # remove numbers
    word = ' '.join([w for w in word.split() if w not in stopwords])  # remove stopwords
    word = remove_extra_whitespace(word)
    
    if matches is None:
        matches = []

    if len(word) == 0:
        return matches
    
    for possible_word in class_list:
        if possible_word in word:
            matches.append(possible_word)
            word = word.replace(possible_word, '')
        
        if len(word) == 0:
            break
        # End if
    # End for
    
    if len(word) > 1:
        fixed = False
        for typo in corrections:
            if typo in word:
                fixed = True
                word = word.replace(typo, ' {} '.format(corrections[typo])).replace('  ', ' ').strip()
            # End if
        # End for
        
        # Attempt match against known classes only if known misspellings fail
        if not fixed:
            for cls in known_classes:
                for wd in word.split():
                    if (wd not in known_correct_words) and calc_similarity_score(wd, cls) == 1.0:
                        word = word.replace(wd, cls)
                    # End if
                # End for
            # End for
        # End if
        
        if word != orig_word:
            word = ' '.join([wd.strip() for wd in word.split() if len(wd) > 1])  # strip single characters
            matches = fix_typo(word, class_list, corrections, known_correct_words, matches)
        else:
            matches.extend(orig_word.split())
        # End if
    # End if
    
    return list(set(matches))
# End fix_typo()

In [11]:
fix_typo('coarse sand and fine gravel', known_classes, corrections, known_correct_words)

['sand gravel']

In [12]:
fix_typo('sandsiltstonecomglomerate', known_classes, corrections, known_correct_words)

['siltstone', 'conglomerate', 'sand']

In [13]:
fix_typo('brownsand', known_classes, corrections, known_correct_words)

['sand']

In [14]:
fix_typo('topsoil', known_classes, corrections, known_correct_words)

['soil', 'top']

In [15]:
fix_typo('sandstonesiltstoneslaterockbasaltcalyravelcolluvial', known_classes, corrections, known_correct_words)

['basalt',
 'sandstone',
 'clay',
 'rock',
 'colluvial',
 'slate',
 'siltstone',
 'ravel']

In [16]:
tmp = 'sandy clay; light brown, strongly cohesive, rare silica clasts, . light brown sandy clay, cohesive, containing silicate jasper & lithic clasts from fine sand to mediumg ravel, poorly graded, '
tmp = ' '.join([wd for wd in tmp.split() if wd.isalnum()])
fix_typo(tmp, known_classes, corrections, known_correct_words)

['strongly',
 'clasts',
 'jasper',
 'lithic',
 'calcite',
 'poorly',
 'rare',
 'containing',
 'silica',
 'mediumg',
 'sand']

In [17]:
tmp = 'coarse sand and fine gravel, subangular, mostly white quartz with clear quartz & jasper.  clay nodules beige less than 20%'
fix_typo(tmp, known_classes, corrections, known_correct_words)

['less',
 'clay',
 'subangular,',
 'jasper',
 'calcrete',
 'nodules',
 'beige',
 'mostly',
 'sand gravel',
 'quartz']

In [18]:
# Get description column as all lower case and stripping special characters
descriptions = lith_data.Description.str.replace('\W', '').str.lower().tolist()

In [19]:
grouped_tokens = {}

cleaned_descriptions = []
for idx, src_desc in enumerate(descriptions):
    cleaned_desc = fix_typo(str(src_desc), known_classes, corrections, [])
    cleaned_desc = remove_extra_whitespace(' '.join(list(set([wd for wd in cleaned_desc if wd in known_classes]))).strip())
    cleaned_descriptions.append(cleaned_desc)
# End for

In [20]:
len(list(set(cleaned_descriptions)))

2943

In [21]:
comparison = zip(descriptions, cleaned_descriptions)

Export results to file

In [25]:
import datetime
current_dt = datetime.datetime.now().strftime('%Y-%m-%d_%H%M%S')

pd.DataFrame({'Description': descriptions, 'Extracted': cleaned_descriptions}).to_csv('{}_attempt.csv'.format(current_dt), index=None)
pd.DataFrame({'Unique Extracted Classes': list(set(cleaned_descriptions))}).to_csv('{}_unique_descriptions.csv'.format(current_dt), index=None)