In [1]:
import pandas as pd
import spacy
spacy.prefer_gpu()
nlp = spacy.load("en_core_web_sm")

import utils 
import warnings

In [2]:
# Load the source table used by the search examples below (path is relative to the notebook).
df = pd.read_csv('../data/actors.csv')

In [3]:
# Sample sentence, parsed into a spaCy Doc by the `nlp` pipeline loaded in the first cell.
text = nlp("James Stewart is the actor with the highest Rating.")
# FormHandler-style arguments; `_sort` is one of the keys `search_args` inspects by default.
args = {'_sort': ['-rating']}

In [28]:
def _search(text, args, df, copy=False):
    """Search a dataframe and FormHandler arguments for the tokens of a text.

    The steps, in order, are:
    1. filter the dataframe with `utils.gfilter` and sanitize the FormHandler
       arguments with `utils.sanitize_fh_args`,
    2. search the dataframe (`DFSearch.search`) and the arguments
       (`search_args`) for tokens in the text,
    3. detect inflections on the tokens found (`grammar.find_inflections`).

    Parameters
    ----------
    text : spacy.Doc
        Input text
    args : dict
        Formhandler arguments. A shallow copy is handed to `utils.gfilter`.
    df : pd.DataFrame
        Source dataframe.
    copy : bool, optional
        If true, the pipeline works on `df.copy()` instead of `df` itself.

    Returns
    --------
    tuple
        of search results, the (unchanged) input text and token inflections.
        The inflections are a dict mapping each token to a list of dicts with
        the keys `source`, `fe_name` and `func_name`. The webapp uses these to
        construct a tornado template.
    """
    # utils.load_spacy_model()
    if copy:
        df = df.copy()
    df = utils.gfilter(df, args.copy())
    # Do this only if needed:
    # clean_text = utils.sanitize_text(text.text)
    args = utils.sanitize_fh_args(args, df)
    # Is this correct?
    dfs = DFSearch(df)
    dfix = dfs.search(text)
    dfix.update(search_args(dfs.ents, args))
    dfix.clean()
    inflections = grammar.find_inflections(dfix, args, df)
    # Reduce every inflection function to a plain description of itself.
    _infl = {
        token: [
            {
                'source': func.source,
                'fe_name': func.fe_name,
                'func_name': func.__name__
            }
            for func in funcs
        ]
        for token, funcs in inflections.items()
    }
    # FIXME: Why return text if it's unchanged?
    return dfix, text, _infl

def _df_maxlen(df):
    # Find the length of the longest string present in the columns, indices or values of a df
    col_max = max([len(c) for c in df.columns.astype(str)])
    ix_max = max([len(c) for c in df.index.astype(str)])
    array_max = max([df[c].astype(str).apply(len).max() for c in df])
    return max(col_max, ix_max, array_max)

def search_args(entities, args, lemmatized=True, fmt='fh_args["{}"][{}]',
                argkeys=('_sort', '_by', '_c')):
    """
    Search formhandler arguments provided as URL query parameters.

    Parameters
    ----------
    entities : list
        list of named entities found in the source text. Each entity is
        iterated over, and the search runs on the flattened sequence of
        their elements.
    args : dict
        FormHandler args as parsed by g1.url.parse(...).searchList
    lemmatized : bool, optional
        whether to search on lemmas of text values
    fmt : str, optional
        String format used to describe FormHandler arguments in the template.
        Note that this is currently not passed on to the search helpers.
    argkeys : tuple, optional
        Formhandler argument keys to be considered for the search. Any key not
        present in this will be ignored.
        # TODO: Column names can be keys too!!

    Returns
    -------
    dict
        Mapping of entities / tokens to objects describing where they are found
        in Formhandler arguments. Each search result object has the following
        structure:
        {
            'type': 'some token',
            'location': 'fh_args',
            'tmpl': 'fh_args["_by"][0]'  # The template that gets this token from fh_args
        }
    """
    # Imported locally so that the function does not depend on a later
    # notebook cell having been run first.
    from itertools import chain

    args = {k: v for k, v in args.items() if k in argkeys}
    search_res = {}
    entities = list(chain.from_iterable(entities))
    # The groupby, sort and select results are merged in this order, so later
    # helpers overwrite earlier ones on a shared key.
    search_res.update(_search_groupby(entities, args, lemmatized=lemmatized))
    search_res.update(_search_sort(entities, args, lemmatized=lemmatized))
    search_res.update(_search_select(entities, args, lemmatized=lemmatized))
    return search_res


In [23]:
class DFSearch(object):
    """Make a dataframe searchable.

    Search results accumulate in `self.results` (a `DFSearchResults`), and the
    entities found by the most recent `search_nes` call are kept in `self.ents`.
    """

    def __init__(self, df, nlp=None, **kwargs):
        """Default constructor.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe to search.
        nlp : A `spacy.lang` model, optional
            Defaults to the model returned by `utils.load_spacy_model()`.
        matcher : optional keyword argument
            The matcher handed to `utils.ner` when searching for named
            entities. Defaults to `utils.make_np_matcher(nlp)`.
        """
        self.df = df
        # What do results contain?
        # A map of tokens to list of search results.
        self.results = DFSearchResults()
        if not nlp:
            nlp = utils.load_spacy_model()
        self.matcher = kwargs.get('matcher', utils.make_np_matcher(nlp))
        self.ents = []

    def search(self, text, colname_fmt='df.columns[{}]',
               cell_fmt='df["{}"].iloc[{}]', **kwargs):
        """
        Search the dataframe.

        Named entities are searched first. Then, if the whole text is no longer
        than the longest string in the dataframe, the text is searched as a
        single unit; otherwise its tokens and numbers are searched individually.

        Parameters
        ----------
        text : spacy.Doc
            The text to search.
        colname_fmt : str, optional
            String format to describe dataframe columns in the search results,
            can be one of 'df.columns[{}]' or 'df[{}]'.
        cell_fmt : str, optional
            String format to describe dataframe values in the search results.
            Can be one of 'df.iloc[{}, {}]', 'df.loc[{}, {}]', 'df[{}][{}]', etc.
        **kwargs
            Passed on to `search_columns` and `search_table`.

        Returns
        -------
        dict
            A dictionary whose keys are tokens from `text` found in
            the source dataframe, and values are a list of locations in the df
            where they are found.
        """
        # Forward the formats so that named-entity results are described in the
        # same way as every other result of this search.
        self.search_nes(text, colname_fmt=colname_fmt, cell_fmt=cell_fmt)
        if len(text.text) <= _df_maxlen(self.df):
            for i in _text_search_array(text.text, self.df.columns):
                self.results[text] = {'location': 'colname', 'tmpl': colname_fmt.format(i),
                                      'type': 'doc'}
            for x, y in zip(*_text_search_array(text.text, self.df)):
                x = utils.sanitize_indices(self.df.shape, x, 0)
                y = utils.sanitize_indices(self.df.shape, y, 1)
                self.results[text] = {
                    'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x),
                    'type': 'doc'}

        else:
            for token, ix in self.search_columns(text, **kwargs).items():
                ix = utils.sanitize_indices(self.df.shape, ix, 1)
                self.results[token] = {'location': 'colname', 'tmpl': colname_fmt.format(ix),
                                       'type': 'token'}

            for token, (x, y) in self.search_table(text, **kwargs).items():
                x = utils.sanitize_indices(self.df.shape, x, 0)
                y = utils.sanitize_indices(self.df.shape, y, 1)
                self.results[token] = {
                    'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x),
                    'type': 'token'}
            self.search_quant([c for c in text if c.pos_ == 'NUM'])
        # self.search_derived_quant([c.text for c in selfdoc if c.pos_ == 'NUM'])

        return self.results

    def search_nes(self, doc, colname_fmt='df.columns[{}]', cell_fmt='df["{}"].iloc[{}]'):
        """Find named entities in text, and search for them in the dataframe.

        The entities found are stored in `self.ents`; matches are added to
        `self.results` with the type 'ne'.

        Parameters
        ----------
        doc : spacy.Doc
            The text to search.
        colname_fmt : str, optional
            String format to describe dataframe columns in the search results.
        cell_fmt : str, optional
            String format to describe dataframe values in the search results.
        """
        self.ents = utils.ner(doc, self.matcher)
        # Progress output: this is the "entities: [...]" line seen when searching.
        print("entities:",self.ents)
        for token, ix in self.search_columns(self.ents, literal=True).items():
            ix = utils.sanitize_indices(self.df.shape, ix, 1)
            self.results[token] = {
                'location': 'colname',
                'tmpl': colname_fmt.format(ix), 'type': 'ne'
            }
        for token, (x, y) in self.search_table(self.ents, literal=True).items():
            x = utils.sanitize_indices(self.df.shape, x, 0)
            y = utils.sanitize_indices(self.df.shape, y, 1)
            self.results[token] = {
                'location': 'cell',
                'tmpl': cell_fmt.format(self.df.columns[y], x), 'type': 'ne'}

    def search_table(self, text, **kwargs):
        """Search the values of (a copy of) the dataframe for tokens in `text`."""
        kwargs['array'] = self.df.copy()
        return self._search_array(text, **kwargs)

    def search_columns(self, text, **kwargs):
        """Search df columns for tokens in `text`."""
        kwargs['array'] = self.df.columns
        return self._search_array(text, **kwargs)

    def search_quant(self, quants, nround=2, cell_fmt='df["{}"].iloc[{}]'):
        """Search the dataframe for a set of quantitative values.

        Parameters
        ----------
        quants : list / array like
            The tokens to search. Each needs a `.text` attribute; tokens whose
            text does not parse as a float (e.g. "three") are skipped.
        nround : int, optional
            Passed to `utils.sanitize_df`, and used to round the parsed values
            of `quants` before they are searched.
        cell_fmt : str, optional
            String format to describe dataframe values in the search results.
        """
        dfclean = utils.sanitize_df(self.df, nround)
        tokens, values = [], []
        for tk in quants:
            try:
                values.append(float(tk.text))
            except ValueError:
                # Tagged as a number, but not written in digits: nothing to
                # compare numerically.
                continue
            tokens.append(tk)
        n_quant = np.array(values, dtype='float').round(nround)
        for x, y in zip(*dfclean.isin(n_quant).values.nonzero()):
            x = utils.sanitize_indices(dfclean.shape, x, 0)
            y = utils.sanitize_indices(dfclean.shape, y, 1)
            cell = dfclean.iloc[x, y]
            # The first token whose rounded value equals this cell is the key.
            matches = [tk for tk, val in zip(tokens, n_quant) if val == cell]
            if not matches:
                continue
            self.results[matches[0]] = {
                'location': 'cell', 'tmpl': cell_fmt.format(self.df.columns[y], x),
                'type': 'quant'}

    def search_derived_quant(self, quants, nround=2):
        """Search the common derived dataframe parameters for a set of quantitative values.

        Only the number of rows (`len(df)`) is checked at present.

        Parameters
        ----------
        quants : list / array like
            The values to search. Each must be convertible with `int()`.
        nround : int, optional
            Passed to `utils.sanitize_df`.
        """
        dfclean = utils.sanitize_df(self.df, nround)
        quants = np.array(quants)
        #  n_quant = quants.astype('float').round(2)

        for num in quants:
            if int(num) == len(dfclean):
                self.results[num] = {
                    'location': 'cell', 'tmpl': "len(df)",
                    'type': 'quant'}

    def _search_array(self, text, array, literal=False,
                      case=False, lemmatize=True, nround=False):
        """Search for tokens in text within an array.

        Dispatches to `_search_1d_array` when `array.ndim == 1`, and to
        `_search_2d_array` otherwise.

        Parameters
        ----------
        text : str or spacy document
            Text to search
        array : array-like
            Array to search in.
        literal : bool, optional
            Whether to match tokens to values literally.
        case : bool, optional
            If true, run a case sensitive search.
        lemmatize : bool, optional
            If true (default), search on lemmas of tokens and values.
        nround : int, optional
            Significant digits used to round `array` before searching.

        Returns
        -------
        dict
            Mapping of tokens to a sequence of indices within `array`.

        Example
        -------
        >>> _search_array('3', np.arange(5))
        {'3': [3]}
        >>> df = pd.DataFrame(np.eye(3), columns='one punch man'.split())
        >>> _search_array('1', df.values)
        {'1': [(0, 0), (1, 1), (2, 2)]}
        >>> _search_array('punched man', df.columns)
        {'punched': [1], 'man': [2]}
        >>> _search_array('1 2 buckle my shoe', df.index)
        {'1': [1], '2': [2]}
        """
        if array.ndim == 1:
            func = _search_1d_array
        else:
            func = _search_2d_array
        return func(text, array, literal, case, lemmatize, nround)
        # if len(res) == 0:  # Fall back on searching the whole string, not just the entities
        #     res = func([text], array, literal, case, lemmatize, nround)
        # return res
class DFSearchResults(dict):
    """A convenience wrapper around `dict` to collect search results.

    Different from `dict` in that values are always lists, and setting to
    existing key appends to the list. A result that is already in the list of
    its key is not added again.
    """

    def __setitem__(self, key, value):
        """Start a list for a new `key`, or append `value` if it isn't listed yet."""
        if key not in self:
            super(DFSearchResults, self).__setitem__(key, [value])
        elif value not in self[key]:
            # Compare against every stored result, not only the first one,
            # so that no result is recorded twice.
            self[key].append(value)

    def update(self, other):
        """Add every `(key, value)` pair of the mapping `other` via `__setitem__`."""
        # Needed because the default update method doesn't seem to use setitem
        for k, v in other.items():
            self[k] = v

    def clean(self):
        """Sort the search results for each token by priority and un-overlap tokens."""
        for v in self.values():
            _sort_search_results(v)
        # unoverlap the keys
        to_remove = []
        for k in self:
            to_search = self.keys() - {k}
            if utils.is_overlap(k, to_search):
                to_remove.append(k)
        # Delete only after the loop: a dict can't change size while iterated.
        for i in to_remove:
            del self[i]



In [24]:
# Wrap the dataframe loaded above in a DFSearch instance.
dfs = DFSearch(df)

In [30]:
# `chain` is used by `search_args` and `np` by the array search helpers, so both
# imports have to run before those functions are called.
from itertools import chain
# dfs.search(text)
# dfs.search_nes(text)
import numpy as np
# Search the dataframe first: this fills `dfs.ents`, which `search_args` needs.
dfs.search(text)
# NOTE: this call raises the NameError recorded below, because `_search_groupby`
# is not defined in any visible cell of this notebook.
search_args(dfs.ents, args)

entities: [Rating, Stewart, James Stewart, actor, James]


  


NameError: name '_search_groupby' is not defined

In [27]:
# Display the full source dataframe.
df

Unnamed: 0,category,name,rating,votes
0,Actors,Humphrey Bogart,0.57,109
1,Actors,Cary Grant,0.44,142
2,Actors,James Stewart,0.99,120
3,Actors,Marlon Brando,0.1,108
4,Actors,Fred Astaire,0.21,84
5,Actresses,Katharine Hepburn,0.04,63
6,Actresses,Bette Davis,0.28,14
7,Actresses,Audrey Hepburn,0.12,94
8,Actresses,Ingrid Bergman,0.3,52
9,Actors,Spencer Tracy,0.47,192


In [8]:
def _preprocess_array_search(text, array, literal=False, case=False, lemmatize=True,
                             nround=False):
    """Prepare the tokens of `text` and the values of `array` for matching.

    Parameters
    ----------
    text : iterable of spacy tokens / spans
        Each element needs a `.text` and a `.lemma_` attribute.
    array : pd.Index, pd.Series or pd.DataFrame
        The values to search in. Object columns of a 2D array are
        overwritten in place when lemmatizing.
    literal : bool, optional
        If true, tokens are compared by their text and `array` is untouched.
    case : bool, optional
        Case sensitive search. Not implemented.
    lemmatize : bool, optional
        If true (default), tokens and string values are replaced by lemmas.
    nround : int, optional
        Rounding of `array`. Not implemented.

    Returns
    -------
    tuple
        A `pd.Series` of strings to search for, indexed by the elements of
        `text`, and the (possibly lemmatized) array.

    Raises
    ------
    NotImplementedError
        If `case` or `nround` is truthy.
    """
    nlp = utils.load_spacy_model()
    if case or nround:
        raise NotImplementedError

    if literal and lemmatize:
        warnings.warn('Ignoring lemmatization.')

    if not literal and not lemmatize:
        warnings.warn(
            'One of `literal` or `lemmatize` must be True. Falling back to lemmatize=True')
        literal, lemmatize = False, True

    if literal:  # ignore every other flag else
        return pd.Series([c.text for c in text], index=text), array

    # Not literal, so `lemmatize` is necessarily truthy at this point.
    tokens = pd.Series([c.lemma_ for c in text], index=text)
    if array.ndim == 1:
        parsed = array.map(nlp)
        array = pd.Series([tok.lemma_ for doc in parsed for tok in doc])
    elif array.ndim == 2:
        object_columns = array.columns[array.dtypes == np.dtype('O')]
        for col in object_columns:
            docs = [nlp(cell if isinstance(cell, str) else str(cell)) for cell in array[col]]
            try:
                # Fails with a ValueError when the number of lemmas differs
                # from the number of rows, i.e. some cell has several tokens.
                array[col] = [tok.lemma_ for doc in docs for tok in doc]
            except ValueError:
                warnings.warn('Cannot lemmatize multi-word cells.')
                if not case:  # still need to respect the `case` param
                    array[col] = array[col].str.lower()

    return tokens, array

def _search_1d_array(text, array, literal=False, case=False, lemmatize=True,
                     nround=False):
    """Search a one-dimensional array for the tokens of `text`.

    Tokens and array are first prepared by `_preprocess_array_search`, which
    takes the `literal`, `case`, `lemmatize` and `nround` flags.

    Returns
    -------
    dict
        Mapping of each token found to the position of its value in the
        prepared array. When a value occurs more than once, the last position
        is kept. Empty if nothing matches.
    """
    tokens, array = _preprocess_array_search(text, array, literal, case, lemmatize, nround)
    found = array.isin(tokens)
    if not found.any():
        return {}
    if isinstance(found, pd.Series):
        found = found.to_numpy()
    positions = found.nonzero()[0]
    value_to_position = {}
    for pos in positions:
        value_to_position[array[pos]] = pos
    matched = tokens[tokens.isin(array)]
    results = {}
    for token, value in matched.items():
        results[token] = value_to_position[value]
    return _remerge_span_tuples(results)

def _search_2d_array(text, array, literal=False, case=False, lemmatize=True, nround=False):
    """Search a two-dimensional array (dataframe) for the tokens of `text`.

    The array is cast to `str` before it and the tokens are prepared by
    `_preprocess_array_search`.

    Returns
    -------
    dict
        Mapping of each token found to the `(row, column)` position of its
        value in the prepared array. When a value occurs more than once, the
        last position in row-major order is kept. Empty if nothing matches.
    """
    array = array.astype(str)
    tokens, array = _preprocess_array_search(text, array, literal, case, lemmatize, nround)
    hits = array.isin(tokens.values)
    if not hits.any().any():
        return {}
    rows, cols = hits.values.nonzero()
    value_to_position = {}
    for i, j in zip(rows, cols):
        value_to_position[array.iloc[i, j]] = (i, j)
    matched = tokens[tokens.isin(array.values.ravel())]
    results = {token: value_to_position[value] for token, value in matched.items()}
    return _remerge_span_tuples(results)


In [9]:
def _remerge_span_tuples(results):
    unmerged_spans = [k for k in results if isinstance(k, tuple)]
    for span in unmerged_spans:
        start, end = span[0].idx, span[-1].idx + len(span[-1])
        new_span = span[0].doc.char_span(start, end)
        results[new_span] = results.pop(span)
    return results


In [11]:
import re
# Token patterns for the rule based matcher, keyed by rule ID. Each pattern
# is a sequence of one or more tokens with the given part-of-speech tags.
NP_RULES = {
    'NP1': [{'POS': 'PROPN', 'OP': '+'}],
    'NP2': [{'POS': 'NOUN', 'OP': '+'}],
    'NP3': [{'POS': 'ADV', 'OP': '+'}, {'POS': 'VERB', 'OP': '+'}],
    'NP4': [{'POS': 'ADJ', 'OP': '+'}, {'POS': 'VERB', 'OP': '+'}],
    'QUANT': [{'POS': 'NUM', 'OP': '+'}]
}
# Matches a number at the start of a string: either ".5" or "3", "3." and "3.14".
# `\d` (a digit) is required here; a bare `d` only matches the letter "d".
QUANT_PATTERN = re.compile(r'(^\.\d+|^\d+\.?(\d?)+)')
# Module level cache; `make_np_matcher` stores the matcher it builds under 'matcher'.
_spacy = {
    'model': False,
    'lemmatizer': False,
    'matcher': False
}

def make_np_matcher(nlp, rules=NP_RULES):
    """Make a rule based noun phrase matcher.

    The matcher is built once and stored in `_spacy['matcher']`. As long as
    that stored matcher is truthy, it is returned as is, whatever `nlp` and
    `rules` are passed.

    Parameters
    ----------
    nlp : `spacy.lang`
        The spacy model to use.
    rules : dict, optional
        Mapping of rule IDS to spacy attribute patterns, such that each mapping
        defines a noun phrase structure.

    Returns
    -------
    `spacy.matcher.Matcher`
    """
    cached = _spacy['matcher']
    if cached:
        return cached

    from spacy.matcher import Matcher
    matcher = Matcher(nlp.vocab)
    for rule_id, pattern in rules.items():
        matcher.add(rule_id, None, pattern)
    _spacy['matcher'] = matcher
    return matcher

In [12]:
# Build a matcher from NP_RULES directly, without going through the cache
# used by `make_np_matcher`.
# NOTE(review): `add(key, None, pattern)` looks like the spaCy v2 call form —
# confirm against the installed spaCy version.
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
for k, v in NP_RULES.items():
    matcher.add(k, None, v)

In [13]:
def ner(doc, matcher, match_ids=False, remove_overlap=True):
    """Find all NEs and other nouns in a spacy doc.

    Parameters
    ----------
    doc: spacy.tokens.doc.Doc
        The document in which to search for entities.
    matcher: spacy.matcher.Matcher
        The rule based matcher to use for finding noun phrases.
    match_ids: list, optional
        IDs from the spacy matcher to filter from the matches. If falsy
        (default), every match is kept.
    remove_overlap: bool, optional
        Whether to remove overlapping tokens from the result.

    Returns
    -------
    list
        List of spacy.token.span.Span objects, without duplicates. The order
        is not defined.
    """
    entities = set()
    for span in doc.ents:
        # Trim whitespace tokens off the entity; skip it if nothing is left.
        newtokens = [c for c in span if not c.is_space]
        if newtokens:
            entities.add(doc[newtokens[0].i: (newtokens[-1].i + 1)])
    if not match_ids:
        entities.update(doc[start:end] for _, start, end in matcher(doc))
    else:
        for m_id, start, end in matcher(doc):
            if matcher.vocab.strings[m_id] in match_ids:
                entities.add(doc[start:end])
    if remove_overlap:
        return unoverlap(entities)
    # Return a list here as well, so the return type doesn't depend on
    # `remove_overlap`.
    return list(entities)


In [16]:
def unoverlap(tokens):
    """Drop every token for which `is_overlap` reports an overlap with the rest.

    Each token is checked against the set of all the other tokens. The tokens
    that are kept are returned as a list, in iteration order of `tokens`.
    """
    canonical = {tok: tok for tok in tokens}
    kept = [
        tok for tok in tokens
        if not is_overlap(canonical[tok], set(tokens) - {tok})
    ]
    return [canonical[tok] for tok in kept]


In [15]:
# NOTE(review): `ner` requires `doc` and `matcher`, so this bare call raises the
# TypeError recorded below; e.g. `ner(text, matcher)` would pass the Doc and the
# Matcher built in earlier cells.
ner()

TypeError: ner() missing 2 required positional arguments: 'doc' and 'matcher'

In [126]:
# Non-whitespace tokens of the first entity, as computed inside `ner`.
# NOTE(review): `doc` is not defined in any visible cell — presumably the same
# Doc as `text`; confirm.
[c for c in doc.ents[0] if not c.is_space]

[James, Stewart]

In [131]:
# Slice of the doc from the first to the last of `newtokens`, as done inside `ner`.
# NOTE(review): `doc` and `newtokens` are not defined in any visible cell — confirm
# where they come from.
doc[newtokens[0].i: (newtokens[-1].i + 1)]

James Stewart

In [132]:
# Raw matches, consumed in `ner` as (match id, start, end) triples.
# NOTE(review): `doc` is not defined in any visible cell — confirm.
matcher(doc)

[(2603134184856246923, 0, 1),
 (2603134184856246923, 0, 2),
 (2603134184856246923, 1, 2),
 (4505603235458341185, 4, 5),
 (4505603235458341185, 8, 9)]

In [137]:
# The slice of the doc covered by each match, as collected by `ner` when no
# `match_ids` are given.
# NOTE(review): `doc` is not defined in any visible cell — confirm.
[doc[start:end] for _, start, end in matcher(doc)]

[James, James Stewart, Stewart, actor, Rating]