# Word Analyzer
Tool for multi-filtering words in a Pandas dataframe according to specific conditions :

- No compound words
- N-letter words
- Words starting with
- Words ending with
- Words that must contain given letters
- Words that must not contain given letters
- Words that must contain given letters at given ranks

## Explanation of the arguments of the multi_filters function
The 7 filters of the function are arranged in series. Each time the dataframe passes through an activated filter, it is transformed (rows that do not match the desired criteria are dropped) until it reaches the output of the function.

### no_comp 
**Type** : bool

**Default** : `True`

Remove compound words (words containing spaces or dashes) so that the other filters only operate on single words

`multi_filters(df, col_name ="Mot", no_comp=True)`

### length
**Type** : int

**Default** : `None`

Filter words according to a given length

`multi_filters(df, col_name ="Mot", length=7)`

### start_with
**Type** : str

**Default** : `None`

Filter words starting with one or more letters given in a str

`multi_filters(df, col_name ="Mot", start_with="Per")`

### end_with
**Type** : str

**Default** : `None`

Filter words ending with one or more letters given in a str

`multi_filters(df, col_name ="Mot", end_with="eur")`

### contains
**Type** : list

**Default** : `None`

Filter words according to the letters they must contain

`multi_filters(df, col_name ="Mot", contains=["r", "e", "t"])`

### not_contain
**Type** : list

**Default** : `None`

Filter words according to the letters they must NOT contain

`multi_filters(df, col_name ="Mot", not_contain=["v", "i", "m"])`

### nth_letters
**Type** : list of lists

**Default** : `None`

Filter words according to the letters they must contain at specific ranks. Each sub-element must be a list containing `[desired rank (int), desired letter (str)]`; ranks are 1-based, so rank 1 is the first letter of the word

`multi_filters(df, col_name ="Mot", nth_letters=[[2,"t"],[5,"r"]])`

### log
**Type** : str

**Default** : "info"

Sets the logging level to display. The default, "info", displays messages of that level and higher. In "debug" mode, advanced statistics are displayed after each filter pass, including the execution time, the number of rows deleted, etc.

In [35]:
from time import time
import logging
import pandas as pd

In [36]:
# Root logger (no name given): multi_filters() sets its level, and the
# module-level logging.debug()/logging.critical() calls in this file are
# emitted through it, so it must stay the root logger.
logger = logging.getLogger()

In [37]:
# Load the dictionary from dico.csv, order it by the "Mot" column, discard
# every row holding a missing value and renumber the index from 0.
df = (
    pd.read_csv("dico.csv")
    .sort_values("Mot")
    .dropna()
    .reset_index(drop=True)
)

# Row count of the cleaned dictionary, kept as the reference total.
INIT_ROWS = len(df)

In [38]:
def percent(partial, total, rnd=2):
    """ Returns 'partial' expressed as a percentage of 'total',
    rounded to 'rnd' decimals. A division by zero yields 0
    instead of raising. """

    try:
        ratio = partial / total
    except ZeroDivisionError:
        return 0

    return round(ratio * 100, rnd)

In [39]:
def debug(filter, start_time, end_time, rows_before, rows_after):
    """ Logs, at DEBUG level, the statistics of one filter pass.

    :param filter: name of the filter that has just been applied
        (shadows the builtin; the name is kept for existing callers)
    :type filter: str

    :param start_time: timestamp taken right before the filter ran
    :type start_time: float

    :param end_time: timestamp taken right after the filter ran
    :type end_time: float

    :param rows_before: number of rows entering the filter
    :type rows_before: int

    :param rows_after: number of rows left after the filter
    :type rows_after: int

    The "Global vario" figure is computed against the module-level
    INIT_ROWS constant, not against the dataframe being filtered.
    """

    exec_time = round(end_time - start_time, 3)
    # Computed once and reused for both the count and the percentage.
    rows_del = rows_before - rows_after
    relative_vario = percent(rows_del, rows_before)
    global_vario = percent(INIT_ROWS - rows_after, INIT_ROWS)

    # The message is assembled line by line so that its exact layout
    # (including trailing blanks) does not depend on editor settings.
    logging.debug(
        "\n"
        f"        --- '{filter}' FILTER --- \n"
        f"        Execution time : {exec_time}s\n"
        f"        Rows before : {rows_before} \n"
        f"        Rows after : {rows_after}\n"
        f"        Rows deleted : {rows_del} \n"
        f"        Ponctual vario : (-{relative_vario}%)\n"
        f"        Global vario : (-{global_vario}%)\n"
        "        "
    )

In [40]:
def _first_invalid_argument(length, start_with, end_with, nth_letters,
                            contains, not_contain):
    """ Returns the error message of the first invalid filter argument,
    or None when every supplied argument is usable. """

    # Container/scalar type of each optional argument.
    expected_types = (
        ("length", length, int),
        ("not_contain", not_contain, list),
        ("contains", contains, list),
        ("start_with", start_with, str),
        ("nth_letters", nth_letters, list),
        ("end_with", end_with, str),
    )
    for name, value, expected in expected_types:
        if value is not None and not isinstance(value, expected):
            return (f"'{name}' must be of type {expected.__name__}. \n"
                    f"            {type(value)} given")

    # Element type of the two letter lists.
    for name, letters in (("not_contain", not_contain),
                          ("contains", contains)):
        if letters is not None and not all(
                isinstance(x, str) for x in letters):
            return (f"One of the elements of '{name}' \n"
                    "            is not a str.")

    # A letter cannot be both required and forbidden.
    if contains is not None and not_contain is not None:
        if set(contains) & set(not_contain):
            return ("\n            'contains' and 'not_contain' "
                    "must not share common values  ")

    if nth_letters is not None:
        if not all(isinstance(x, list) and len(x) == 2
                   for x in nth_letters):
            return ("All elements of the 'nth letters' list \n"
                    "            must be lists of 2 elements: "
                    "[rank, letter]")

        if not all(type(rank) is int and isinstance(letter, str)
                   and len(letter) == 1 for rank, letter in nth_letters):
            return ("Each sub-element of nth_letters must be a list \n"
                    "            composed of 2 elements "
                    "[rank(int), 1 letter (str)]")

        # Ranks are 1-based; 0 or a negative rank would otherwise index
        # the word from its end.
        if any(rank < 1 for rank, _ in nth_letters):
            return "Ranks in 'nth_letters' are 1-based and must be >= 1"

    return None


def _letters_mask(words, letters, present):
    """ Returns a boolean mask over 'words': True where a word contains
    every element of 'letters' (present=True) or none of them
    (present=False). An empty 'letters' selects every word. """

    mask = pd.Series(True, index=words.index)
    for letter in set(letters):  # set() removes duplicates
        # Literal match: no regex, so '.', '*', ... mean themselves.
        found = words.str.contains(letter, regex=False, na=False)
        mask &= found if present else ~found
    return mask


def _ranks_mask(words, nth_letters):
    """ Returns a boolean mask over 'words': True where a word has the
    requested letter at each requested 1-based rank. """

    mask = pd.Series(True, index=words.index)
    for rank, letter in nth_letters:
        # .str.get() yields a missing value for words shorter than
        # 'rank', which compares unequal to the letter.
        mask &= words.str.get(rank - 1) == letter
    return mask


def multi_filters(df, col_name, no_comp=True, length=None, start_with=None,
                  end_with=None, nth_letters=None, contains=None,
                  not_contain=None, log="info"):
    """ Applies up to seven chained filters to the words of a dataframe
    column. Each activated filter drops the rows whose word does not
    satisfy it. Every argument is validated before any filtering; an
    invalid argument is reported with logging.critical and makes the
    function return None.

    :param df: dataframe holding the words
    :type df: pandas.DataFrame

    :param col_name: name of the column of 'df' to filter
    :type col_name: str

    :param no_comp: drop compound words (containing whitespace or '-')
    :type no_comp: bool

    :param length: exact length a word must have
    :type length: int

    :param start_with: letter(s) the word must start with; the value is
        passed through str.capitalize() before the comparison
    :type start_with: str

    :param end_with: letter(s) the word must end with (compared as given)
    :type end_with: str

    :param nth_letters: [rank, letter] pairs, rank being 1-based; the
        word must have each letter at the matching rank
    :type nth_letters: list of lists

    :param contains: letters the word must all contain
    :type contains: list of str

    :param not_contain: letters the word must contain none of
    :type not_contain: list of str

    :param log: level set on the module-level logger (debug, info,
        warning, critical); None or any other value selects CRITICAL
    :type log: str

    :return: a new filtered dataframe, or None on invalid arguments
    :rtype: pandas.DataFrame
    """
    # Logging initialisation
    levels = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "CRITICAL": logging.CRITICAL,
    }
    level_name = log.upper() if isinstance(log, str) else None
    logger.setLevel(levels.get(level_name, logging.CRITICAL))

    # ------------ DATAFRAME CHECKS ------------
    if not isinstance(df, pd.DataFrame):
        logging.critical(
            f"\n        df must be a Pandas dataframe. {type(df)} given ")
        return None

    if col_name not in df.columns:
        logging.critical(
            f"\n        '{col_name}' column doesn't exist in the dataframe."
            f"\n        Columns present : {list(df.columns)}")
        return None

    # ------------ ARGUMENTS CHECK ------------
    problem = _first_invalid_argument(length, start_with, end_with,
                                      nth_letters, contains, not_contain)
    if problem is not None:
        logging.critical(problem)
        return None

    init_time = time()
    init_rows = df.shape[0]
    filters_crossed = []

    logging.debug(f"""
    -- INITIAL VALUES --
    Start at : {init_time}
    Dataframe shape : {df.shape}
    Column to filter : {col_name}
    no_comp = {no_comp}
    length = {length}
    start_with = {start_with}
    end_with = {end_with}
    nth_letters = {nth_letters}
    contains = {contains}
    not_contain = {not_contain}
    """)

    def apply_filter(name, keep):
        """ Keeps the rows for which keep(words) is True, then logs the
        statistics of the pass and records the filter name. """
        nonlocal df
        rows_before = df.shape[0]
        start_time = time()
        df = df.loc[keep(df[col_name])]
        debug(name, start_time, time(), rows_before, df.shape[0])
        filters_crossed.append(name)

    # ------------ FILTERS ------------
    # FILTER 1/ No compound words
    if no_comp:
        apply_filter(
            "no_comp",
            lambda words: ~words.str.contains(r"[\s-]", na=False))

    # FILTER 2/ By word length
    if length is not None:
        apply_filter("length", lambda words: words.str.len() == length)

    # FILTER 3/ By absence of letters (a word is dropped as soon as it
    # contains ANY of the forbidden letters)
    if not_contain is not None:
        apply_filter(
            "not_contain",
            lambda words: _letters_mask(words, not_contain, present=False))

    # FILTER 4/ By presence of letters (ALL of them are required)
    if contains is not None:
        apply_filter(
            "contains",
            lambda words: _letters_mask(words, contains, present=True))

    # FILTER 5/ By beginning of word
    if start_with is not None:
        # NOTE(review): capitalize() upper-cases the first letter and
        # lower-cases the rest; presumably the dictionary entries are
        # capitalised - confirm against the data.
        prefix = start_with.capitalize()
        apply_filter(
            "start_with",
            lambda words: words.str.startswith(prefix, na=False))

    # FILTER 6/ By letters position
    if nth_letters is not None:
        apply_filter(
            "nth_letters",
            lambda words: _ranks_mask(words, nth_letters))

    # FILTER 7/ By ending of word
    if end_with is not None:
        apply_filter(
            "end_with",
            lambda words: words.str.endswith(end_with, na=False))

    rows_left = df.shape[0]
    if rows_left == 0:
        logging.info("No words found")

    rows_deleted = init_rows - rows_left
    logging.debug(
        "\n"
        "    -- FINAL STATS -- \n"
        f"    Total execution time : {round(time() - init_time, 3)}s\n"
        f"    Filters crossed = {len(filters_crossed)}/7 -> {filters_crossed}\n"
        f"    Total rows deleted : {rows_deleted}\n"
        f"    From {init_rows} to {rows_left} -> "
        f"(-{percent(rows_deleted, init_rows, rnd=4)}%)\n"
        "    "
    )

    return df

In [41]:
# Example: 7-letter single words of the "Mot" column starting with "g",
# ending with "it", with "t" as 4th letter, containing "a" and no "b".
multi_filters(
    df,
    col_name="Mot",
    length=7,
    start_with="g",
    end_with="it",
    nth_letters=[[4, "t"]],
    contains=["a"],
    not_contain=["b"],
)

Unnamed: 0,Mot,Définitions
354317,Gantait,['Du verbe ganter.']
355842,Gattait,"[""Troisième personne du singulier de l'indicat..."
358324,Gertait,"[""Troisième personne du singulier de l'indicat..."
365395,Goutait,"[""Troisième personne du singulier de l'indicat..."
365794,Goûtait,['Du verbe goûter.']
368175,Gratuit,"[""Qu'on donne, sans y être tenu."", '…']"
371798,Grutait,"[""Troisième personne du singulier de l'indicat..."
