In [20]:
from time import time
import logging
import pandas as pd

In [21]:
# Root logger (getLogger() called without a name); multi_filter sets its level on every call.
logger = logging.getLogger()

In [22]:
# Dictionary loaded from dico.csv: ordered by the "Mot" column, rows holding
# missing values removed, index renumbered from 0.
df = (
    pd.read_csv("dico.csv")
    .sort_values("Mot")
    .dropna()
    .reset_index(drop=True)
)
# Row count of the cleaned dictionary; debug() uses it for the global statistics.
INIT_ROWS = len(df)

In [23]:
def percent(partial, total, rnd=2):
    """Return `partial` as a percentage of `total`, rounded to `rnd` decimals.

    When the division raises ZeroDivisionError (zero `total`), 0 is returned.
    """
    try:
        ratio = partial / total
    except ZeroDivisionError:
        return 0
    return round(ratio * 100, rnd)

In [24]:
def debug(filter, start_time, end_time, rows_before, rows_after):
    """Log, at DEBUG level, the statistics of one filter pass.

    - filter = name of the filter, shown in the log header (the parameter
    shadows the built-in `filter` inside this function only)
    - start_time / end_time = timestamps, in seconds, taken around the filter
    - rows_before / rows_after = row counts before and after the filter

    The "Global vario" line is computed against the module-level INIT_ROWS,
    not against the frame handed to the calling function.
    """
    exec_time = round(end_time - start_time, 3)
    # Computed once and reused for both the raw count and the percentage.
    rows_del = rows_before - rows_after
    relative_vario = percent(rows_del, rows_before)
    global_vario = percent(INIT_ROWS - rows_after, INIT_ROWS)

    logging.debug(f"""
        --- '{filter}' FILTER --- 
        Execution time : {exec_time}s
        Rows before : {rows_before} 
        Rows after : {rows_after}
        Rows deleted : {rows_del} 
        Ponctual vario : (-{relative_vario}%)
        Global vario : (-{global_vario}%)
        """)

In [25]:
def _run_filter(df, name, build_mask, filters_crossed):
    """Apply one filter, log its statistics through `debug` and record it.

    - df = dataframe to filter
    - name = filter label, logged and appended to filters_crossed
    - build_mask = callable receiving df and returning the boolean mask of
    the rows to keep; it runs inside the timed section
    - filters_crossed = list collecting the names of the applied filters
    Returns the filtered dataframe.
    """
    rows_before = df.shape[0]
    start_time = time()

    df = df.loc[build_mask(df)]

    end_time = time()
    debug(name, start_time, end_time, rows_before, df.shape[0])
    filters_crossed.append(name)
    return df


def _letters_mask(words, letters, require_all):
    """Flag the words holding all (require_all=True) or any of the letters.

    Letters are matched literally (no regular expression) and with case
    sensitivity. With no letter at all, every word is flagged when
    require_all is True and none when it is False.
    """
    mask = pd.Series(require_all, index=words.index, dtype=bool)
    for letter in set(letters):
        present = words.str.contains(str(letter), regex=False)
        mask = (mask & present) if require_all else (mask | present)
    return mask


def _as_positions(nth_letters):
    """Normalise nth_letters into a {rank: letter} dict.

    Accepts a dict, an iterable of (rank, letter) pairs, or one bare
    (rank, letter) pair whose first item is an int.
    """
    is_single_pair = (
        isinstance(nth_letters, (list, tuple))
        and len(nth_letters) == 2
        and isinstance(nth_letters[0], int)
    )
    if is_single_pair:
        return {nth_letters[0]: nth_letters[1]}
    return dict(nth_letters)


def _positions_mask(words, positions):
    """Flag the words having every expected letter at its 1-based rank.

    positions maps rank -> letter. A rank lower than 1 or beyond the end of
    a word never matches.
    """
    mask = pd.Series(True, index=words.index, dtype=bool)
    lengths = words.str.len()
    for rank, letter in positions.items():
        # `>=` so that the last letter of a word (rank == length) is reachable.
        reachable = (lengths >= rank) & (rank >= 1)
        mask = mask & reachable & (words.str.get(rank - 1) == letter)
    return mask


def multi_filter(df, col_name, no_comp=True, length=None, start_with=None,
                 end_with=None, nth_letters=None, contains=None,
                 not_contain=None, log=None):
    """ Allows multiple filters to be applied to
    dictionary words :
    - df = Pandas dataframe of dico.csv
    - col_name = column name to filter in dataframe
    - no_comp = Remove compound words (words holding a whitespace
    character or a hyphen)
    - length = Word length (int)
    - start_with = Letter(s) with which the word must start; the value
    goes through str.capitalize() before matching
    - end_with = Letter(s) with which the word must end (case-sensitive)
    - nth_letters = Letters the word must contain at given ranks. Either a
    dict {rank: letter}, an iterable of (rank, letter) pairs such as
    [[4, "t"]], or a single (rank, letter) pair. rank is a positive
    integer counted from 1 and letter a str; a rank outside a word
    never matches
    - contains = Letters the word must ALL contain (literal,
    case-sensitive)
    - not_contain = Letters the word must contain NONE of (literal,
    case-sensitive)
    - log = Level name for the root logger: "DEBUG", "INFO", "WARNING" or
    "CRITICAL" (case-insensitive); None or any other value means
    CRITICAL

    Filters run in this order: no_comp, length, not_contain, contains,
    start_with, nth_letters, end_with.

    Returns the filtered dataframe (original index kept), or None when df
    is not a dataframe, when col_name is not one of its columns, or when
    contains and not_contain share a value.
    Raises TypeError when length is not an int or when start_with /
    end_with is not a str; these checks run before any filter is applied.
    """
    # Logging initialisation: missing or unknown level names fall back to
    # CRITICAL.
    levels = {
        "DEBUG": logging.DEBUG,
        "INFO": logging.INFO,
        "WARNING": logging.WARNING,
        "CRITICAL": logging.CRITICAL,
    }
    if log is None:
        logger.setLevel(logging.CRITICAL)
    else:
        logger.setLevel(levels.get(log.upper(), logging.CRITICAL))

    # Dataframe check (isinstance also accepts DataFrame subclasses)
    if not isinstance(df, pd.DataFrame):
        logging.critical(f"""
        df must be a Pandas dataframe. {type(df)} given """)
        return None

    if col_name not in df.columns:
        logging.critical(f"""
            '{col_name}' column doesn't exist in the dataframe.
            Columns present : {[col for col in df.columns]}""")
        return None

    # contains/not_contain check
    if contains is not None and not_contain is not None:
        if set(contains) & set(not_contain):
            logging.critical(f"""
            'contains' and 'not_contain' must not share common values  """)
            return None

    # Argument type checks, done up front so that no filter runs (and no
    # statistics are logged) for a call that is going to fail anyway.
    if length is not None and not isinstance(length, int):
        raise TypeError(f"""
            'length' must be a int type : ({type(length)} given)
            """)
    if start_with is not None and not isinstance(start_with, str):
        raise TypeError(f"""
            'start_with' must be a str type : 
            ({type(start_with)} given)
            """)
    if end_with is not None and not isinstance(end_with, str):
        raise TypeError(f"""
            'end_with' must be a str type : ({type(end_with)} given)
            """)

    init_time = time()
    init_rows = df.shape[0]
    filters_crossed = []

    logging.debug(f"""
    -- INITIAL VALUES --
    Start at : {init_time}
    Dataframe shape : {df.shape}
    Column to filter : {col_name}
    no_comp = {no_comp}
    length = {length}
    start_with = {start_with}
    end_with = {end_with}
    nth_letters = {nth_letters}
    contains = {contains}
    not_contain = {not_contain}
    """)

    # ------------ FILTERS ------------
    # FILTER 1/ No compound words
    if no_comp:
        df = _run_filter(
            df, "no_comp",
            lambda frame: ~frame[col_name].str.contains(r'[\s-]'),
            filters_crossed)

    # FILTER 2/ By word length
    if length is not None:
        df = _run_filter(
            df, "length",
            lambda frame: frame[col_name].str.len() == length,
            filters_crossed)

    # FILTER 3/ By absence of letters: a word is dropped as soon as it
    # holds ANY of the forbidden letters.
    if not_contain is not None:
        df = _run_filter(
            df, "not_contain",
            lambda frame: ~_letters_mask(frame[col_name], not_contain, False),
            filters_crossed)

    # FILTER 4/ By presence of letters: a word is kept only when it holds
    # ALL the requested letters.
    if contains is not None:
        df = _run_filter(
            df, "contains",
            lambda frame: _letters_mask(frame[col_name], contains, True),
            filters_crossed)

    # FILTER 5/ By beginning of word
    if start_with is not None:
        prefix = start_with.capitalize()
        df = _run_filter(
            df, "start_with",
            lambda frame: frame[col_name].str.startswith(prefix),
            filters_crossed)

    # FILTER 6/ By letters position
    if nth_letters is not None:
        positions = _as_positions(nth_letters)
        df = _run_filter(
            df, "nth_letters",
            lambda frame: _positions_mask(frame[col_name], positions),
            filters_crossed)

    # FILTER 7/ By ending of word
    if end_with is not None:
        df = _run_filter(
            df, "end_with",
            lambda frame: frame[col_name].str.endswith(end_with),
            filters_crossed)

    logging.debug(f"""
    -- FINAL STATS -- 
    Total execution time : {round(time() - init_time, 3)}s
    Filters crossed = {len(filters_crossed)}/7 -> {filters_crossed}
    Total rows deleted : {init_rows - df.shape[0]}
    From {init_rows} to {df.shape[0]} -> (-{percent(init_rows - df.shape[0], init_rows, rnd=4)}%)
    """)

    return df

In [26]:
# Example query on the "Mot" column: length 7, start_with "g", end_with "it",
# letters "a" and "r" required, "b" forbidden, "t" at rank 4.
multi_filter(
    df,
    col_name="Mot",
    length=7,
    start_with="g",
    end_with="it",
    nth_letters=[[4, "t"]],
    contains=["a", "r"],
    not_contain=["b"],
    log=None,
)

Unnamed: 0,Mot,Définitions
358324,Gertait,"[""Troisième personne du singulier de l'indicat..."
368175,Gratuit,"[""Qu'on donne, sans y être tenu."", '…']"
371798,Grutait,"[""Troisième personne du singulier de l'indicat..."
