In [7]:
from time import time
import logging
import pandas as pd

In [8]:
# Root logger (getLogger() is called without a name); multi_filter() sets its
# level on every call according to its `log` argument.
logger = logging.getLogger()

In [9]:
# Load dico.csv, order the rows by the "Mot" column, discard every row that
# has a missing value, then renumber the index from 0.
df = (
    pd.read_csv("dico.csv")
    .sort_values("Mot")
    .dropna()
    .reset_index(drop=True)
)

In [54]:
def percent(partial, total, rnd=2):
    """Return `partial` as a percentage of `total`, rounded to `rnd` decimals.

    Returns 0 when the division raises ZeroDivisionError (e.g. `total` is 0).
    """
    try:
        ratio = partial / total
    except ZeroDivisionError:
        return 0
    return round(ratio * 100, rnd)

In [77]:
_LOG_LEVELS = {
    "DEBUG": logging.DEBUG,
    "INFO": logging.INFO,
    "WARNING": logging.WARNING,
    "ERROR": logging.ERROR,
    "CRITICAL": logging.CRITICAL,
}


def _resolve_log_level(log):
    """Translate a level name (any case) to a logging level; None/unknown -> CRITICAL."""
    if log is None:
        return logging.CRITICAL
    return _LOG_LEVELS.get(log.upper(), logging.CRITICAL)


def _normalise_nth_letters(nth_letters):
    """Return `nth_letters` as a {rank: letter} dict, accepting a lone (rank, letter) pair."""
    is_single_pair = (
        isinstance(nth_letters, (tuple, list))
        and len(nth_letters) == 2
        and isinstance(nth_letters[0], int)
        and isinstance(nth_letters[1], str)
    )
    positions = dict([nth_letters] if is_single_pair else nth_letters)

    for rank in positions:
        if not isinstance(rank, int):
            raise TypeError(
                f"'nth_letters' ranks must be int type : ({type(rank)} given)"
            )
        if rank < 1:
            # Ranks are 1-based; 0 or a negative rank would silently index
            # from the end of the word.
            raise ValueError(
                f"'nth_letters' ranks must be >= 1 : ({rank} given)"
            )
    return positions


def _log_filter_stats(filter_name, started_at, rows_before, rows_after,
                      rows_initial=None):
    """Emit the per-filter debug report; does nothing unless DEBUG is enabled."""
    # The report (and its percent() calls) is only built when it will be shown.
    if not logging.getLogger().isEnabledFor(logging.DEBUG):
        return

    elapsed = round(time() - started_at, 3)
    deleted = rows_before - rows_after
    if rows_initial is None:
        vario = f"Vario : (-{percent(deleted, rows_before)}%)"
    else:
        vario = (
            f"Ponctual vario : (-{percent(deleted, rows_before)}%)\n"
            f"    Total vario : (-{percent(deleted, rows_initial)}%)"
        )

    # Module-level logging.debug() is used on purpose: it installs a default
    # handler when the root logger has none, so the report is actually shown.
    logging.debug(f"""
    -- '{filter_name}' FILTER --
    Execution time : {elapsed}s
    Rows before : {rows_before}
    Rows after : {rows_after}
    Rows deleted : {deleted}
    {vario}
    """)


def multi_filter(df, col_name, no_comp=True, length=None, start_with=None,
                 end_with=None, nth_letters=None, contains=None,
                 not_contain=None, log=None):
    """Apply several word filters, in sequence, to one text column.

    Filters run in this order: no_comp, length, not_contain, contains,
    start_with, nth_letters, end_with. All matching is case-sensitive; rows
    whose value is missing never satisfy a positive filter. The input
    dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the words (e.g. the one read from dico.csv).
    col_name : str
        Name of the column containing the words to filter.
    no_comp : bool, default True
        Remove compound words, i.e. values containing whitespace or a hyphen.
    length : int, optional
        Exact word length to keep.
    start_with : str, optional
        Letter(s) the word must start with. The value is passed through
        ``str.capitalize()`` before matching.
    end_with : str, optional
        Letter(s) the word must end with (used as given).
    nth_letters : dict, iterable of (rank, letter) pairs, or one pair, optional
        Letters required at given 1-based ranks, e.g. ``{1: "G", 3: "r"}``,
        ``[(1, "G"), (3, "r")]`` or just ``(3, "r")``.
    contains : iterable of str, optional
        The word must contain every one of these (matched literally).
    not_contain : iterable of str, optional
        The word must contain none of these (matched literally).
    log : str, optional
        Level name for the root logger: "DEBUG", "INFO", "WARNING", "ERROR"
        or "CRITICAL" (any case). None or an unknown name means CRITICAL.
        Note this changes the root logger level for the whole process.

    Returns
    -------
    pandas.DataFrame or None
        The filtered rows. None (after a critical log message) when `df` is
        not a dataframe, `col_name` is not one of its columns, or `contains`
        and `not_contain` share a value.

    Raises
    ------
    TypeError
        If `length` is not an int, `start_with`/`end_with` is not a str, or
        an `nth_letters` rank is not an int.
    ValueError
        If an `nth_letters` rank is lower than 1.
    """
    # Logging initialisation (same root logger as the notebook-level `logger`)
    logging.getLogger().setLevel(_resolve_log_level(log))

    # Dataframe check
    if not isinstance(df, pd.DataFrame):
        logging.critical(f"""
        df must be a Pandas dataframe. {type(df)} given """)
        return None

    if col_name not in df.columns:
        logging.critical(f"""
            '{col_name}' column doesn't exist in the dataframe.
            Columns present : {[col for col in df.columns]}""")
        return None

    # contains/not_contain check
    if contains is not None and not_contain is not None:
        if set(contains) & set(not_contain):
            logging.critical(f"""
            'contains' and 'not_contain' must not share common values  """)
            return None

    init_time = time()
    init_rows = df.shape[0]
    filters_crossed = []

    logging.debug(f"""
    -- INITIAL VALUES --
    Start at : {init_time}
    Dataframe shape : {df.shape}
    Column to filter : {col_name}
    no_comp = {no_comp}
    length = {length}
    start_with = {start_with}
    end_with = {end_with}
    nth_letters = {nth_letters}
    contains = {contains}
    not_contain = {not_contain}
    """)

    # FILTER 1/ No compound words
    if no_comp:
        tick = time()
        df = df.loc[~df[col_name].str.contains(r"[\s\-]", na=False)]
        _log_filter_stats("no_comp", tick, init_rows, df.shape[0])
        filters_crossed.append("no_comp")

    # FILTER 2/ By word length
    if length is not None:
        if not isinstance(length, int):
            raise TypeError(f"""
            'length' must be a int type : ({type(length)} given)
            """)

        rows_before = df.shape[0]
        tick = time()
        df = df.loc[df[col_name].str.len() == length]
        _log_filter_stats("length", tick, rows_before, df.shape[0], init_rows)
        filters_crossed.append("length")

    # FILTER 3/ By absence of letters
    if not_contain is not None:
        rows_before = df.shape[0]
        tick = time()
        # A word is rejected as soon as it holds ANY of the banned values;
        # an empty collection therefore removes nothing.
        for banned in set(not_contain):
            df = df.loc[~df[col_name].str.contains(banned, regex=False, na=False)]
        _log_filter_stats("not_contain", tick, rows_before, df.shape[0], init_rows)
        filters_crossed.append("not_contain")

    # FILTER 4/ By presence of letters
    if contains is not None:
        rows_before = df.shape[0]
        tick = time()
        # A word is kept only if it holds ALL of the required values.
        for required in set(contains):
            df = df.loc[df[col_name].str.contains(required, regex=False, na=False)]
        _log_filter_stats("contains", tick, rows_before, df.shape[0], init_rows)
        filters_crossed.append("contains")

    # FILTER 5/ By beginning of word
    if start_with is not None:
        if not isinstance(start_with, str):
            raise TypeError(f"""
            'start_with' must be a str type : 
            ({type(start_with)} given)
            """)

        prefix = start_with.capitalize()
        rows_before = df.shape[0]
        tick = time()
        df = df.loc[df[col_name].str.startswith(prefix, na=False)]
        _log_filter_stats("start_with", tick, rows_before, df.shape[0], init_rows)
        filters_crossed.append("start_with")

    # FILTER 6/ By letters position
    if nth_letters is not None:
        positions = _normalise_nth_letters(nth_letters)
        rows_before = df.shape[0]
        tick = time()
        for rank, letter in positions.items():
            # .str.get() yields a missing value for words shorter than `rank`,
            # which compares unequal, so no explicit length test is needed.
            df = df.loc[df[col_name].str.get(rank - 1) == letter]
        _log_filter_stats("nth_letters", tick, rows_before, df.shape[0], init_rows)
        filters_crossed.append("nth_letters")

    # FILTER 7/ By ending of word
    if end_with is not None:
        if not isinstance(end_with, str):
            raise TypeError(f"""
            'end_with' must be a str type : ({type(end_with)} given)
            """)

        rows_before = df.shape[0]
        tick = time()
        df = df.loc[df[col_name].str.endswith(end_with, na=False)]
        _log_filter_stats("end_with", tick, rows_before, df.shape[0], init_rows)
        filters_crossed.append("end_with")

    if logging.getLogger().isEnabledFor(logging.DEBUG):
        logging.debug(f"""
    -- FINAL STATS -- 
    Filters crossed = {len(filters_crossed)}/7 -> {filters_crossed}
    Total execution time : {round(time() - init_time, 3)}s
    Total rows deleted : {init_rows - df.shape[0]}
    From {init_rows} to {df.shape[0]} -> (-{percent(init_rows - df.shape[0], init_rows)}%)
    """)

    return df

In [80]:
# Example query on the "Mot" column: 7-letter words starting with "g",
# ending with "it", containing "a" and "r", and without "b".
multi_filter(
    df,
    col_name="Mot",
    length=7,
    start_with="g",
    end_with="it",
    contains=["a", "r"],
    not_contain=["b"],
)

Unnamed: 0,Mot,Définitions
354594,Gardait,['Du verbe garder.']
358040,Germait,['Du verbe germer.']
358324,Gertait,"[""Troisième personne du singulier de l'indicat..."
358410,Gerçait,['Du verbe gercer.']
360323,Givrait,['Du verbe givrer.']
364839,Gourait,['Du verbe gourer.']
366001,Gradait,"[""Troisième personne du singulier de l'indicat..."
367078,Grandit,['Du verbe grandir.']
368175,Gratuit,"[""Qu'on donne, sans y être tenu."", '…']"
368212,Gravait,['Du verbe graver.']
