In [None]:
import os
import numpy as np
import seaborn as sns
import pandas as pd

import random

from datetime import datetime, timedelta

from matplotlib import rcParams
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

from IPython.display import display

In [None]:
# Load the four project workbooks from the shared data folder (relative to the notebook).
data_path = os.path.join("..", "Data")
schedules_df, data_df, clients_df, transl_cost_pairs_df = (
    pd.read_excel(os.path.join(data_path, workbook_name))
    for workbook_name in (
        "Schedules.xlsx",               # weekly working hours per translator
        "Data.xlsx",                    # historical tasks
        "Clients.xlsx",                 # client preferences (wildcard, ...)
        "TranslatorsCost+Pairs.xlsx",   # hourly rate per translator and language pair
    )
)

In [None]:
# Constraint a client allows us to relax when no translator satisfies all of them.
# The labels must be spelled exactly like the ones the filtering and ranking code
# compares against ("Price", "Quality", "Deadline"); None means "no wildcard".
wildcards = [None, "Quality", "Deadline", "Price"]
# Task types and language pairs that actually occur in the historical data.
task_types = data_df["TASK_TYPE"].unique()
unique_language_pairs = data_df[["SOURCE_LANG", "TARGET_LANG"]].drop_duplicates().reset_index(drop=True)
# Minimum quality levels a generated task may request.
min_qualities = [0, 7, 7.5, 8]
# Project-manager codes a generated task may be assigned to.
pm = ['PMT', 'KMT', 'BMT', 'RMT']

In [None]:
class Task:
    """Plain record describing one translation task and, once known, its outcome."""

    def __init__(self, START, END, SELLING_HOURLY_PRICE, MIN_QUALITY, WILDCARD, TASK_TYPE, SOURCE_LANG, TARGET_LANG, MANUFACTURER, MANUFACTURER_SECTOR, 
                 MANUFACTURER_INDUSTRY_GROUP, MANUFACTURER_INDUSTRY, MANUFACTURER_SUBINDUSTRY, PM = None, PROJECT_ID = None, TASK_ID = None, TRANSLATOR=None, 
                 ASSIGNED=None, READY=None, WORKING=None, DELIVERED=None, RECEIVED=None, CLOSE=None, HOURS=None, 
                 HOURLY_RATE=None, COST=None, QUALITY_EVALUATION=None):
        """
        Store every field of a task.

        The positional parameters are the information known when the task is requested.
        The keyword parameters defaulting to None are the fields that are not given at
        the beginning (identifiers, assignment, lifecycle timestamps, effort and outcome).
        """
        # Identification (optional at creation time)
        self.PROJECT_ID, self.PM, self.TASK_ID = PROJECT_ID, PM, TASK_ID

        # Requested time window and kind of work
        self.START, self.END = START, END
        self.TASK_TYPE = TASK_TYPE
        self.SOURCE_LANG, self.TARGET_LANG = SOURCE_LANG, TARGET_LANG

        # Assignment and lifecycle timestamps (optional at creation time)
        self.TRANSLATOR = TRANSLATOR
        self.ASSIGNED, self.READY, self.WORKING = ASSIGNED, READY, WORKING
        self.DELIVERED, self.RECEIVED, self.CLOSE = DELIVERED, RECEIVED, CLOSE

        # Effort and money; only the selling price is required up front
        self.HOURS, self.HOURLY_RATE = HOURS, HOURLY_RATE
        self.SELLING_HOURLY_PRICE = SELLING_HOURLY_PRICE
        self.COST = COST
        self.QUALITY_EVALUATION = QUALITY_EVALUATION

        # Client and its industry classification
        self.MANUFACTURER = MANUFACTURER
        self.MANUFACTURER_SECTOR = MANUFACTURER_SECTOR
        self.MANUFACTURER_INDUSTRY_GROUP = MANUFACTURER_INDUSTRY_GROUP
        self.MANUFACTURER_INDUSTRY = MANUFACTURER_INDUSTRY
        self.MANUFACTURER_SUBINDUSTRY = MANUFACTURER_SUBINDUSTRY

        # Client requirements
        self.MIN_QUALITY = MIN_QUALITY
        # WILDCARD: if not every condition can be met, this is the one that may be skipped.
        self.WILDCARD = WILDCARD

    def __str__(self):
        """Return a multi-line, human-readable summary of the request-time fields."""
        summary_lines = [
            "Task Details:",
            f"  - Task ID: {self.TASK_ID}",
            f"  - Type: {self.TASK_TYPE}",
            f"  - Industry: {self.MANUFACTURER_INDUSTRY}",
            f"  - Start: {self.START}",
            f"  - End: {self.END}",
            f"  - Budget: {self.SELLING_HOURLY_PRICE}",
            f"  - Quality: {self.MIN_QUALITY}",
            f"  - Wildcard: {self.WILDCARD}",
            f"  - Source Language: {self.SOURCE_LANG}",
            f"  - Target Language: {self.TARGET_LANG}",
        ]
        return "\n".join(summary_lines)


def generate_task():
    """
    Build a random Task carrying every attribute that can be known when a task is requested.

    The manufacturer (and its industry classification) is borrowed from a random row of
    data_df, the wildcard comes from clients_df when the client is registered there, and
    the remaining fields are drawn at random from the module-level option lists.
    """
    # Borrow the manufacturer from a random historical task (ideally this would come from
    # clients_df, but then its sector/industry/sub-industry would have to be looked up).
    sampled_row = data_df.sample(n=1).iloc[0]
    manufacturer_name = sampled_row["MANUFACTURER"]

    # Registered clients keep their own wildcard; new clients get a random one.
    client_rows = clients_df[clients_df["CLIENT_NAME"] == manufacturer_name]
    if client_rows.empty:
        wildcard = random.choice(wildcards)
    else:
        wildcard = client_rows["WILDCARD"].values[0]

    # Only language pairs that exist in the historical data are offered.
    language_pair = unique_language_pairs.sample(n=1).iloc[0]

    # The task starts now.
    start = datetime.now()

    # Remaining random draws.
    project_manager = random.choice(pm)
    duration_hours = random.randint(1, 3)  # toy example: the task lasts 1 to 3 hours
    hourly_budget = int(np.random.normal(loc=26, scale=7))
    min_quality = random.choice(min_qualities)
    task_type = random.choice(task_types)

    return Task(
        PROJECT_ID="1",  # TODO: should increase by one for every new project
        PM=project_manager,
        TASK_ID=1,
        START=start,
        END=start + timedelta(hours=duration_hours),
        SELLING_HOURLY_PRICE=hourly_budget,
        MIN_QUALITY=min_quality,
        WILDCARD=wildcard,
        TASK_TYPE=task_type,
        SOURCE_LANG=language_pair["SOURCE_LANG"],
        TARGET_LANG=language_pair["TARGET_LANG"],
        MANUFACTURER=manufacturer_name,
        MANUFACTURER_SECTOR=sampled_row["MANUFACTURER_SECTOR"],
        MANUFACTURER_INDUSTRY_GROUP=sampled_row["MANUFACTURER_INDUSTRY_GROUP"],
        MANUFACTURER_INDUSTRY=sampled_row["MANUFACTURER_INDUSTRY"],
        MANUFACTURER_SUBINDUSTRY=sampled_row["MANUFACTURER_SUBINDUSTRY"],
    )



### Explanation of what is happening

In the global scope:

0. We use as base of useful features the dataframe `transl_cost_pairs_df`

1. We compute the **average proportional delay** (speed) for each translator and merge it with this base dataframe


Now, for each task:

2. We start with the strict filters:
    - Filter by **languages**: only consider the translators who offer this language pair
    - Filter by **price**: only consider the translators whose hourly rate does not exceed the task's budget
    - Filter by **quality** per language pair: we average each translator's past quality for this language pair and require that average to reach the task's minimum quality. 
    - Filter by **availability**: if the task lasts about a week or less, we check whether the translator works at any point before the theoretical deadline (the limit is 7 days because everyone works at least once a week)

3. We do a **weighted KNN**:
    - We query it with the *perfect point* (price = 0, quality = 10, speed = 100%, experience = 10... (indicative values))
    - The weights are chosen from the wildcards and from common sense (it may sound controversial or naive, but I can explain why we consider this).
        After normalizing, a weighted KNN stretches or shrinks each chosen axis to give that feature more or less weight. 
    - The similarity score is the final ranking. It is computed with the Euclidean distance, but because of these distortions it is no longer a plain Euclidean distance in the original feature space

4. Outcome possibilities: ***(work in progress)***
    - We get no translators or too few: if we get none, we use the wildcard to completely ignore that factor in the strict filter
    - We get a lot of translators: (we need the rest of the features to give better recommendations)

PS: some metrics can be improved; the quality is computed for each task (and probably more features should be). Also, there are 3 mistakes in the datasets, which are pointed out at the bottom
    

In [None]:
def compute_speed():
    """
    Compute the speed/delay_percentage of each translator based on the average time taken to complete tasks.
    The delay of a task is the extra time taken past the deadline, expressed as a percentage of the
    planned duration (END - START); negative values mean the task was delivered early.

    Side effect: adds/overwrites the 'DELAY_PERCENTAGE' column of the module-level data_df.

    Args:
        None

    Returns:
        translators_attributes_df (pd.DataFrame): transl_cost_pairs_df with one extra column,
            'AVG_DELAY_PERCENTAGE' (NaN for translators with no task in data_df).
    """
    # Delay of each task relative to its planned duration
    planned_duration = data_df['END'] - data_df['START']
    lateness = data_df['DELIVERED'] - data_df['END']
    delay_percentage = (lateness / planned_duration) * 100

    # A zero planned duration (START == END) yields +inf, -inf or NaN depending on the sign of the
    # lateness. All three are treated as "no measurable delay"; leaving -inf in place would turn
    # the translator's whole average into -inf.
    delay_percentage = delay_percentage.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Limit the delay percentage to 100%
    data_df['DELAY_PERCENTAGE'] = delay_percentage.clip(upper=100)

    # Mean delay percentage per translator, under a clearer column name
    avg_delay_by_translator = (
        data_df.groupby('TRANSLATOR')['DELAY_PERCENTAGE']
        .mean()
        .round(2)
        .reset_index()
        .rename(columns={'DELAY_PERCENTAGE': 'AVG_DELAY_PERCENTAGE'})
    )

    # Merge the average delay with the translator cost pairs DataFrame
    translators_attributes_df = transl_cost_pairs_df.merge(avg_delay_by_translator, on='TRANSLATOR', how='left')

    # TODO: a translator who was assigned the task late, or who starts working later because of
    # their schedule, carries that delay here although it is not their fault.
    # The number of completed tasks should also matter: with few tasks the average may be an outlier.

    return translators_attributes_df



def compute_quality_by_languages(df_filtered):
    """
    Attach an 'AVERAGE_QUALITY' column to the candidate translators.

    The value is, in order of preference:
      1. the translator's mean QUALITY_EVALUATION in data_df for the row's language pair;
      2. the translator's mean quality over all their tasks, minus 1 (penalty for having no
         record in this language pair);
      3. the mean quality of the whole data_df, minus 1.5 (translator with no record at all).

    Args:
        df_filtered (pd.DataFrame):
            Candidate translators (already filtered by price & language).

    Returns:
        df_filtered (pd.DataFrame):
            A new DataFrame with the additional 'AVERAGE_QUALITY' column.
    """
    pair_keys = ['TRANSLATOR', 'SOURCE_LANG', 'TARGET_LANG']

    # Historical tasks of the candidates only
    candidate_names = df_filtered['TRANSLATOR'].unique()
    candidate_history = data_df[data_df['TRANSLATOR'].isin(candidate_names)]

    # 1) Mean quality per translator and language pair
    pair_quality = candidate_history.groupby(pair_keys)['QUALITY_EVALUATION'].mean().round(2)
    pair_quality = pair_quality.reset_index().rename(columns={'QUALITY_EVALUATION': 'AVERAGE_QUALITY'})
    df_filtered = df_filtered.merge(pair_quality, on=pair_keys, how='left')

    # 2) No record for this language pair: fall back to the translator's overall mean, penalized.
    #    (Maybe we should also flag that they had no experience in these languages.)
    lacks_pair_history = df_filtered['AVERAGE_QUALITY'].isna()
    inexperienced_names = df_filtered.loc[lacks_pair_history, 'TRANSLATOR'].unique()
    inexperienced_history = data_df[data_df['TRANSLATOR'].isin(inexperienced_names)]
    overall_quality = inexperienced_history.groupby('TRANSLATOR')['QUALITY_EVALUATION'].mean().round(2) - 1
    df_filtered.loc[lacks_pair_history, 'AVERAGE_QUALITY'] = (
        df_filtered.loc[lacks_pair_history, 'TRANSLATOR'].map(overall_quality)
    )

    # 3) No record of any task at all: global mean with a stronger penalty (?)
    global_fallback = data_df['QUALITY_EVALUATION'].mean() - 1.5
    df_filtered['AVERAGE_QUALITY'] = df_filtered['AVERAGE_QUALITY'].fillna(global_fallback)
    return df_filtered


# We only look at whether there is any moment inside the expected timeline (START - END) in which
# the translator works: since we have no idea how long the work itself will take, we simplify.
def available_translators(task, df_filtered):
    """
    Check the availability of translators for a given task based on their schedules.
    If a translator works during any part of the task's window, they are considered available.

    Args:
        task (Task object):
            The task for which we want to check the availability of translators. We use it to know the start and theoretical end of the task.
        df_filtered: DataFrame
            DataFrame containing the filtered translators' attributes of price, language, and quality.

    Returns:
        mask (List[bool]):
            One boolean per row of df_filtered, in the same order as df_filtered["TRANSLATOR"].
            A translator with no row in schedules_df is reported as not available, because
            nothing shows that they work inside the window.
    """
    weekday_map = {
        0: 'MON',
        1: 'TUES',
        2: 'WED',
        3: 'THURS',
        4: 'FRI',
        5: 'SAT',
        6: 'SUN'
    }

    task_start = task.START
    task_end = task.END

    # If the duration of the task is more than 7 days, there are no availability restrictions (everyone works at least once a week)
    if (task_end - task_start).days > 7:
        return [True] * len(df_filtered) # All translators are available

    # Restrict schedules_df to the candidates: it may contain translators that are not in df_filtered
    candidate_names = df_filtered["TRANSLATOR"].tolist()
    filtered_schedules = schedules_df[schedules_df["NAME"].isin(candidate_names)]

    # Availability is collected per translator name, not per schedule row, because schedules_df
    # is not guaranteed to list the candidates in the same order (or the same number of times)
    # as df_filtered.
    availability_by_name = {}

    for _, row in filtered_schedules.iterrows():
        name = row["NAME"]
        if availability_by_name.get(name):
            continue  # another schedule row already showed this translator is available

        translator_available = False
        current = task_start

        while current <= task_end:
            day_col = weekday_map[current.weekday()]  # 'MON', 'TUES', etc.

            if row[day_col] == 1:
                # Parse START and END as datetime.time if needed
                if isinstance(row['START'], str):
                    start_time = datetime.strptime(row['START'], "%H:%M:%S").time()
                    end_time = datetime.strptime(row['END'], "%H:%M:%S").time()
                else:
                    start_time = row['START']
                    end_time = row['END']

                translator_start = datetime.combine(current.date(), start_time)
                translator_end = datetime.combine(current.date(), end_time)

                # Does the working shift of this day overlap with the task window?
                if translator_end > task_start and translator_start < task_end:
                    translator_available = True
                    break  # one overlap is enough

            # Move on to midnight of the next day
            current += timedelta(days=1)
            current = current.replace(hour=0, minute=0, second=0, microsecond=0)

        availability_by_name[name] = translator_available

    # Emit the mask in df_filtered order so callers can zip it with df_filtered rows
    return [availability_by_name.get(name, False) for name in candidate_names]


def filter_language_price_quality_availability(translators_attributes_df, task = Task, need_wildcard = False):
    """
    Filters the translators by language pair, price, quality, and availability.
    The language-pair filter is always applied. If need_wildcard is True, the single filter
    named by task.WILDCARD ("Price", "Quality" or "Deadline") is skipped; any other wildcard
    value (e.g. None) skips nothing.

    Args:
        translators_attributes_df (pd.DataFrame): 
            DataFrame containing the translators' attributes (name, languages, price, speed).
        task (Task object): 
            The task for which we want to filter the translators.
        need_wildcard (bool): 
            If True, skip the filter corresponding to the wildcard.
            
    Returns:
        pd.DataFrame: 
            Filtered DataFrame containing translators who meet the criteria. It carries an
            'AVERAGE_QUALITY' column unless the quality filter was skipped.
    """
    # Name of the filter to skip, or None when every filter is mandatory
    relaxed = task.WILDCARD if need_wildcard else None

    # Language pair: never relaxed.
    # TODO: consider also offering close variants (Iberian vs. Latin American Spanish, UK vs. US English)?
    keep = (
        (translators_attributes_df['SOURCE_LANG'] == task.SOURCE_LANG)
        & (translators_attributes_df['TARGET_LANG'] == task.TARGET_LANG)
    )

    # Price: the translator must not cost more than the selling price.
    # The skip is decided with a plain `if` so it can only ever drop the price condition.
    if relaxed != "Price":
        keep = keep & (translators_attributes_df['HOURLY_RATE'] <= task.SELLING_HOURLY_PRICE)

    df_filtered = translators_attributes_df[keep].copy()

    # Quality: add the average quality column and apply the task's minimum
    if relaxed != "Quality":
        df_filtered = compute_quality_by_languages(df_filtered)
        df_filtered = df_filtered[df_filtered['AVERAGE_QUALITY'] >= task.MIN_QUALITY]

    # Availability: keep the translators who work at some point inside the task window
    if relaxed != "Deadline":
        availability_mask = available_translators(task, df_filtered)
        valid_translators = [name for name, available in zip(df_filtered["TRANSLATOR"], availability_mask) if available]
        df_filtered = df_filtered[df_filtered["TRANSLATOR"].isin(valid_translators)]

    return df_filtered
    



In [None]:
def do_knn(df_filtered, task = Task, need_wildcard = False):
    """
    Performs K-Nearest Neighbors (KNN) to rank the candidate translators by their distance to an
    ideal translator (price 0, quality 10, delay -100%).

    Args:
        df_filtered (pd.DataFrame): 
            DataFrame containing the filtered translators' attributes (name, language, price, quality, speed).
        task (Task object): 
            We only need it to know the wildcard
        need_wildcard (bool): 
            If True, the dimension named by task.WILDCARD ("Price", "Quality" or "Deadline") is
            removed from the KNN. Any other wildcard value removes nothing.
            If False, that dimension is kept but its weight is multiplied by 0.25.
            
    Returns:
        distances (np.ndarray): 
            Shape (1, k): distances of the nearest neighbors to the ideal point, k <= 5.
        indexes (np.ndarray): 
            Shape (1, k): row positions of those neighbors in df_filtered (for use with .iloc).
            Both arrays have k == 0 when df_filtered is empty.
    """
    # One entry per numeric feature: (wildcard label, base weight, ideal value).
    # The dict order fixes the column order used through the whole process.
    # Weights are a rule of thumb; the smaller the weight, the less important the feature.
    # Quality weighs most because clients_df["WILDCARD"].value_counts() shows a trend towards
    # quality; in the end the best translator depends on the client's preferences.
    feature_specs = {
        'HOURLY_RATE': ("Price", 1.0, 0.0),
        'AVERAGE_QUALITY': ("Quality", 1.5, 10.0),
        'AVG_DELAY_PERCENTAGE': ("Deadline", 0.25, -100.0),
    }
    wildcard = task.WILDCARD

    # Features, weights and ideal point are all derived from this list so they always agree
    # in length and order.
    if need_wildcard:
        features = [name for name, (label, _, _) in feature_specs.items() if label != wildcard]
    else:
        features = list(feature_specs)

    # Nothing to rank
    if len(df_filtered) == 0:
        return np.empty((1, 0)), np.empty((1, 0), dtype=int)

    # Translators with no history have a NaN delay (left merge); NearestNeighbors rejects NaN,
    # so use the candidates' mean (or 0 when nobody has a value) as a neutral substitute.
    X = df_filtered[features].astype(float)
    X = X.fillna(X.mean()).fillna(0.0)

    # Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    weights = np.array([feature_specs[name][1] for name in features], dtype=float)
    if not need_wildcard:
        # The wildcard also tells us about the client's preferences: the factor they are
        # willing to give up matters less for the ranking.
        for position, name in enumerate(features):
            if feature_specs[name][0] == wildcard:
                weights[position] *= 0.25

    # Apply weights to the features
    X_weighted = X_scaled * weights

    # Train. At most 5 neighbors are returned, and never more than the number of candidates.
    # If the client wants to see more, the first ones can be discarded and the KNN run again.
    knn = NearestNeighbors(n_neighbors=min(5, len(X)), metric='euclidean')  # TODO: maybe other metrics
    knn.fit(X_weighted)

    # Ideal outcome, as a DataFrame with the same columns so the scaler accepts it
    task_df = pd.DataFrame([[feature_specs[name][2] for name in features]], columns=features)
    task_scaled = scaler.transform(task_df)
    task_weighted = task_scaled * weights  # Weight the task too

    # Find nearest neighbours
    distances, indexes = knn.kneighbors(task_weighted)

    return distances, indexes



def get_best_translators(df_filtered, indexes, distances):
    """
    Turn the KNN output into a ranked table of translators.

    Args:
        df_filtered (pd.DataFrame): 
            The candidates the KNN was fitted on (name, language, price, quality, speed).
        indexes (np.ndarray): 
            Row positions of the nearest neighbors in df_filtered; only the first row is used.
        distances (np.ndarray): 
            Distances of those neighbors; only the first row is used.
            
    Returns:
        pd.DataFrame: 
            The selected rows of df_filtered plus a 'Similarity Score' column (distance rounded
            to 2 decimals), sorted so the closest match comes first.
    """
    neighbour_positions = indexes[0]
    neighbour_scores = distances[0].round(2)

    ranked = df_filtered.iloc[neighbour_positions].assign(**{'Similarity Score': neighbour_scores})
    return ranked.sort_values(by='Similarity Score', ascending=True)


In [None]:
# Creates a dataframe like transl_cost_pairs_df but with information about the delay
translators_attributes_df = compute_speed()

# Example of generating a task
new_task = generate_task()
tasks = []
tasks.append(new_task)

for task in tasks:
    need_wildcard = False
    print("New task:")
    print(task)
    # TODO: is this inefficient?
    # Every call below uses the loop variable `task`, so each task is filtered on its own
    # requirements when `tasks` holds more than one.
    df_filtered = filter_language_price_quality_availability(translators_attributes_df, task)
    
    if df_filtered.empty:
        print("\nNo available translators. Possible reasons:")
        print("1. No translators available because the deadline is too short")
        print("2. No translators available because the quality is too high")
        print("3. No translators available because the price is too low")
        print("PD: the possible selection of languages to translate is assured to exist in the generate_task function")
        print("\nTrying with the wildcard...\n")        
        need_wildcard = True
        df_filtered = filter_language_price_quality_availability(translators_attributes_df, task, need_wildcard = need_wildcard)
        if df_filtered.empty:
            print("No available translators even with the wildcard")
            # TODO: implement; find out inside the function which filter is the strict one to know the remedy.
            # Offer translators ignoring quality, price and deadline. If there are still none, offer
            # similar languages (Spanish vs. Argentinian Spanish, UK English vs. US English, etc.)
            continue
        
    
    # Compute the KNN
    distances, indexes = do_knn(df_filtered, task, need_wildcard = need_wildcard)

    # Get the dataframe with the best translators
    best_translators_df = get_best_translators(df_filtered, indexes, distances)
    display(best_translators_df)

    
    
# TODO: what I am thinking next
# availability: maybe we should take into account how many hours per week the translator works (if the task lasts 7+ days?); it would count as speed

# client-translator relationship
# experience in the language pair:    number of tasks or hours translated for this language pair
# speed:    hours worked per week, or how quickly tasks are delivered
# experience in the sector
# experience in the task type
# whether the quality expectations were met?
# add an option for the client's preferences to use in the weights (quality/price ratio...)? A large budget may mean the client wants the best of the other characteristics
# same with quality: the client may ask for little so that the price stays low

### There are 3 types of mistakes in the dataset; they are listed in the 3 cells below, each with its solution

In [None]:
# # hay 13 que tiene el horario de salida (END) con un valor y tipo incorrecto
# # END de schedules que no son datetime.time
# # Ver filas donde END no es datetime.time
# invalid_rows = schedules_df.loc[
#     ~schedules_df["END"].apply(lambda x: isinstance(x, datetime.time) if isinstance(x, datetime.time) or isinstance(x, str) else False)
# ]

# display((invalid_rows))

# # Substituimos este valor por los que tenemos registrados en data_df
# # Extraer la hora del dataframe histórico data_df y asignarla a las filas incorrectas
# for index, row in invalid_rows.iterrows():
#     # Buscar el nombre en data_df
#     historical_entry = data_df[data_df["TRANSLATOR"] == row["NAME"]]

#     if not historical_entry.empty:
#         # Extraer solo la hora de START ignorando la fecha
#         correct_time = historical_entry.iloc[0]["END"].time()

#         # Reemplazar el valor incorrecto en schedules_df
#         schedules_df.at[index, "END"] = correct_time

# # FIXED
# # GUARDAR EL NUEVO EXCEL
# # schedules_df.to_excel(os.path.join(data_path, "Schedules_fixed.xlsx"), index=False)

In [None]:
# hay algunos con fecha de START == END y si START > ASSIGNED (son errores porque no tiene sentido); pero si START <= ASSIGNED, 
# probablemente signifique que END (deadline) era urgente y no puso fecha límite, no son errores


# provisionalmente cambiamos el START como ASSIGNED para estos casos
# data_df.loc[data_df['START'] >= data_df['ASSIGNED'], 'START'] = data_df['ASSIGNED'] 

In [None]:
# hay 1 fecha irreal (año 1201)
# def detectar_fechas_fuera_de_rango(col):
#     fechas_parsed = pd.to_datetime(data_df[col], errors='coerce', dayfirst=True)
#     fechas_originales = data_df[col]
#     fechas_invalidas = fechas_originales[fechas_parsed.isna()]
#     return fechas_invalidas

# # Aplicarlo a tus columnas:
# fechas_start_invalidas = detectar_fechas_fuera_de_rango('START')
# fechas_end_invalidas = detectar_fechas_fuera_de_rango('END')
# fechas_delivered_invalidas = detectar_fechas_fuera_de_rango('DELIVERED')

# # Mostrarlas
# print("Fechas START inválidas:\n", fechas_start_invalidas)
# print("Fechas END inválidas:\n", fechas_end_invalidas)
# print("Fechas DELIVERED inválidas:\n", fechas_delivered_invalidas)

# # Suponiendo que la fecha problemática está en la columna 'START'
# data_df['START'] = data_df['START'].replace("29/05/1201 18:00:00", "29/04/2012 18:00:00")

# # Ahora convierte la columna de nuevo
# data_df['START'] = pd.to_datetime(data_df['START'], errors='coerce', dayfirst=True)

# print(data_df[data_df['START'] == '29/04/2012 18:00:00'])
#check