In [1]:
import os
import numpy as np
import seaborn as sns
import pandas as pd

import random

from datetime import datetime, timedelta

from matplotlib import rcParams
import matplotlib.pyplot as plt

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from IPython.display import display

### Environmental Configurations

In [2]:
data_path = os.path.join("..", "Data")
schedules_df = pd.read_excel(os.path.join(data_path, "Schedules.xlsx"))
data_df = pd.read_excel(os.path.join(data_path, "Data.xlsx"))
clients_df = pd.read_excel(os.path.join(data_path, "Clients.xlsx"))
transl_cost_pairs_df = pd.read_excel(os.path.join(data_path, "TranslatorsCost+Pairs.xlsx"))

### Envirinmental Variables

In [3]:
RANDOM_SEED = 42

TRANSLATORS_UNAVAILABLE = []

wildcards = [None, "Quality", "Time", "Cost"]
task_types = data_df["TASK_TYPE"].unique()
unique_language_pairs = data_df[["SOURCE_LANG", "TARGET_LANG"]].drop_duplicates().reset_index(drop=True)
min_qualities = [0, 7, 7.5, 8]
pm = ['PMT', 'KMT', 'BMT', 'RMT']

> **Note:** `TRANSLATORS_UNAVAILABLE` is a list for keeping track of translators alredy assigned or performing a task

In [4]:
## TODO we have to think what to do with translators that can perform multiple tasks

#### Task Information
##### General Info
- **PROJECT_ID**: Project code (additional info, likely not necessary).
- **PM**: Responsible management team.
- **TASK_ID**: Task code.
##### Dates
- **START**: Task start date.
- **END**: Theoretical task delivery date (can be compared with `DELIVERED` to check for delays).
##### Task Type (`TASK_TYPE`)
Some considerations must be taken into account:
- **DTP**: Desktop-Publishing tasks.
- **Engineering**: Engineering tasks such as file conversions, coding, etc.
- **LanguageLead**: Linguistic management tasks. Assigned to highly experienced and quality-oriented individuals who regularly work on the project.
- **Management**: General management tasks.
- **Miscellaneous**: Various linguistic tasks.
- **PostEditing**: Post-editing tasks. Similar to Translation tasks but with slightly different skills required for the TRANSLATOR.
- **ProofReading**: Full review of a Translation or PostEditing. Always follows a Translation or PostEditing. The TRANSLATOR assigned must have more 
experience than the person who performed the initial step.
- **Spotcheck**: Partial review of a Translation or PostEditing. Similar conditions as ProofReading.
- **TEST**: Test required to qualify for working with a client. Should be assigned to the most experienced and high-quality TRANSLATOR 
for the client or topic, regardless of price but considering the deadline.
- **Training**: Translator experience and quality are not considered.
- **Translation**: Translation task. The translator’s quality can be slightly lower if the ProofReading (not Spotcheck) is done by a superior. If 
Spotcheck is done, the required quality must be met.
##### Languages
- **SOURCE_LANG**: Source language.
- **TARGET_LANG**: Target language.
##### Workflow 
- **TRANSLATOR**: Translator responsible for the task.
- **ASSIGNED**: Time when the task is assigned (pre-notice) to the TRANSLATOR.
- **READY**: Time when the TRANSLATOR is notified they can start.
- **WORKING**: Time when the TRANSLATOR starts the task.
- **DELIVERED**: Time when the TRANSLATOR delivers the task.
- **RECEIVED**: Time when the PM receives the task.
- **CLOSE**: Time when the PM marks the task as completed.
##### Cost & Quality
- **FORECAST**: Estimated hours for completion.
- **HOURLY_RATE**: Task hourly rate.
- **COST**: Total task cost.
- **QUALITY_EVALUATION**: Quality control evaluation.
##### Client Info
- **MANUFACTURER**: Client.
- **MANUFACTURER_SECTOR**: Level 1 client categorization.
- **MANUFACTURER_INDUSTRY_GROUP**: Level 2 client categorization.
- **MANUFACTURER_INDUSTRY**: Level 3 client categorization.
- **MANUFACTURER_SUBINDUSTRY**: Level 4 client categorization.

In [5]:
class Task:
    def __init__(self, PROJECT_ID, TASK_ID, ASSIGNED, END, SELLING_HOURLY_PRICE, MIN_QUALITY, WILDCARD, TASK_TYPE, SOURCE_LANG, TARGET_LANG, MANUFACTURER, MANUFACTURER_SECTOR, 
                 MANUFACTURER_INDUSTRY_GROUP, MANUFACTURER_INDUSTRY, MANUFACTURER_SUBINDUSTRY, START=None, PM=None, TRANSLATOR=None, READY=None, WORKING=None, DELIVERED=None, 
                 RECEIVED=None, CLOSE=None, FORECAST=None, HOURLY_RATE=None, COST=None, QUALITY_EVALUATION=None):
        """
        A class used to represent a Task. 
        The arguments initialized to None are the information that is not given at the beginning
        """
        self.ASSIGNED = ASSIGNED
        self.TASK_TYPE = TASK_TYPE
        self.SOURCE_LANG = SOURCE_LANG
        self.TARGET_LANG = TARGET_LANG
        self.MANUFACTURER = MANUFACTURER
        self.MANUFACTURER_SECTOR = MANUFACTURER_SECTOR
        self.MANUFACTURER_INDUSTRY_GROUP = MANUFACTURER_INDUSTRY_GROUP
        self.MANUFACTURER_INDUSTRY = MANUFACTURER_INDUSTRY
        self.MANUFACTURER_SUBINDUSTRY = MANUFACTURER_SUBINDUSTRY
        self.MIN_QUALITY = MIN_QUALITY
        self.WILDCARD = WILDCARD 
        self.SELLING_HOURLY_PRICE = SELLING_HOURLY_PRICE

        self.END = END # not given
        self.PROJECT_ID = PROJECT_ID # not given
        self.START = START #not given
        self.PM = PM # not given
        self.TASK_ID = TASK_ID # not given
        self.TRANSLATOR = TRANSLATOR # not given
        self.READY = READY # not given
        self.WORKING = WORKING # not given
        self.DELIVERED = DELIVERED # not given
        self.RECEIVED = RECEIVED # not given
        self.CLOSE = CLOSE # not given
        self.FORECAST = FORECAST # not given
        self.HOURLY_RATE = HOURLY_RATE # not given
        self.COST = COST # not given
        self.QUALITY_EVALUATION = QUALITY_EVALUATION  # not given
        
    
    def __str__(self):
        return (
            f"Task Details:\n"
            f"  - Task ID: {self.TASK_ID}\n"
            f"  - Type: {self.TASK_TYPE}\n"
            f"  - Sector: {self.MANUFACTURER_SECTOR}\n"
            f"  - Industry (Subsector): {self.MANUFACTURER_INDUSTRY}\n"
            f"  - Start: {self.START}\n"
            f"  - Budget: {self.SELLING_HOURLY_PRICE}\n"
            f"  - Quality: {self.MIN_QUALITY}\n"
            f"  - Wildcard: {self.WILDCARD}\n"
            f"  - Source Language: {self.SOURCE_LANG}\n"
            f"  - Target Language: {self.TARGET_LANG}"
        )

### Split Data

In [6]:
# Split into train and validation (e.g., 80% train, 20% validation)
train_df, validation_df = train_test_split(data_df, test_size=0.2, random_state=42)

In [7]:
def drop_and_save_translator_labels(df, translator_column="TRANSLATOR"):
    """
    Extracts and removes translators from test_df, then saves them as a label dict.
    
    Args:
        df (pd.DataFrame): The dataframe containing translator data.
        translator_column (str): The column name that holds the translator labels.
        
    Returns:
        pd.DataFrame: The test_df without the translators column.
        dict: Dictionary of translator labels {index: translators}
    """
    if translator_column not in df.columns:
        raise ValueError(f"Column '{translator_column}' not found in test_df.")
    
    # Extract labels
    translator_labels = df[translator_column].to_dict()
    
    # Drop the column from the DataFrame
    df = df.drop(columns=[translator_column])
    
    return df, translator_labels

train_df_clean, train_translator_labels = drop_and_save_translator_labels(train_df)
validation_df_clean, validation_translator_labels = drop_and_save_translator_labels(validation_df)

In [8]:
train_df_clean.head()

Unnamed: 0,PROJECT_ID,PM,TASK_ID,START,END,TASK_TYPE,SOURCE_LANG,TARGET_LANG,ASSIGNED,READY,...,CLOSE,FORECAST,HOURLY_RATE,COST,QUALITY_EVALUATION,MANUFACTURER,MANUFACTURER_SECTOR,MANUFACTURER_INDUSTRY_GROUP,MANUFACTURER_INDUSTRY,MANUFACTURER_SUBINDUSTRY
308387,217493,RMT,10749045,2019-06-20 13:36:00,2019-06-20 15:15:00,Translation,English,Catalan,2019-06-20 13:57:56,2019-06-20 14:02:01,...,2019-06-20 15:38:59,0.49,17,8.33,8,TrueConnect,Communication Services,Interactive Media & Services,Internet Services & Infrastructure,Internet Services & Infrastructure
445182,219796,PMT,11017600,2021-06-21 09:51:00,2021-06-25 17:00:00,Translation,English,Spanish (Iberian),2021-06-22 09:45:13,2021-06-22 10:14:59,...,2021-06-25 10:45:13,5.44,15,81.6,7,MotorForge,Consumer Discretionary,Automobiles & Components,Automobiles,Automobile Manufacturers
243409,216356,RMT,10608215,2018-03-21 09:56:00,2018-03-22 12:00:00,Translation,English,Spanish (Iberian),2018-03-21 10:12:19,2018-03-21 10:12:28,...,2018-03-22 12:43:01,0.5,17,8.5,7,TrueConnect,Communication Services,Interactive Media & Services,Internet Services & Infrastructure,Internet Services & Infrastructure
206327,215492,KMT,10530327,2017-05-25 16:00:00,2017-05-29 14:00:00,ProofReading,English,Spanish (Iberian),2017-05-24 13:13:35,2017-05-25 13:17:42,...,2017-05-29 15:38:40,0.36,15,5.4,8,Mercury Rail,Industrials,Industrial Conglomerates,Industrial Conglomerates,Industrial Conglomerates
318763,218225,KMT,10773972,2019-08-28 16:41:00,2019-08-29 10:00:00,Miscellaneous,English,Spanish (Iberian),2019-08-28 16:41:32,2019-08-28 16:43:49,...,2019-08-29 09:12:36,0.25,15,3.75,6,VitalSign Innovations,Health Care,Health Care Equipment & Supplies,Health Care Equipment & Supplies,Health Care Equipment & Supplies


In [9]:
for i, (key, value) in enumerate(train_translator_labels.items()):
    print(f"Task Index: {key}, Translator: {value}")
    if i == 10:
        break

Task Index: 308387, Translator: Daiana Rosario
Task Index: 445182, Translator: Laurina Santiago
Task Index: 243409, Translator: Nieves Leocadia
Task Index: 206327, Translator: Almudena Fiamma
Task Index: 318763, Translator: Abelardo
Task Index: 256003, Translator: Connor
Task Index: 532685, Translator: Artur Fulgencio
Task Index: 514381, Translator: Sussana
Task Index: 437780, Translator: Guillermo
Task Index: 61640, Translator: Nieves Leocadia
Task Index: 404833, Translator: Casiano


### Explanation of what is happening

In the global scope:

0. We use as base of useful features the dataframe `transl_cost_pairs_df`
It includes:
- TRANSLATOR: Translator name.
- SOURCE_LANG: Source language.
- TARGET_LANG: Target language.
- HOURLY_RATE: Cost per hour.


1. We compute the **average proportional delay** (speed) from the `data_df` for each translator and merge it with this base dataframe


Now for the each task:

2. We start to the strict filter:
    - Filter of **languages**: only consider the translators who offer this translation
    - Filter of **price**: only consider the prices below the threshold
    - Filter of **quality** by language: this is done by making an average of the quality of these languages for each translator, and then using it as a threshold. 
    - Filter of **availability**: if the taks lasts less than 7 days we check whether the translator will even work before the theoretical deadline (it is 7 days because everyone works at least once a week)

3. We do a **weighted knn**:
    - We do it on the *perfect point* (price = 0, quality = 10, speed = 100%, experience = 10... (orientative values))
    - The weights are chosen by the wildcards and by the expereince required for the type of task
        A weighted knn, after normalizing, it distorts the chosen axis size to give more or less weight. 
    - The similarity score is the final ranking. 

4. Outcome possibilities: ***(work in progress)***
    - We get None or too few translators: if it is None we use the wildcard to completely ignore that factor in the strict filter
    - We get a lot of translators: (we need the rest of features to give better recommendations)

    

In [10]:
# ----- THESE ARE GENERAL FUNCTIONS THAT CAN BE USED TO ACTUALIZE THE DATA IF NEW TASKS WERE ADDED TO THE DATASET -----
def compute_delay_percentage(data_df):
    """
    Compute the delay_percentage of each translator based on the average time taken to complete tasks. 
    This is the time overrun percentage: Negative values mean the task was early, positives mean it was late. 
    
    Args:
        data_df (pd.DataFrame):
            DataFrame containing the data of the tasks (to actualize data).
        
    Returns:
        translators_attributes_df (pd.DataFrame) with the delay_percentage.
    """
    date_cols = ['START', 'END', 'DELIVERED']
    for col in date_cols:
        data_df[col] = pd.to_datetime(data_df[col], errors='coerce')  # convert str a Timestamps
        
    # Compute the time overrun percentage of each task
    # Avoid division by zero
    duration = data_df['END'] - data_df['START']
    duration = duration.replace(pd.Timedelta(0), pd.NaT)  # Avoid zero duration
    data_df['DELAY_PERCENTAGE'] = ((data_df['DELIVERED'] - data_df['END']) / (duration)) * 100

    data_df['DELAY_PERCENTAGE'] = data_df['DELAY_PERCENTAGE'].replace([np.inf, np.nan], 0)  # Replace inf and NaN with 0

    # Limit the delay percentage to 100%
    data_df['DELAY_PERCENTAGE'] = data_df['DELAY_PERCENTAGE'].clip(upper=100)  # Limit to 100% -> double the time predicted

    # Compute the mean delay percentage for each translator
    avg_delay_by_translator = data_df.groupby('TRANSLATOR')['DELAY_PERCENTAGE'].mean().reset_index()
    avg_delay_by_translator['DELAY_PERCENTAGE'] = avg_delay_by_translator['DELAY_PERCENTAGE'].round(2)

    # Rename the column for clarity
    avg_delay_by_translator.rename(columns={'DELAY_PERCENTAGE': 'AVG_DELAY_PERCENTAGE'}, inplace=True)

    # Merge the average delay with the translator cost pairs DataFrame
    translators_attributes_df = transl_cost_pairs_df.merge(avg_delay_by_translator, on='TRANSLATOR', how='left')

    #TAKE INTO ACCOUNT: Task count is added to the dataset to help judge reliability (e.g if the quality of translator is calculated using a few tasks). 
    return translators_attributes_df

In [11]:
# ----- THESE ARE FUNCTIONS TO CALCULATE QUALITY AND EXPERIENCE ONCE MADE THE GENERAL FILTERING -----
def compute_number_tasks(data_df, df_filtered):
    """
    Computes the number of tasks for each translator.
    
    Args:
        data_df (pd.DataFrame): 
            DataFrame containing the data of the tasks.
        df_filtered (pd.DataFrame): 
            DataFrame containing the filtered translators' attributes.

    Returns:
        pd.DataFrame: 
            The filtered DataFrame (`df_filtered`) with the number of tasks performed for each translator.
    """
    # Count the number of tasks each translator has done
    task_counts = data_df.groupby('TRANSLATOR').size().reset_index(name='NUM_TASKS')

    # Merge the task counts into the filtered dataframe
    df_filtered = df_filtered.merge(task_counts, on='TRANSLATOR', how='left')

    # Fill missing values (i.e., translators with no tasks) with 0
    df_filtered['NUM_TASKS'] = df_filtered['NUM_TASKS'].fillna(0).astype(int)

    return df_filtered


def compute_quality_by_languages(df_filtered, source_lang, target_lang):
    """
    Computes average quality for a given language pair (source_lang → target_lang).
    
    Args:
        df_filtered (pd.DataFrame): Filtered translators.
        source_lang (str): Source language.
        target_lang (str): Target language.
    
    Returns:
        pd.DataFrame: Same df_filtered with new 'AVG_QUALITY_BY_LG' column.
    """
    # Filter original dataframe by the language pair and translators in df_filtered
    mask_lang_pair = (
        (data_df['SOURCE_LANG'] == source_lang) &
        (data_df['TARGET_LANG'] == target_lang) &
        (data_df['TRANSLATOR'].isin(df_filtered['TRANSLATOR']))
    )

    # Compute the average quality for each translator in the filtered dataframe
    avg_quality = (
        data_df[mask_lang_pair]
        .groupby('TRANSLATOR')['QUALITY_EVALUATION']
        .mean()
        .round(2)
    )

    # Assing the average quality to the filtered df
    df_filtered['AVG_QUALITY_BY_LG'] = df_filtered['TRANSLATOR'].map(avg_quality)

    return df_filtered


def compute_quality_by_task_type(df_filtered, task_type):
    """
    Computes the average quality for each translator for a given task type.
    If the translator has no experience with that task, falls back to:
      - their overall average quality (with a penalty), or
      - the global average quality (with a stronger penalty).
    
    Args:
        df_filtered (pd.DataFrame): DataFrame with filtered translators.
        task_type (str): The specific task type to evaluate.
    
    Returns:
        pd.DataFrame: df_filtered with 'AVG_QUALITY_BY_TASK' and 'QUALITY_SOURCE_TASK'.
    """
    translators = df_filtered['TRANSLATOR'].unique()

    # 1. Compute average quality for given task type
    mask_task = (
        (data_df['TASK_TYPE'] == task_type) &
        (data_df['TRANSLATOR'].isin(translators))
    )

    avg_by_task = (
        data_df[mask_task]
        .groupby('TRANSLATOR')['QUALITY_EVALUATION']
        .mean()
        .round(2)
    )

    df_filtered['AVG_QUALITY_BY_TASK'] = df_filtered['TRANSLATOR'].map(avg_by_task)
    df_filtered['QUALITY_SOURCE_TASK'] = 'original'

    # 2. Fallback to penalized overall average
    mask_missing = df_filtered['AVG_QUALITY_BY_TASK'].isna()

    overall_avg = (
        data_df[data_df['TRANSLATOR'].isin(translators)]
        .groupby('TRANSLATOR')['QUALITY_EVALUATION']
        .mean()
        .round(2)
        .apply(lambda x: x - 1 if pd.notnull(x) else None)  # configurable penalization, for flexibility
        #For a data-driven approach, use can standard deviation or percentile-based penalization to adapt to the distribution of quality scores
    )

    df_filtered.loc[mask_missing, 'AVG_QUALITY_BY_TASK'] = df_filtered.loc[mask_missing, 'TRANSLATOR'].map(overall_avg)
    df_filtered.loc[mask_missing, 'QUALITY_SOURCE_TASK'] = 'overall_penalized'

    # 3. Fallback to global average (penalized)
    mask_global = df_filtered['AVG_QUALITY_BY_TASK'].isna()
    global_mean = data_df['QUALITY_EVALUATION'].mean()

    df_filtered.loc[mask_global, 'AVG_QUALITY_BY_TASK'] = global_mean - 1.5  #Same as the other penalization, can be configured
    df_filtered.loc[mask_global, 'QUALITY_SOURCE_TASK'] = 'global_penalized'

    return df_filtered


def compute_experience(df_filtered, task_type, source_lang, target_lang, industry, subindustry):
    """
    Computes a soft experience score for each translator based on how many
    dimensions match (task_type, language pair, industry, subindustry).

    Args:
        df_filtered (pd.DataFrame): Filtered translators' dataframe.

    Returns:
        pd.DataFrame: With added column 'EXPERIENCE_SCORE'.
    """
    TASK_TYPE_BONUS = {
    'LanguageLead': 0.5,
    'ProofReading': 0.5,
    'Spotcheck': 0.5
    }
    
    translators = df_filtered['TRANSLATOR'].unique()
    df = data_df[data_df['TRANSLATOR'].isin(translators)].copy()

    # Base score: match on source, target, task_type
    df['score'] = 0
    df['score'] += (df['SOURCE_LANG'] == source_lang).astype(int)
    df['score'] += (df['TARGET_LANG'] == target_lang).astype(int)
    df['score'] += (df['TASK_TYPE'] == task_type).astype(int)

    # Add only 1 point if either industry or subindustry match (but not double)
    industry_match = (df['MANUFACTURER_INDUSTRY'] == industry)
    subindustry_match = (df['MANUFACTURER_SUBINDUSTRY'] == subindustry)
    df['score'] += ((industry_match | subindustry_match)).astype(int)

    # Advanced task bonus
    bonus_df = df[df['TASK_TYPE'].isin(TASK_TYPE_BONUS)]
    bonus_df['bonus'] = bonus_df['TASK_TYPE'].map(TASK_TYPE_BONUS)
    bonus_scores = bonus_df.groupby('TRANSLATOR')['bonus'].sum()

    # Base score
    base_scores = df.groupby('TRANSLATOR')['score'].sum()

    # Total experience = base + bonus
    total_score = base_scores.add(bonus_scores, fill_value=0)

    df_filtered['EXPERIENCE_SCORE'] = df_filtered['TRANSLATOR'].map(total_score).fillna(0).round(2)

    ## TODO normalize by the total number of tasks done ?? 

    return df_filtered


def compute_experience_for_client(df_filtered, client):
    """
    Computes an experience score for each translator based on a specific client

    Args:
        df_filtered (pd.DataFrame): Filtered translators' dataframe.

    Returns:
        pd.DataFrame: With added column 'EXPERIENCE_CLIENT'.
    """
    translators = df_filtered['TRANSLATOR'].unique()
    df = data_df[data_df['TRANSLATOR'].isin(translators)].copy()


    df['score'] = 0
    df['score'] += (df['MANUFACTURER'] == client).astype(int)

    # Total experience score = sum of weights per translator
    experience_scores = df.groupby('TRANSLATOR')['score'].sum()

    # Add to filtered dataframe
    df_filtered['EXPERIENCE_CLIENT'] = df_filtered['TRANSLATOR'].map(experience_scores).fillna(0).astype(int)

    # TODO: The normalization aso can be applied ???

    return df_filtered



# ----- PRINCIPAL FUNCTION TO FILTER THE TRANSLATORS' ATTRIBUTES -----
def available_translators(task, translators_attributes_df, schedules_df, TRANSLATORS_UNAVAILABLE):
    """
    Checks if translators are available for the task based on their weekly working schedule.
    This, for now just takes into account the day of the week and the start time of the task. 
    TAKE INTO ACCOUNT: This can have problems if the translator is at the end of their weekly shedule, it also doesnt take into account multitasking.
    
    Args:
        task (Task object): The task for which we want to check availability.
        translators_attributes_df (pd.DataFrame): DataFrame containing the translators' attributes.
        schedules_df (pd.DataFrame): DataFrame containing the weekly schedules of translators.
        TRANSLATORS_UNAVAILABLE (list): List of translators who are unavailable.
        
    Returns:
        df_filtered (pd.DataFrame): Filtered DataFrame containing translators who are available.
    """
    # 1. Remove explicitly unavailable translators
    ##TODO:check multitasking
    df_filtered = translators_attributes_df[~translators_attributes_df['TRANSLATOR'].isin(TRANSLATORS_UNAVAILABLE)].copy()

    # 2. Extract day of week and time from task
    task_day = task.ASSIGNED.strftime('%a').upper()  #day of the week  e.g., 'MON', 'TUE'
    task_start_time = task.ASSIGNED.time() #time of the day e.g., 10:00:00
    task_end_time = task.ASSIGNED + timedelta(hours=1) #time of the day e.g., 11:00:00

    # 3. Merge schedule info
    df_filtered = df_filtered.merge(schedules_df, left_on='TRANSLATOR', right_on='NAME', how='left')

    def is_available(row):
        # 3.1 Check if works that day
        if row[task_day] != 1:
            return False

        # 3.2 Parse working hours
        work_start = row['START']
        work_end = row['END']

        task_start_time = timedelta(hours=task.ASSIGNED.hour, minutes=task.ASSIGNED.minute)
        task_end_time = timedelta(hours=task.ASSIGNED.hour +1, minutes=task.ASSIGNED.minute)
        work_start_time = timedelta(hours=row['START'].hour, minutes=row['START'].minute)
        work_end_time = timedelta(hours=row['END'].hour, minutes=row['END'].minute)
        
        return (task_start_time >= work_start_time) and (task_end_time < work_end_time) #TODO ensure its not too close to the end of the shift, i dont know how to do this 

    # 4. Apply availability logic
    df_filtered['IS_AVAILABLE'] = df_filtered.apply(is_available, axis=1)
    df_filtered = df_filtered[df_filtered['IS_AVAILABLE'] == True].drop(columns=['IS_AVAILABLE'])

    return df_filtered


def filter_language_price_quality_availability(data_df, schedules_df, translators_attributes_df, task = Task, need_wildcard = False):
    """
    Filters the translators' attributes by languages, price, quality and availability.
    If need_wildcard is True, it will skip the filter corresponding to the wildcard.
    
    Args:
        translators_attributes_df (pd.DataFrame): 
            DataFrame containing the translators' attributes (name, languages, price, speed).
        task (Task object): 
            The task for which we want to filter the translators.
        need_wildcard (bool): 
            If True, skip the filter corresponding to the wildcard.
            
    Returns:
        pd.DataFrame: 
            Filtered DataFrame containing translators who meet the criteria.
    """
    
    if not need_wildcard:
        # Filter by language, price HARD FILTER
        df_filtered = translators_attributes_df[
            (translators_attributes_df['SOURCE_LANG'] == task.SOURCE_LANG) & 
            (translators_attributes_df['TARGET_LANG'] == task.TARGET_LANG) &
            (translators_attributes_df['HOURLY_RATE'] <= task.SELLING_HOURLY_PRICE) 
        ].copy()

        df_filtered = compute_number_tasks(data_df, df_filtered)

        # add the average quality column
        df_filtered = compute_quality_by_task_type(df_filtered, task_type=task.TASK_TYPE)
        df_filtered = compute_quality_by_languages(df_filtered, source_lang=task.SOURCE_LANG, target_lang=task.TARGET_LANG)

        df_filtered = df_filtered[df_filtered['AVG_QUALITY_BY_LG'] >= task.MIN_QUALITY]
        df_filtered = df_filtered[df_filtered['AVG_QUALITY_BY_TASK'] >= task.MIN_QUALITY]

        # Filter by availability
        df_filtered = available_translators(task, df_filtered, schedules_df, TRANSLATORS_UNAVAILABLE)
        
        return df_filtered
    
    # same code as above but with the wildcard, it will skip the filter corresponding to the wildcard
    else:
        # if the wildcard is "Price", we don't filter by price
        price_condition = (translators_attributes_df['HOURLY_RATE'] <= task.SELLING_HOURLY_PRICE) if task.WILDCARD != "Price" else True
        # Filter by language, price 
        df_filtered = translators_attributes_df[
            (translators_attributes_df['SOURCE_LANG'] == task.SOURCE_LANG) & 
            (translators_attributes_df['TARGET_LANG'] == task.TARGET_LANG) &
            price_condition 
        ].copy()

        translators_attributes_df = compute_number_tasks(train_df_clean, df_filtered)

        if task.WILDCARD != "Quality":
            # add the average quality column
            df_filtered = compute_quality_by_languages(df_filtered, source_lang=task.SOURCE_LANG, target_lang=task.TARGET_LANG)
            df_filtered = compute_quality_by_task_type(df_filtered, task_type=task.TASK_TYPE)

            df_filtered = df_filtered[df_filtered['AVG_QUALITY_BY_LG'] >= task.MIN_QUALITY]
            df_filtered = df_filtered[df_filtered['AVG_QUALITY_BY_TASK'] >= task.MIN_QUALITY]

        if task.WILDCARD != "Deadline":
            # Filter by availability
            df_filtered = available_translators(task, df_filtered, schedules_df, TRANSLATORS_UNAVAILABLE)
        
        return df_filtered

In [None]:
### -------------- DEBUGG PART --------------

In [12]:
print(validation_df_clean.iloc[0])

PROJECT_ID                                                 219728
PM                                                            BMT
TASK_ID                                                  11057300
START                                         2021-09-29 09:15:00
END                                           2021-09-29 19:30:00
TASK_TYPE                                             PostEditing
SOURCE_LANG                                               English
TARGET_LANG                                          Spanish (LA)
ASSIGNED                                      2021-09-29 10:06:41
READY                                         2021-09-29 10:06:48
WORKING                                       2021-09-29 16:01:30
DELIVERED                                     2021-09-29 19:44:06
RECEIVED                                      2021-09-29 19:50:27
CLOSE                                         2021-09-29 19:50:32
FORECAST                                                     3.42
HOURLY_RAT

In [None]:
translators_attributes_df = compute_delay_percentage(train_df)

# Take a task from the validation set
new_task = validation_df_clean.iloc[0].copy()
new_task.to_dict()

#Change the columns
match = clients_df[clients_df['CLIENT_NAME'] == new_task['MANUFACTURER']]

if not match.empty:
    new_task['WILDCARD'] = match.iloc[0]['WILDCARD']
    new_task['HOURLY_RATE'] = match.iloc[0]['SELLING_HOURLY_PRICE']
    new_task['QUALITY_EVALUATION'] = match.iloc[0]['MIN_QUALITY']
else:
    print("WARNING: No match found in schedules_df for the given client. Setting default values.")
    # Default to a specific wildcard if no match is found
    new_task['WILDCARD'] = 'Quality'
    new_task['HOURLY_RATE'] = new_task['SELLING_HOURLY_PRICE']
    new_task['QUALITY_EVALUATION'] = new_task['MIN_QUALITY']

new_task['ASSIGNED'] = datetime.now()  # Sets to current datetime
new_task = new_task.rename({'HOURLY_RATE': 'SELLING_HOURLY_PRICE'})
new_task = new_task.rename({'QUALITY_EVALUATION': 'MIN_QUALITY'})

# Convert to dict to use **kwargs
# **kwargs: Pass a dictionary of named arguments (key=value) in a descomposed way (what we want here)
new_task = Task(**new_task.to_dict())

need_wildcard = False
print("New task:")
print(new_task)

df_filtered = filter_language_price_quality_availability(train_df, schedules_df, translators_attributes_df, new_task)

New task:
Task Details:
  - Task ID: 11057300
  - Type: PostEditing
  - Sector: Consumer Discretionary
  - Industry (Subsector): Internet & Direct Marketing Retail
  - Start: 2021-09-29 09:15:00
  - Budget: 20
  - Quality: 7.0
  - Wildcard: Price
  - Source Language: English
  - Target Language: Spanish (LA)


In [14]:
print(df_filtered.head())

        TRANSLATOR SOURCE_LANG   TARGET_LANG  HOURLY_RATE  \
0            Abdon     English  Spanish (LA)           17   
1  Acacio Cayetano     English  Spanish (LA)           12   
2     Alejo Esdras     English  Spanish (LA)           13   
3     Alfonso Odon     English  Spanish (LA)           12   
4  Almudena Fiamma     English  Spanish (LA)           15   

   AVG_DELAY_PERCENTAGE  NUM_TASKS  AVG_QUALITY_BY_TASK QUALITY_SOURCE_TASK  \
0                -24.84         20                 7.42            original   
1                -17.77        390                 7.03            original   
2                -21.24        415                 7.47            original   
3                 50.30          2                 7.00            original   
4                -75.90       9288                 7.05            original   

   AVG_QUALITY_BY_LG             NAME     START       END  MON  TUE  WED  THU  \
0               7.62            Abdon  10:00:00  20:00:00    1    1    1    1

In [18]:
compute_experience(df_filtered, task_type=new_task.TASK_TYPE, source_lang=new_task.SOURCE_LANG, target_lang=new_task.TARGET_LANG, industry=new_task.MANUFACTURER_INDUSTRY, subindustry=new_task.MANUFACTURER_SUBINDUSTRY)
compute_experience_for_client(df_filtered, client=new_task.MANUFACTURER)

print(df_filtered.head())

        TRANSLATOR SOURCE_LANG   TARGET_LANG  HOURLY_RATE  \
0            Abdon     English  Spanish (LA)           17   
1  Acacio Cayetano     English  Spanish (LA)           12   
2     Alejo Esdras     English  Spanish (LA)           13   
3     Alfonso Odon     English  Spanish (LA)           12   
4  Almudena Fiamma     English  Spanish (LA)           15   

   AVG_DELAY_PERCENTAGE  NUM_TASKS  AVG_QUALITY_BY_TASK QUALITY_SOURCE_TASK  \
0                -24.84         20                 7.42            original   
1                -17.77        390                 7.03            original   
2                -21.24        415                 7.47            original   
3                 50.30          2                 7.00            original   
4                -75.90       9288                 7.05            original   

   AVG_QUALITY_BY_LG             NAME  ...       END MON  TUE  WED  THU  FRI  \
0               7.62            Abdon  ...  20:00:00   1    1    1    1    1  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bonus_df['bonus'] = bonus_df['TASK_TYPE'].map(TASK_TYPE_BONUS)


In [15]:
"""
At this stage, we have a filtered dataframe with the translators that are available for the task,
it contains the following columns:
    - TRANSLATOR: Name of the translator
    - SOURCE_LANG: Source language of the translator
    - TARGET_LANG: Target language of the translator
    - HOURLY_RATE: Hourly rate of the translator
    - Filtered by the availability

    Things to take into account for the calculation of the scores:
    - AVG_QUALITY_BY_LNG: Average quality by language pair (if applicable)
    - AVG_QUALITY_BY_TASK: Average quality by task type (if applicable)
    - QUALITY_SOURCE_TASK: Source of the quality by task score (original, overall_penalized, global_penalized)
    - NUM_TASKS: Number of tasks performed by the translator (to take into account the reliability of the translator's quality and delay percentage, not experience because it is calculated based on specific tasks)

    - EXPERIENCE_SCORE: Experience score based on task type, language pair, industry, and subindustry
    - EXPERIENCE_CLIENT: Experience score based on the specific client (if applicable)
    - AVG_DELAY_PERCENTAGE: Average delay percentage of the translator (if applicable)

    
There are some key considerations regarding the experience and quality weights:
    - Proofreading and Spotcheck need more expereinced translators.
    - LanguageLead is a more advanced task, so it needs more experience and quality.
    - Test should be assigned to the most experienced and high-quality TRANSLATOR for the client or topic, regardless of price.
    - For training we dont need to take into account the experience nor the quality

"""

"\nAt this stage, we have a filtered dataframe with the translators that are available for the task,\nit contains the following columns:\n    - TRANSLATOR: Name of the translator\n    - SOURCE_LANG: Source language of the translator\n    - TARGET_LANG: Target language of the translator\n    - HOURLY_RATE: Hourly rate of the translator\n    - Filtered by the availability\n\n    Things to take into account for the calculation of the scores:\n    - AVG_QUALITY_BY_LNG: Average quality by language pair (if applicable)\n    - AVG_QUALITY_BY_TASK: Average quality by task type (if applicable)\n    - QUALITY_SOURCE_TASK: Source of the quality by task score (original, overall_penalized, global_penalized)\n    - NUM_TASKS: Number of tasks performed by the translator (to take into account the reliability of the translator's quality and delay percentage, not experience because it is calculated based on specific tasks)\n\n    - EXPERIENCE_SCORE: Experience score based on task type, language pair, in

In [16]:
# ----- KNN ----
def knn(df_filtered, task, need_wildcard=False):
    """
    Optimized KNN to find the best translators based on the task's requirements.
    
    Args:
        df_filtered (pd.DataFrame): DataFrame containing the filtered translators' attributes.
        task (Task object): The task for which we are finding suitable translators.
        need_wildcard (bool): Whether to ignore the wildcard feature in KNN calculation.

    Returns:
        distances (np.ndarray): Distances of the nearest neighbors.
        indexes (np.ndarray): Indices of the nearest neighbors in the original DataFrame.
    """
    # Define features for the KNN
    features = ['HOURLY_RATE', 'AVG_QUALITY_BY_LNG', 'AVG_QUALITY_BY_TASK', 'AVG_DELAY_PERCENTAGE', 'EXPERIENCE_SCORE', 'EXPERIENCE_CLIENT']
    ideal_values = [1, 10, 10, -100, np.inf, np.inf]  # Ideal values for the features (price, quality, speed, experience), this is like the ideal translator
    
    # Initialize weights (weights for each feature)
    weights = np.array([1, 1.5, 1.5, 0.25, 1, 0.5])  # Default weights for the features

    # Adjust weights based on task type
    if task.TASK_TYPE == 'ProofReading' or task.TASK_TYPE == 'Spotcheck':
        # Higher weight for experience and quality
        weights[4] *= 2  # Experience score weight increased
        weights[1] *= 2  # Quality by language pair weight increased
        weights[2] *= 2  # Quality by task weight increased
    elif task.TASK_TYPE == 'LanguageLead':
        # LanguageLead requires more experience and quality
        weights[4] *= 2
        weights[1] *= 2
        weights[2] *= 2
    elif task.TASK_TYPE == 'Test':
        # Test task needs the highest experience and quality, ignore price
        weights[0] = 0  # Price doesn't matter for Test
        weights[1] *= 2  # Quality by language pair weight increased
        weights[2] *= 2  # Quality by task weight increased
        weights[4] *= 2  # Experience score weight increased
    elif task.TASK_TYPE == 'Training':
        # Training task doesn't consider experience or quality
        weights[1] = 0  # Set quality to 0
        weights[2] = 0  # Set quality to 0
        weights[4] = 0  # Set experience to 0

    # If wildcard is required, adjust the weights based on the wildcard preference
    if not need_wildcard:
        wildcard_vector = np.ones_like(weights)

        if task.WILDCARD == 'Price':
            wildcard_vector[0] = 0.25  # Price becomes less important
        elif task.WILDCARD == 'Quality':
            wildcard_vector[1] = 0.25  # Quality becomes less important
        elif task.WILDCARD == 'Deadline':
            wildcard_vector[3] = 0.25  # Deadline (Avg. Delay) becomes less important

        weights = weights * wildcard_vector  # Apply adjusted weights based on wildcard

    # Select features for the KNN calculation
    X = df_filtered[features]

    # Standardize the features (scaling)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_weighted = X_scaled * weights  # Apply the weights to scaled features

    # Initialize and train the KNN model
    knn = NearestNeighbors(metric='euclidean')
    knn.fit(X_weighted)

    # Define ideal task values as a DataFrame
    task_df = pd.DataFrame([ideal_values], columns=features)
    task_scaled = scaler.transform(task_df)
    task_weighted = task_scaled * weights  # Weight the task ideal values too

    # Find the nearest neighbors based on the task's ideal values
    distances, indexes = knn.kneighbors(task_weighted, n_neighbors=len(df_filtered))  # Adjust the number of neighbors as needed

    return distances, indexes


def get_best_translators(df_filtered, indexes, distances):
    """
    Get the best translators based on the KNN results.
    
    Args:
        df_filtered (pd.DataFrame): 
            Contains the filtered translators' attributes (name, language, price, quality, speed).
        indexes (np.ndarray): 
            Indices of the nearest neighbors in the df_filtered.
        distances (np.ndarray): 
            Distances of the nearest neighbors.
            
    Returns:
        df_filtered (pd.DataFrame): 
            Contains the filtered translators' attributes (name, language, price, quality, speed AND similarity_score).
    """
    
    selected_translators = df_filtered.iloc[indexes[0]].copy()
    
    # Add the similarity score
    selected_translators['Similarity Score'] = distances[0].round(2)  # Round to 2 decimal places

    # Sort by similarity score (ascending: closest match first)
    selected_translators = selected_translators.sort_values(by='Similarity Score', ascending=True) 

    return selected_translators

In [17]:
# ----- MAIN CODE ----
# Creates a dataframe with  the additional attributes
translators_attributes_df = compute_delay_percentage(train_df_clean)

# Example of generating a task
new_task = validation_df_clean[0].copy()
tasks = []
tasks.append(new_task)

for task in tasks:
    need_wildcard = False
    print("New task:")
    print(task)
    df_filtered = filter_language_price_quality_availability(train_df_clean, schedules_df, translators_attributes_df, new_task)
    
    if df_filtered.empty:
        print("\nNo available translators. Possible reasons:")
        print("1. No translators available because the quality is too high")
        print("2. No translators available because the price is too low")
        print("\nTrying with the wildcard...\n")        
        need_wildcard = True
        df_filtered = filter_language_price_quality_availability(translators_attributes_df, new_task, need_wildcard = need_wildcard)
        if df_filtered.empty:
            print("No available translators even with the wildcard")
            #TODO implement, see which is the strict filter or something inside the function to know the remedy
            # ofrecer ignorando calidad, precio y deadline. Y si ni siquiera así hay, pues ofrecer otros idiomas parecidos (español vs argentino, english uk vs english us, etc.)
            continue
        
    # Compute the KNN
    distances, indexes = knn(df_filtered, task, need_wildcard = need_wildcard)

    # Get the dataframe with the best translators
    best_translators_df = get_best_translators(df_filtered, indexes, distances)
    display(best_translators_df)

KeyError: 'TRANSLATOR'

### Types of mistakes in the dataset they have been mostly added to the analysis to keep things clean

one of them is that some of the translators (13) finish after 0am 

In [None]:
# hay algunos con fecha de START == END y si START > ASSIGNED (son errores porque no tiene sentido); pero si START <= ASSIGNED, 
# probablemente signifique que END (deadline) era urgente y no puso fecha límite, no son errores


# provisionalmente cambiamos el START como ASSIGNED para estos casos
# data_df.loc[data_df['START'] > data_df['ASSIGNED'], 'START'] = data_df['ASSIGNED'] 

## TODO we have to discuss this further, it have been added to data analysis but not shure how to address it 

### Next steps
actualize the unavailable translators:
- add it to the list when one is selected by the client in the forntend
- remove it from the list when the task has been finished 