Tyfenn Eloy  
Mathias Berthonneau  
Arthur-Louis Bonneau  
Loris Nezan

<h1 style="text-align:center">Process Mining</h1>
<h2 style="text-align:center">Project - Study of potential bias in the rental process</h2>

In [None]:
import pm4py
import pandas as pd
import matplotlib.pyplot as plt
import warnings

In [None]:
# Silence every warning for the rest of the session so that library warnings
# do not clutter the cell outputs.
warnings.filterwarnings('ignore')

In [None]:
# Load the three renting event logs. The order of the paths matches the order
# of the names on the left-hand side.
log_low, log_medium, log_high = [
    pm4py.read_xes(path)
    for path in (
        'data/renting_log_low.xes',
        'data/renting_log_medium.xes',
        'data/renting_log_high.xes',
    )
]

In [None]:
# Display the columns available in the low log.
log_low.columns

## General trace analysis

In [None]:
# Column names used by the statistics cells below.
# The case identifier is 'case:concept:name' (the column the rest of this
# notebook groups by to obtain one row per case). The bare 'concept:name'
# column is an event-level attribute, so grouping on it does not count cases.
CASE_ID_COL = 'case:concept:name'
ACTIVITY_COL = 'activity'
TIMESTAMP_COL = 'time:timestamp'

print(f"Case ID Column: {CASE_ID_COL}")
print(f"Activity Column: {ACTIVITY_COL}")
print(f"Timestamp Column: {TIMESTAMP_COL}")

Statistics for the `low` dataset:

In [None]:
# Size of the low log: variants first, then event and case counts.
variants = pm4py.get_variants(log_low)
num_variants = len(variants)
num_events = len(log_low)
num_cases = len(log_low.groupby(CASE_ID_COL))

print(
    f"Number of cases: {num_cases}",
    f"Number of events: {num_events}",
    f"Number of variants: {num_variants}",
    sep="\n",
)

Statistics for the `medium` dataset:

In [None]:
# Size of the medium log: variants first, then event and case counts.
variants = pm4py.get_variants(log_medium)
num_variants = len(variants)
num_events = len(log_medium)
num_cases = len(log_medium.groupby(CASE_ID_COL))

print(
    f"Number of cases: {num_cases}",
    f"Number of events: {num_events}",
    f"Number of variants: {num_variants}",
    sep="\n",
)

Statistics for the `high` dataset:

In [None]:
# Size of the high log: variants first, then event and case counts.
variants = pm4py.get_variants(log_high)
num_variants = len(variants)
num_events = len(log_high)
num_cases = len(log_high.groupby(CASE_ID_COL))

print(
    f"Number of cases: {num_cases}",
    f"Number of events: {num_events}",
    f"Number of variants: {num_variants}",
    sep="\n",
)

In [None]:
# Start and end activities of each log, printed in the order low, medium, high.
for event_log in (log_low, log_medium, log_high):
    first_event = pm4py.get_start_activities(event_log)
    last_event = pm4py.get_end_activities(event_log)

    print(f"First event: {first_event}")
    print(f"Last event: {last_event}")



The general statistics of the different datasets are similar in terms of size, so we will be able to compare them easily.

## Drawing a general diagram of the flow

In [None]:
from pm4py.algo.discovery.alpha import algorithm as alpha_miner

In [None]:
# Discover a Petri net from the low log with the alpha miner and display it.
net, initial_marking, final_marking = alpha_miner.apply(log_low)
pm4py.view_petri_net(net, initial_marking, final_marking)
# Show the events distribution graph of the low log (pm4py's default
# distribution type is used — TODO confirm which one that is).
pm4py.view_events_distribution_graph(log_low)
# Discover the directly-follows graph; as the last expression of the cell,
# its value is what the notebook displays.
pm4py.discover_directly_follows_graph(log_low)

In [None]:
# The following function outputs a large picture of the process. It is hard to read and expensive to compute, so the result is provided in the `prefix_tree.png` file.

# tree = pm4py.discover_prefix_tree(log_low)
# pm4py.view_prefix_tree(tree)

In [None]:
# Discover the footprints of the low log and display them.
pm4py.view_footprints(pm4py.discover_footprints(log_low))

In [None]:
# Discover a DECLARE model from the low log, then check the same log against
# it and keep the conformance diagnostics as a dataframe.
declare = pm4py.discover_declare(log_low)
diag = pm4py.conformance_declare(log_low, declare, return_diagnostics_dataframe=True)

In [None]:
# Display the DECLARE conformance diagnostics computed in the previous cell.
diag

In the renting process, we can identify 4 main steps:

- The user is applying for a place
- The landlord is accepting or rejecting the application
- The user is paying the rent or not
- The user can get evicted or not

Based on this, we can look for potential bias in the process with respect to the additional information we have about the users:

- Age
- Education
- Gender
- Citizenship
- Life situation (marriage, single)

In [None]:
# Partition the low log on "case:age": under 25, 25 to 50 inclusive, over 50.
log_young_low = log_low.loc[log_low['case:age'].lt(25)]
log_middle_low = log_low.loc[log_low['case:age'].between(25, 50)]
log_old_low = log_low.loc[log_low['case:age'].gt(50)]

# Discover and display the Petri net of the oldest group only.
net, initial_marking, final_marking = alpha_miner.apply(log_old_low)
pm4py.view_petri_net(net, initial_marking, final_marking)

In [None]:
# Partition the low log on "case:yearsOfEducation": under 5, 5 to 15
# inclusive, and over 15. The upper group starts right after 15 so that
# 16 years of education is not left out of every group.

log_uneducated_low = log_low[log_low['case:yearsOfEducation'] < 5]
log_medium_education_low = log_low[(log_low['case:yearsOfEducation'] >= 5) & (log_low['case:yearsOfEducation'] <= 15)]
log_high_education_low = log_low[log_low['case:yearsOfEducation'] > 15]

# Discover and display the Petri net of the most educated group only.
net, initial_marking, final_marking = alpha_miner.apply(log_high_education_low)
pm4py.view_petri_net(net, initial_marking, final_marking)

In [None]:
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.visualization.dfg import visualizer as dfg_visualization

# Discover the directly-follows graph of the low log and display it using the
# FREQUENCY visualisation variant.
dfg = dfg_discovery.apply(log_low)
gviz = dfg_visualization.apply(dfg, log=log_low, variant=dfg_visualization.Variants.FREQUENCY)
dfg_visualization.view(gviz)


## Discrimination analysis by personal attribute

In [None]:
def printSexism(log, logName):
    """Print contract signature and rejection rates for each gender group.

    The log is split on the boolean ``case:gender`` column; the ``False``
    group is reported as "hommes" and the ``True`` group as "femme". Counts
    and rates are computed per case (distinct ``case:concept:name``), not per
    event, so groups with longer traces do not get artificially lower rates.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity`` and ``case:gender``.
        logName: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    def _cases_with(group, activity):
        """Count the distinct cases of ``group`` that contain ``activity``."""
        return group.loc[group['activity'] == activity, 'case:concept:name'].nunique()

    def _percent(part, total):
        """Return ``part`` as a percentage of ``total`` (NaN for an empty group)."""
        return part / total * 100 if total else float('nan')

    # Element-wise comparisons: rows with a missing gender fall in neither group.
    groups = (
        ("hommes", log[log['case:gender'] == False]),
        ("femme", log[log['case:gender'] == True]),
    )

    print("-"*30)
    print("Event log: " + logName)
    for label, group in groups:
        nb_cases = group['case:concept:name'].nunique()
        signed = _cases_with(group, 'Sign Contract')
        rejected = _cases_with(group, 'Reject Prospective Tenant')
        print("Nombre traces {}: {}, nombre de signatures: {}, nombre de refus: {}".format(label, nb_cases, signed, rejected))
        print("-- % refus: {}, % acceptation: {}".format(_percent(rejected, nb_cases), _percent(signed, nb_cases)))
    print("-"*30)

In [None]:
# Gender report for each of the three logs, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    printSexism(event_log, label)

In [None]:
# Looking for discrimination in the logs:
# this time we look at accepted and rejected contracts by age group.

def printAgeism(log, logName):
    """Print contract signature and rejection rates for three age groups.

    Groups are built on ``case:age``: under 25 ("jeunes"), 25 to 50 inclusive
    ("moyen") and over 50 ("vieux"). Counts and rates are computed per case
    (distinct ``case:concept:name``), not per event. An empty group is
    reported with NaN rates instead of raising ``ZeroDivisionError``.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity`` and ``case:age``.
        logName: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    def _cases_with(group, activity):
        """Count the distinct cases of ``group`` that contain ``activity``."""
        return group.loc[group['activity'] == activity, 'case:concept:name'].nunique()

    def _percent(part, total):
        """Return ``part`` as a percentage of ``total`` (NaN for an empty group)."""
        return part / total * 100 if total else float('nan')

    age = log['case:age']
    groups = (
        ("jeunes", log[age < 25]),
        ("moyen", log[(age >= 25) & (age <= 50)]),
        ("vieux", log[age > 50]),
    )

    print("-"*30)
    print("Event log: " + logName)
    for label, group in groups:
        nb_cases = group['case:concept:name'].nunique()
        signed = _cases_with(group, 'Sign Contract')
        rejected = _cases_with(group, 'Reject Prospective Tenant')
        print("Nombre traces {}: {}, nombre de signatures: {}, nombre de refus: {}".format(label, nb_cases, signed, rejected))
        print("-- % refus: {}, % acceptation: {}".format(_percent(rejected, nb_cases), _percent(signed, nb_cases)))

# Age report for each of the three logs, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    printAgeism(event_log, label)

In [None]:
# Ageism regarding eviction 

def AgeismEviction(log, logName):
    """Print and return the eviction rate per ten-year age bin from 20 to 80.

    Bins are half-open decades ([20, 30), [30, 40), ...), so a boundary age
    such as 30 belongs to exactly one bin. Only the last bin, [70, 80],
    includes its upper bound so that age 80 is still covered. Rates are the
    share of cases (distinct ``case:concept:name``) in the bin that contain
    an "Evict Tenant" event. An empty bin yields NaN.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity`` and ``case:age``.
        logName: Label printed in the report header.

    Returns:
        A list of six eviction percentages, youngest bin first.
    """
    def _percent(part, total):
        """Return ``part`` as a percentage of ``total`` (NaN for an empty bin)."""
        return part / total * 100 if total else float('nan')

    age = log['case:age']
    labels = ("jeunes", "moyen", "vieux", "très vieux", "ancien", "très ancien")
    bounds = (20, 30, 40, 50, 60, 70, 80)

    print("-"*30)
    print("Event log: " + logName)

    percentages = []
    for label, lower, upper in zip(labels, bounds, bounds[1:]):
        # Exclusive upper bound everywhere except the final bin.
        under_upper = age <= upper if upper == bounds[-1] else age < upper
        group = log[(age >= lower) & under_upper]

        nb_cases = group['case:concept:name'].nunique()
        evicted = group.loc[group['activity'] == 'Evict Tenant', 'case:concept:name'].nunique()
        pc_evict = _percent(evicted, nb_cases)
        percentages.append(pc_evict)

        print("Nombre traces {}: {}, nombre de evictions: {}".format(label, nb_cases, evicted))
        print("-- % evictions: {}".format(pc_evict))

    return percentages


# Eviction rates per age bin for the low, medium and high logs, in that order.
a, b, c = [
    AgeismEviction(event_log, label)
    for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high"))
]

In [None]:
#German speaking racism regarding eviction

def GermanismEviction(log, logName):
    """Print the eviction rate of German-speaking and other tenants.

    The log is split on the boolean ``case:german speaking`` column. Rates
    are the share of cases (distinct ``case:concept:name``) in each group
    that contain an "Evict Tenant" event. An empty group yields NaN instead
    of raising ``ZeroDivisionError``.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity`` and ``case:german speaking``.
        logName: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    def _percent(part, total):
        """Return ``part`` as a percentage of ``total`` (NaN for an empty group)."""
        return part / total * 100 if total else float('nan')

    # Element-wise comparisons: rows with a missing flag fall in neither group.
    groups = (
        ("german", log[log['case:german speaking'] == True]),
        ("not german", log[log['case:german speaking'] == False]),
    )

    print("-"*30)
    print("Event log: " + logName)
    for label, group in groups:
        nb_cases = group['case:concept:name'].nunique()
        evicted = group.loc[group['activity'] == 'Evict Tenant', 'case:concept:name'].nunique()
        print("Nombre traces {}: {}, nombre de evictions: {}".format(label, nb_cases, evicted))
        print("-- % evictions: {}".format(_percent(evicted, nb_cases)))
    

In [None]:
# Eviction report by language for each log, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    GermanismEviction(event_log, label)


In [None]:
def getGermanAcceptance(log, logname):
    """Print signature and rejection rates of German-speaking and other applicants.

    The log is split on the boolean ``case:german speaking`` column. Counts
    and rates are computed per case (distinct ``case:concept:name``), not per
    event. An empty group yields NaN rates instead of raising
    ``ZeroDivisionError``.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity`` and ``case:german speaking``.
        logname: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    def _cases_with(group, activity):
        """Count the distinct cases of ``group`` that contain ``activity``."""
        return group.loc[group['activity'] == activity, 'case:concept:name'].nunique()

    def _percent(part, total):
        """Return ``part`` as a percentage of ``total`` (NaN for an empty group)."""
        return part / total * 100 if total else float('nan')

    # Element-wise comparisons: rows with a missing flag fall in neither group.
    groups = (
        ("german", log[log['case:german speaking'] == True]),
        ("not german", log[log['case:german speaking'] == False]),
    )

    print("-"*30)
    print("Event log: " + logname)
    for label, group in groups:
        nb_cases = group['case:concept:name'].nunique()
        signed = _cases_with(group, 'Sign Contract')
        rejected = _cases_with(group, 'Reject Prospective Tenant')
        print("Nombre traces {}: {}, nombre de signatures: {}, nombre de refus: {}".format(label, nb_cases, signed, rejected))
        print("-- % refus: {}, % acceptation: {}".format(_percent(rejected, nb_cases), _percent(signed, nb_cases)))
    print("-"*30)

In [None]:
# Acceptance report by language for each log, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    getGermanAcceptance(event_log, label)

In [None]:
def getCitizenAcceptance(log, logname):
    """Print signature and rejection rates of citizens and non-citizens.

    The log is split on the boolean ``case:citizen`` column. Counts and rates
    are computed per case (distinct ``case:concept:name``), not per event. An
    empty group yields NaN rates instead of raising ``ZeroDivisionError``.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity`` and ``case:citizen``.
        logname: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    def _cases_with(group, activity):
        """Count the distinct cases of ``group`` that contain ``activity``."""
        return group.loc[group['activity'] == activity, 'case:concept:name'].nunique()

    def _percent(part, total):
        """Return ``part`` as a percentage of ``total`` (NaN for an empty group)."""
        return part / total * 100 if total else float('nan')

    # Element-wise comparisons: rows with a missing flag fall in neither group.
    groups = (
        ("citizen", log[log['case:citizen'] == True]),
        ("not citizen", log[log['case:citizen'] == False]),
    )

    print("-"*30)
    print("Event log: " + logname)
    for label, group in groups:
        nb_cases = group['case:concept:name'].nunique()
        signed = _cases_with(group, 'Sign Contract')
        rejected = _cases_with(group, 'Reject Prospective Tenant')
        print("Nombre traces {}: {}, nombre de signatures: {}, nombre de refus: {}".format(label, nb_cases, signed, rejected))
        print("-- % refus: {}, % acceptation: {}".format(_percent(rejected, nb_cases), _percent(signed, nb_cases)))
    print("-"*30)

In [None]:
# Acceptance report by citizenship for each log, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    getCitizenAcceptance(event_log, label)

In [None]:
# Discrimination by marital status
def getMarriedAcceptance(log, logname):
    """Print signature and rejection rates of married and unmarried applicants.

    The log is split on the boolean ``case:married`` column. Counts and rates
    are computed per case (distinct ``case:concept:name``), not per event. An
    empty group yields NaN rates instead of raising ``ZeroDivisionError``.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity`` and ``case:married``.
        logname: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    def _cases_with(group, activity):
        """Count the distinct cases of ``group`` that contain ``activity``."""
        return group.loc[group['activity'] == activity, 'case:concept:name'].nunique()

    def _percent(part, total):
        """Return ``part`` as a percentage of ``total`` (NaN for an empty group)."""
        return part / total * 100 if total else float('nan')

    # Element-wise comparisons: rows with a missing flag fall in neither group.
    groups = (
        ("married", log[log['case:married'] == True]),
        ("not married", log[log['case:married'] == False]),
    )

    print("-"*30)
    print("Event log: " + logname)
    for label, group in groups:
        nb_cases = group['case:concept:name'].nunique()
        signed = _cases_with(group, 'Sign Contract')
        rejected = _cases_with(group, 'Reject Prospective Tenant')
        print("Nombre traces {}: {}, nombre de signatures: {}, nombre de refus: {}".format(label, nb_cases, signed, rejected))
        print("-- % refus: {}, % acceptation: {}".format(_percent(rejected, nb_cases), _percent(signed, nb_cases)))
    print("-"*30)

In [None]:
# Acceptance report by marital status for each log, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    getMarriedAcceptance(event_log, label)

## Eviction analysis

### Eviction rate

In [None]:
# get eviction
def get_number_of_evictions(log, logname):
    """Print how many cases end up evicted and the matching eviction rate.

    The rate is the share of cases (distinct ``case:concept:name``) that
    contain an "Evict Tenant" event. Dividing by the number of events would
    understate it, because every case contributes many events. An empty log
    yields NaN instead of raising ``ZeroDivisionError``.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name`` and ``activity``.
        logname: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    case_ids = log['case:concept:name']
    total_cases = case_ids.nunique()
    evicted_cases = case_ids[log['activity'] == 'Evict Tenant'].nunique()
    percentage = evicted_cases / total_cases * 100 if total_cases else float('nan')

    print("Event log: " + logname)
    print("Nombre traces evictions: {}".format(evicted_cases))
    print("- % evictions: {}".format(percentage))

In [None]:
# Overall eviction rate for each log, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    get_number_of_evictions(event_log, label)

In [None]:
# Average number of issued warnings per case in which an eviction is issued
def get_average_number_of_warnings(log, logname):
    """Print the mean number of "Issue Warning" events among evicted cases.

    Only cases containing an "Evict Tenant" event are considered. Evicted
    cases that never received a warning count as zero. When the log has no
    evicted case the printed average is NaN.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name`` and ``activity``.
        logname: Label printed in the report header.

    Returns:
        None. The report is written to standard output.
    """
    case_ids = log['case:concept:name']
    evicted_cases = case_ids[log['activity'] == 'Evict Tenant'].unique()

    # Warnings per case, re-indexed on the evicted cases so that evicted
    # cases without any warning contribute a 0 to the mean.
    warnings_per_case = (
        case_ids[log['activity'] == 'Issue Warning']
        .value_counts()
        .reindex(evicted_cases, fill_value=0)
    )
    average = warnings_per_case.mean()

    print("Event log: " + logname)
    print("Nombre moyen de warnings par case: {}".format(average))

In [None]:
# Average warning count for each log, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    get_average_number_of_warnings(event_log, label)

Eviction rate based on marital and citizenship status:

In [None]:
def eviction_by_status(log, logname, precision=2):
    """Print eviction counts and rates by marital and citizenship status.

    Every figure is computed per case (distinct ``case:concept:name``). The
    rate of a group is the number of evicted cases in the group divided by
    the number of cases in that same group. For the combined statuses the
    denominator is the number of cases having both attributes, not the sum of
    the two separate totals. A group without any case yields NaN.

    Args:
        log: Event log as a pandas DataFrame with the columns
            ``case:concept:name``, ``activity``, ``case:citizen`` and
            ``case:married``.
        logname: Label printed in the report header.
        precision: Number of decimals kept in the printed percentages.

    Returns:
        None. The report is written to standard output.
    """
    def _count(mask):
        """Number of cases selected by a boolean mask."""
        return int(mask.sum())

    def _rate(mask):
        """Rounded percentage of evicted cases among the cases selected by ``mask``."""
        total = _count(mask)
        if not total:
            return float('nan')
        return round(_count(mask & is_evicted) / total * 100, precision)

    # One row per case carrying its case-level attributes.
    case_attrs = log.groupby('case:concept:name')[['case:citizen', 'case:married']].first()
    evicted_ids = log.loc[log['activity'] == 'Evict Tenant', 'case:concept:name'].unique()
    is_evicted = case_attrs.index.isin(evicted_ids)

    # Element-wise comparisons: cases with a missing flag fall in neither group.
    is_married = case_attrs['case:married'] == True
    is_not_married = case_attrs['case:married'] == False
    is_citizen = case_attrs['case:citizen'] == True
    is_not_citizen = case_attrs['case:citizen'] == False

    # Number of evicted cases by status.
    married = _count(is_married & is_evicted)
    not_married = _count(is_not_married & is_evicted)
    citizen = _count(is_citizen & is_evicted)
    not_citizen = _count(is_not_citizen & is_evicted)

    # Eviction rates, single and combined statuses.
    married_percentage = _rate(is_married)
    not_married_percentage = _rate(is_not_married)
    citizen_percentage = _rate(is_citizen)
    not_citizen_percentage = _rate(is_not_citizen)
    married_and_citizen = _rate(is_married & is_citizen)
    married_and_not_citizen = _rate(is_married & is_not_citizen)
    not_married_and_citizen = _rate(is_not_married & is_citizen)
    not_married_and_not_citizen = _rate(is_not_married & is_not_citizen)

    # print results
    print("Event log: " + logname)
    print("Nombre d'expulsions de personnes mariées: {}".format(married))
    print("Nombre d'expulsions de personnes non mariées: {}".format(not_married))
    print("Nombre d'expulsions de citoyens: {}".format(citizen))
    print("Nombre d'expulsions de non citoyens: {}".format(not_citizen))
    print("Pourcentage d'expulsions de personnes mariées: {}%".format(married_percentage))
    print("Pourcentage d'expulsions de personnes non mariées: {}%".format(not_married_percentage))
    print("Pourcentage d'expulsions de citoyens: {}%".format(citizen_percentage))
    print("Pourcentage d'expulsions de non citoyens: {}%".format(not_citizen_percentage))
    print("Pourcentage d'expulsions de personnes mariées et citoyens: {}%".format(married_and_citizen))
    print("Pourcentage d'expulsions de personnes mariées et non citoyens: {}%".format(married_and_not_citizen))
    print("Pourcentage d'expulsions de personnes non mariées et citoyens: {}%".format(not_married_and_citizen))
    print("Pourcentage d'expulsions de personnes non mariées et non citoyens: {}%".format(not_married_and_not_citizen))
    print("-"*30)

In [None]:
# Eviction report by status, three decimals, in the order low, medium, high.
for event_log, label in ((log_low, "low"), (log_medium, "medium"), (log_high, "high")):
    eviction_by_status(event_log, label, 3)