In [None]:
import pm4py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Read the three event logs (low / medium / high variants) in one pass; the
# unpacking order matches the order of the paths below.
log_low, log_medium, log_high = (
    pm4py.read_xes(xes_path)
    for xes_path in (
        'data/renting_log_low.xes',
        'data/renting_log_medium.xes',
        'data/renting_log_high.xes',
    )
)

In [None]:
def flat_dataset(log):
    """Collapse an event-level log into one row per case.

    Parameters
    ----------
    log : pandas.DataFrame
        One row per event. Must contain the columns ``case:case``,
        ``concept:name``, ``time:timestamp`` and the case attributes
        ``case:german speaking``, ``case:yearsOfEducation``, ``case:age``,
        ``case:gender``, ``case:citizen``, ``case:protected`` and
        ``case:married``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``case:case`` with:

        * the case attributes (first value seen per case),
        * ``ageGroup`` - ``age`` binned into ``(0, 25]``, ``(25, 50]``,
          ``(50, 200]`` (ages outside these bins become NaN),
        * boolean flags ``screening``, ``extensiveScreening``, ``contract``,
          ``denied``, ``evicted``, ``cancelled`` - whether the corresponding
          activity occurs in the case,
        * ``missedPayments`` / ``latePayments`` - number of events whose
          activity name contains the respective text,
        * ``duration`` - whole days between first and last event of the case.
    """
    case_ids = log['case:case']
    activity = log['concept:name']

    flat_data = log \
        .groupby('case:case') \
        .agg(
            germanSpeaking=('case:german speaking', 'first'),
            yearsOfEducation=('case:yearsOfEducation', 'first'),
            age=('case:age', 'first'),
            gender=('case:gender', 'first'),
            citizen=('case:citizen', 'first'),
            protected=('case:protected', 'first'),
            married=('case:married', 'first')
        )

    flat_data['ageGroup'] = pd.cut(
        flat_data['age'],
        bins=[0, 25, 50, 200],
        labels=['0-25', '25-50', '>50']
    )

    # NOTE: the per-case columns below must be *reductions* (one value per
    # case, indexed by case id). ``groupby(...).transform(...)`` returns one
    # value per event row, indexed like ``log``, which does not align with
    # ``flat_data``'s case index and silently yields NaN / unrelated values.

    def has_activity(name):
        # True for each case that contains at least one event named `name`.
        return activity.eq(name).groupby(case_ids).any()

    def count_activity(text):
        # Number of events per case whose activity name contains `text`
        # (plain substring match; missing names count as no match).
        return activity.str.contains(text, regex=False, na=False) \
            .groupby(case_ids) \
            .sum()

    flat_data['screening'] = has_activity('Screen Prospective Tenant')
    flat_data['extensiveScreening'] = has_activity('Extensive Screening')
    flat_data['contract'] = has_activity('Sign Contract')
    flat_data['denied'] = has_activity('Reject Prospective Tenant')

    flat_data['missedPayments'] = count_activity('Miss Rent Payment')
    flat_data['latePayments'] = count_activity('Accept Late Payment')

    flat_data['evicted'] = has_activity('Evict Tenant')
    flat_data['cancelled'] = has_activity('Tenant Cancels Appartment')

    # Span between the earliest and latest event of each case, in whole days.
    timestamps = log.groupby('case:case')['time:timestamp']
    flat_data['duration'] = (timestamps.max() - timestamps.min()).dt.days

    return flat_data

In [None]:
# Build the per-case table for each of the three logs (same order: low, medium, high).
flat_low, flat_medium, flat_high = map(
    flat_dataset,
    (log_low, log_medium, log_high),
)

In [None]:
# Bare expression: the notebook renders the per-case table built from the low log.
flat_low

## Acceptance / Denial

In [None]:
def print_acceptance_denial_counts(flat_data):
    """Print acceptance/denial counts per attribute value and plot them.

    For each of the attribute columns ``gender``, ``ageGroup``,
    ``germanSpeaking``, ``married``, ``citizen``, ``protected`` and
    ``yearsOfEducation``, and for each distinct non-missing value of that
    column, prints how many rows have ``contract == True`` and how many have
    ``denied == True``, together with their share (in percent, 3 decimals) of
    all rows carrying that value. After each column's text block a count plot
    of the column split by ``contract`` is shown.

    Parameters
    ----------
    flat_data : pandas.DataFrame
        Must contain the attribute columns listed above plus ``contract`` and
        ``denied``.
    """
    columns = ['gender', 'ageGroup', 'germanSpeaking', 'married', 'citizen', 'protected', 'yearsOfEducation']

    for column in columns:
        print('-------------------------------------------------------')
        print('Column: ', column)

        # Missing values are dropped first: NaN cannot be ordered against
        # strings/categories and would make sorted() raise TypeError.
        for value in sorted(flat_data[column].dropna().unique()):
            group = flat_data[flat_data[column] == value]
            total_n = len(group)  # >= 1, value was taken from this column
            accepted_n = int((group['contract'] == True).sum())
            denied_n = int((group['denied'] == True).sum())

            # Scale to percent *before* rounding; rounding the ratio and then
            # multiplying by 100 reintroduces float noise (7.000000000000001).
            print("\tValue: ", value)
            print('\t- Accepted: ', accepted_n, '->', round(100 * accepted_n / total_n, 3), '%')
            print('\t- Denied: ', denied_n, '->', round(100 * denied_n / total_n, 3), '%')
            print()

        # plot bar chart
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.countplot(x=column, hue='contract', data=flat_data, ax=ax)
        ax.set_title(column)
        plt.show()
        # Release the figure so one-figure-per-column loops do not accumulate.
        plt.close(fig)

### Low dataset

In [None]:
# Acceptance/denial counts and per-attribute count plots for flat_low.
print_acceptance_denial_counts(flat_low)

### Medium dataset

In [None]:
# Acceptance/denial counts and per-attribute count plots for flat_medium.
print_acceptance_denial_counts(flat_medium)

### High dataset

In [None]:
# Acceptance/denial counts and per-attribute count plots for flat_high.
print_acceptance_denial_counts(flat_high)

## Eviction / Cancellation

In [None]:
def print_eviction_cancellation_counts(datasets=None):
    """Print eviction/cancellation counts among tenants and plot evictions.

    Only rows with ``contract == True`` are counted. For each attribute column
    (``gender``, ``ageGroup``, ``germanSpeaking``, ``married``, ``citizen``,
    ``protected``, ``yearsOfEducation``), each dataset, and each distinct
    non-missing value of the column in that dataset, prints the number of
    contract rows with ``evicted == True`` and with ``cancelled == True``
    and their share (percent, 3 decimals) of the contract rows carrying that
    value. A value with no contract rows is reported as ``0.0 %``.

    After each column's text block, one figure with a count plot per dataset
    (column split by ``evicted``, contract rows only) is shown.

    Parameters
    ----------
    datasets : iterable of (str, pandas.DataFrame), optional
        ``(name, frame)`` pairs to report on. Each frame needs the attribute
        columns above plus ``contract``, ``evicted`` and ``cancelled``.
        Defaults to the module-level ``flat_low``, ``flat_medium`` and
        ``flat_high`` (named ``'low'``, ``'medium'``, ``'high'``), looked up
        when the function is called.

    Raises
    ------
    ValueError
        If ``datasets`` is given but empty.
    """
    if datasets is None:
        datasets = [('low', flat_low), ('medium', flat_medium), ('high', flat_high)]
    datasets = list(datasets)
    if not datasets:
        raise ValueError('datasets must contain at least one (name, frame) pair')

    def share(part_n, total_n):
        # Percent of `part_n` in `total_n`; scaled before rounding to avoid
        # float noise, and 0.0 when there is nothing to divide by.
        return round(100 * part_n / total_n, 3) if total_n else 0.0

    for column in ['gender', 'ageGroup', 'germanSpeaking', 'married', 'citizen', 'protected', 'yearsOfEducation']:
        print('-------------------------------------------------------')
        print('Column: ', column)

        for dataset_name, dataset in datasets:
            print(f'\tDataset: {dataset_name}')

            tenants = dataset[dataset['contract'] == True]

            # Values are taken from the whole dataset (not only tenants), so
            # values without any contract still get a line. NaN is dropped
            # because it cannot be ordered against strings/categories.
            for value in sorted(dataset[column].dropna().unique()):
                group = tenants[tenants[column] == value]
                total_n = len(group)
                evicted_n = int((group['evicted'] == True).sum())
                canceled_n = int((group['cancelled'] == True).sum())

                print("\t\tValue: ", value)
                print(f"\t\t- Evicted: {evicted_n} -> {share(evicted_n, total_n)} %")
                print(f"\t\t- Canceled: {canceled_n} -> {share(canceled_n, total_n)} %")
                print()

        # plot bar chart for all datasets in the same figure, one panel each
        # (squeeze=False keeps `axes` 2-D even for a single dataset)
        fig, axes = plt.subplots(1, len(datasets), figsize=(5 * len(datasets), 5), squeeze=False)
        fig.suptitle(column)

        for ax, (dataset_name, dataset) in zip(axes[0], datasets):
            sns.countplot(x=column, hue='evicted', data=dataset[dataset['contract'] == True], ax=ax)
            ax.set_title(dataset_name)

        # Show each figure right after its text block, then release it.
        plt.show()
        plt.close(fig)



In [None]:
# Eviction/cancellation counts and eviction count plots for flat_low, flat_medium and flat_high.
print_eviction_cancellation_counts()