In [1]:
import warnings
warnings.filterwarnings('ignore')
import csv
import psutil
import time
# from memory_profiler import profile
import pandas as pd


logs = [
        'bpic2012',
        'bpic2017', 
        #'trafficFines',
        ]


# standardize column names: ['case_id', 'activity', 'resource', 'timestamp'] for all logs
case_id_col = 'case_nr'
activity_col = 'activity'
resource_col = 'resource'
timestamp_col = 'timestamp'

# dataset_name: [case_id_col, activity_col, resource_col, timestamp_col]
# NOTE - Basic column names are defined by users. These names will be renamed and standardized later 
# standardize column names: ['case_id', 'activity', 'resource', 'timestamp']
dataset_dict = {
    'bpic2012': ['case_id', 'activity', 'resource', 'start_time'],
    'bpic2017': ['case:concept:name', 'concept:name', 'org:resource', 'time:timestamp'],
    #'trafficFines': ['case:concept:name', 'concept:name', 'org:resource', "time:timestamp"], 
}
incomplete_dict = {
    'bpic2012': ["A_APPROVED", "A_REGISTERED", "A_ACTIVATED", "A_CANCELLED", "A_DECLINED"],
    'bpic2017': ["A_Pending", "A_Denied", "A_Cancelled",],
    #'trafficFines': ["Send for Credit Collection"]
}


# Define positive and negative labels
label_old = "outcome"
neg_label = 0
pos_label = 1
positive_activities_dict = {
    'bpic2017': ["A_Pending"],
    'bpic2012': ["A_APPROVED", "A_REGISTERED", "A_ACTIVATED",],
    #'trafficFines': ["Send for Credit Collection"]
}

chunk_s = 10000

In [2]:
class DataPreprocessorModinDask:
    def __init__(self, log_name, input_data_path):
        self.log_name = log_name
        self.input_data_path = input_data_path

    def _print_metrics(self, elapsed_time):
        # Helper function to print resource usage metrics
        cpu_percent = psutil.cpu_percent()
        memory_info = psutil.virtual_memory()

    # @profile
    def read_log(self):
        start_time = time.time()
        print("Reading log...")

        # Read the log based on its extension
        if self.input_data_path.lower().endswith('.csv'):
            with open(self.input_data_path, 'r') as file:
                # Use Sniffer to infer the delimiter
                dialect = csv.Sniffer().sniff(file.read(10000))
            try:
                # Read CSV with Dask, specifying dtype for certain columns
                log_file = pd.read_csv(self.input_data_path, sep=dialect.delimiter, dtype={'Resource': 'object', 'article': 'object'})
            except:
                log_file = pd.read_csv(self.input_data_path, sep=dialect.delimiter)

            log_file = log_file.rename(columns=lambda x: x.strip().lower().replace(' ', '_'))
            log_file = log_file.rename(
                columns=dict(zip(dataset_dict[self.log_name], [case_id_col, activity_col, resource_col, timestamp_col])))
        else:
            raise ValueError("Unsupported file extension. Supported extensions: .csv, .xes")

        # Calculate elapsed time and print resource usage metrics
        end_time = time.time()
        elapsed_time = end_time - start_time
        self._print_metrics(elapsed_time)
        return log_file

    # @profile
    def clean_data(self, log_file):
        start_time = time.time()
        print("Cleaning data...")

        # Convert timestamp column to datetime
        log_file[timestamp_col] = pd.to_datetime(log_file[timestamp_col], format="mixed", infer_datetime_format=True)
        log_file = log_file.sort_values(by=[timestamp_col])

       # Remove white spaces from column values
        log_file = log_file.applymap(lambda x: x.strip() if isinstance(x, str) else x)

        # Convert the 'resource' column to string
        log_file[resource_col] = log_file[resource_col].astype(str)

        # Replace unique resource values with 'res{i}' format
        unique_resources = log_file[resource_col].unique()
        resource_mapping = {original_value: f'res{i + 1}' for i, original_value in enumerate(unique_resources)}
        log_file[resource_col] = log_file[resource_col].replace(resource_mapping)

        # Make sure that a 0 in the following columns is replaced by NaN
        if self.log_name == 'bpic2017':
            columns_to_check = ['firstwithdrawalamount', 'monthlycost', 'creditscore', 'offeredamount', "numberofterms"]
            log_file[columns_to_check] = log_file[columns_to_check].replace(0, pd.NA)

        # There are some columns that have NaN up until a certain point in a case, then they are filled in one row. Make sure that they have the same value starting from the first occurrence up until the next change
        filled_file = log_file.groupby(case_id_col, group_keys=False).ffill()
        # make sure to have column case_id_col still in the dataframe
        filled_file[case_id_col] = log_file[case_id_col]
        log_file = filled_file

        # activities_to_check = incomplete_dict[log_name]  # ["A_APPROVED", "A_REGISTERED", "A_ACTIVATED", "A_CANCELLED", "A_DECLINED"]
        activities_to_check = incomplete_dict[self.log_name]  # ["A_APPROVED", "A_REGISTERED", "A_ACTIVATED", "A_CANCELLED", "A_DECLINED"]              
        contains_activity = log_file[activity_col].isin(activities_to_check).groupby(log_file[case_id_col]).max().reset_index()
        complete_cases = contains_activity[contains_activity[activity_col] == True][case_id_col]
        log_file = log_file[log_file[case_id_col].isin(complete_cases.tolist())]       

        # Calculate elapsed time and print resource usage metrics
        end_time = time.time()
        elapsed_time = end_time - start_time
        self._print_metrics(elapsed_time)
        return log_file
    
    # @profile
    def extract_temporal_features(self, log_file):
        start_time = time.time()
        print("Extracting timestamp features...")

        # Calculate event_nr
        log_file['event_nr'] = log_file.groupby(case_id_col).cumcount() + 1

        # Calculate case_length
        log_file['case_length'] = log_file.groupby(case_id_col)['event_nr'].transform('max')

        log_file['elapsed_time'] = log_file[timestamp_col] - log_file.groupby(case_id_col)[timestamp_col].transform('min')
        log_file['elapsed_time'] = log_file['elapsed_time'].dt.total_seconds()

        # Calculate elapsed time and print resource usage metrics
        end_time = time.time()
        elapsed_time = end_time - start_time
        self._print_metrics(elapsed_time)

        return log_file

In [3]:
# Usage example:
results_data = []
# Initialize your DataPreprocessorModinDask object and call read_log() and clean_data()
for log_name in logs:
    print("\n==================\n Log: %s\n==================\n" % (log_name,))
    input_data_path = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\%s\\%s.csv" % (log_name, log_name)
    output_data_path = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\prepared_data\\%s" % log_name

    data_preprocessor = DataPreprocessorModinDask(log_name, input_data_path=input_data_path)
    log_file = data_preprocessor.read_log()
    cleaned_data = data_preprocessor.clean_data(log_file)
    features_data = data_preprocessor.extract_temporal_features(cleaned_data)
    features_data.name = "%s_cleaned.csv" % log_name
    results_data.append(features_data)
    
    print("Saving csv file...")
    results_dir = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\%s" % log_name
    import os

    if not os.path.exists(os.path.join(results_dir)):
        os.makedirs(os.path.join(results_dir))

    features_data.to_csv(
        os.path.join(
            results_dir, features_data.name
        ),
        index=False,
        sep=";",
    )

    print("Done!")


 Log: bpic2012

Reading log...
Cleaning data...
Extracting timestamp features...
Saving csv file...
Done!

 Log: bpic2017

Reading log...
Cleaning data...
Extracting timestamp features...
Saving csv file...
Done!


# Possible Interventions For bpic2017 from the winner student report:

1. **Sending Another Loan Offer:**
   - *Intervention:* Send offers to clients as soon as possible. For all case endpoints, this has been shown to have the greatest effect on cancellation rates. Sending offers to clients within 4 days may decrease cancellation rates by 5% up to 10%.
   - *Treatment 1:* Cases that receive only one offer are in the control group, while cases that receive more than one offer are in the treatment group.



In [4]:
import pandas as pd
from tqdm import tqdm

path_17 = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\bpic2017\\bpic2017_cleaned.csv"
path_12 = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\bpic2012\\bpic2012_cleaned.csv"

results_data = [pd.read_csv(path_12, sep=";"), pd.read_csv(path_17, sep=";")]

results_data2 = []
for log_name in logs:
    print("\n==================\n Log: %s\n==================\n" % (log_name,))
    positive_activities = positive_activities_dict[log_name]

    # Initialize a global state to keep track of positive activities across cases
    global_positive_cases = set()

    # Define a function to label each group
    def label_group(chunk):
        # Check if any positive activity exists in the case
        chunk['is_positive'] = chunk.groupby(case_id_col)[activity_col].transform(lambda x: any(activity in positive_activities for activity in x))
        
        # Update the global positive cases state
        positive_cases_from_chunk = set(chunk.loc[chunk['is_positive'], case_id_col])
        global_positive_cases.update(positive_cases_from_chunk)
        
        # Assign label based on the result
        chunk['outcome'] = chunk.apply(lambda row: pos_label if row['is_positive'] else neg_label, axis=1)
        
        return chunk.drop('is_positive', axis=1)

    features_data = results_data[logs.index(log_name)]

    grouped = features_data.groupby(case_id_col, as_index=False)
    c = 0
    results = []
    for name, group in tqdm(grouped, desc="Labeling groups"):
        if c % 3000 == 0:
            print("This is chunk number: ", c)
        # Reset the index for each group
        group.reset_index(drop=True, inplace=True)
        labeled_group = label_group(group)
        results.append(labeled_group)
        c += 1

    # Concatenate the results of all chunks into a final DataFrame
    labeled_data = pd.concat(results, ignore_index=True)
    results_data2.append(labeled_data)

    labeled_data.name = "%s_labeled.csv" % log_name

    print("Saving csv file...")
    results_dir = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\%s" % log_name
    import os

    if not os.path.exists(os.path.join(results_dir)):
        os.makedirs(os.path.join(results_dir))

    labeled_data.to_csv(
        os.path.join(
            results_dir, labeled_data.name
        ),
        index=False,
        sep=";",
    )

    print("Done!\n")


 Log: bpic2012



Labeling groups:   1%|          | 75/12688 [00:00<00:17, 741.23it/s]

This is chunk number:  0


Labeling groups:  25%|██▍       | 3169/12688 [00:03<00:11, 850.79it/s]

This is chunk number:  3000


Labeling groups:  48%|████▊     | 6119/12688 [00:07<00:08, 767.90it/s]

This is chunk number:  6000


Labeling groups:  72%|███████▏  | 9177/12688 [00:11<00:04, 857.26it/s]

This is chunk number:  9000


Labeling groups:  95%|█████████▌| 12104/12688 [00:15<00:00, 785.05it/s]

This is chunk number:  12000


Labeling groups: 100%|██████████| 12688/12688 [00:16<00:00, 775.33it/s]


Saving csv file...
Done!


 Log: bpic2017



Labeling groups:   0%|          | 78/31411 [00:00<02:05, 250.06it/s]

This is chunk number:  0


Labeling groups:  10%|▉         | 3139/31411 [00:04<00:39, 723.60it/s]

This is chunk number:  3000


Labeling groups:  19%|█▉        | 6119/31411 [00:09<00:41, 615.92it/s]

This is chunk number:  6000


Labeling groups:  29%|██▉       | 9105/31411 [00:13<00:30, 743.19it/s]

This is chunk number:  9000


Labeling groups:  39%|███▊      | 12152/31411 [00:18<00:27, 710.76it/s]

This is chunk number:  12000


Labeling groups:  48%|████▊     | 15135/31411 [00:22<00:21, 746.85it/s]

This is chunk number:  15000


Labeling groups:  58%|█████▊    | 18105/31411 [00:26<00:20, 646.95it/s]

This is chunk number:  18000


Labeling groups:  67%|██████▋   | 21076/31411 [00:30<00:13, 777.27it/s]

This is chunk number:  21000


Labeling groups:  77%|███████▋  | 24143/31411 [00:35<00:10, 720.66it/s]

This is chunk number:  24000


Labeling groups:  86%|████████▋ | 27107/31411 [00:39<00:06, 690.04it/s]

This is chunk number:  27000


Labeling groups:  96%|█████████▌| 30150/31411 [00:44<00:01, 682.16it/s]

This is chunk number:  30000


Labeling groups: 100%|██████████| 31411/31411 [00:46<00:00, 680.18it/s]


Saving csv file...
Done!



In [8]:
import pandas as pd
from tqdm import tqdm

path_17 = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\bpic2017\\bpic2017_labeled.csv"
path_12 = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\bpic2012\\bpic2012_labeled.csv"

results_data2 = [pd.read_csv(path_12, sep=";"), pd.read_csv(path_17, sep=";")]

results_data3 = []

relevant_activities_dict = {
    "bpic2017": ['O_Sent (mail and online)', 'O_Sent (online only)'],
    "bpic2012": ['O_SENT'],
    "trafficFines": ['Add penalty']
    }

# Function for Treatment 1: Increase the number of offers
def apply_treatment1(group, log_name):
    if log_name == "bpic2017":     
        group['treatment'] = 0

        # IMPORTANT NOTE: IN PREVIOUS PAPERS --> INTERVENTION IS SENDING MULTIPLE OFFERS, HOWEVER, THE DECISION IS ALREADY MADE WHEN THAT OFFER IS CREATED FOR BPIC2017 !
        # the offer is not always sent when Create Offer occurs, but this is the actual treatment we are interested, we just have to make sure that the previous offer was already sent to the client

        # indices = group.index[(group[activity_col] == 'O_Sent (mail and online)') | (group[activity_col] == 'O_Sent (online only)')].tolist()
        indices = group.index[(group[activity_col] == 'O_Create Offer')].tolist()
        indices_offer_sent = group.index[(group[activity_col] == 'O_Sent (mail and online)') | (group[activity_col] == 'O_Sent (online only)')].tolist()
        # check the case_nr
        if group[case_id_col].iloc[0] == "Application_1001114274":
            print("Case 1001114274")
            print(group[activity_col])
            print(indices)
            print(indices_offer_sent)
            print('lol')
        if len(indices) > 1 and len(indices_offer_sent) > 0:
            # put it one event before the second offer, as the decision is made before the offer is sent
            # check whether the timestamp of the first offer sent is before the timestamp of the second offer
            if group.at[indices_offer_sent[0], timestamp_col] < group.at[indices[1], timestamp_col]:
                group.at[indices[1]-1, 'treatment'] = 1

        # Reset the index for each group
        group.reset_index(drop=True, inplace=True)

        return group

    elif log_name == "bpic2012":
        group["treatment"] = 0
        # indices = group.index[(group[activity_col] == 'O_SENT')].tolist()

        # IMPORTANT NOTE: IN PREVIOUS PAPERS --> INTERVENTION IS SENDING MULTIPLE OFFERS, HOWEVER, THE DECISION IS ALREADY MADE WHEN THAT OFFER IS SELECTED FOR BPIC2012 ! 
        # We just also make sure that the previous offer was also sent to the client (otherwise it's also the 'first offer')

        # SO DECISION IS: WE SENT AN OFFER, THE CLIENT REFUSED, SO DO WE CREATE A NEW OFFER OR NOT?

        indices = group.index[(group[activity_col] == 'O_SELECTED')].tolist()
        indices_offer_sent = group.index[(group[activity_col] == 'O_SENT')].tolist()
        if len(indices) > 1 and len(indices_offer_sent) > 0:
            # put it one event before the second offer, as the decision is made before the offer is sent
            # check whether the timestamp of the first offer sent is before the timestamp of the second offer
            if group.at[indices_offer_sent[0], timestamp_col] < group.at[indices[1], timestamp_col]:
                group.at[indices[1]-1, 'treatment'] = 1
        
        # Reset the index for each group
        group.reset_index(drop=True, inplace=True)
        return group


treatments_functions = [apply_treatment1,] # apply_treatment2, apply_treatment3, apply_treatment4]

# Set the maximum number of cases per chunk
max_cases_per_chunk = 100

# List to store futures
futures = []

max_workers = 1  # You can adjust this number based on your system's capabilities

def determine_overall_treatment(row):
    treatments = ['Treatment1', 'Treatment2', 'Treatment3', 'Treatment4']
    for treatment in treatments:
        if row[treatment] == 'Treatment':
            return treatment
    return 'Controle'

for log_name in logs:
    print("\n==================\n Log: %s\n==================\n" % (log_name,))    
    relevant_activities = relevant_activities_dict[log_name]
    labeled_data = results_data2[logs.index(log_name)]
    futures = []
    results = []
    for treatment_function in treatments_functions:
        grouped = labeled_data.groupby(case_id_col, as_index=False)
        c = 0
        for name, group in tqdm(grouped, desc="Applying Treatments"):
            if c % 5000 == 0:
                print("This is chunk number: ", c)
            # Reset the index for each group
            group.reset_index(drop=True, inplace=True)
            treated_chunk = treatment_function(group.copy(), log_name)
            results.append(treated_chunk)
            c += 1
    final_result = pd.concat(results, ignore_index=True)
    final_result.reset_index(drop=True, inplace=True)  # Resetting index to avoid duplicate indices
    # add time to last event column
    activities_to_track = incomplete_dict[log_name]

    # Display the final result
    print("\nDone! - Final DataFrame:\n")
    final_result.name = "%s_treatments.csv" % log_name

    print("Saving csv file...")
    results_dir = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\%s" % log_name
    import os

    results_data3.append(final_result)

    if not os.path.exists(os.path.join(results_dir)):
        os.makedirs(os.path.join(results_dir))

    final_result.to_csv(
        os.path.join(
            results_dir, final_result.name
        ),
        index=False,
        sep=";",
    )

    print("Done!\n")
    final_result = None


 Log: bpic2012



Applying Treatments:   0%|          | 0/12688 [00:00<?, ?it/s]

This is chunk number:  0


Applying Treatments:  41%|████▏     | 5240/12688 [00:04<00:05, 1370.61it/s]

This is chunk number:  5000


Applying Treatments:  79%|███████▊  | 9968/12688 [00:07<00:01, 1392.28it/s]

This is chunk number:  10000


Applying Treatments: 100%|██████████| 12688/12688 [00:09<00:00, 1298.59it/s]



Done! - Final DataFrame:

Saving csv file...
Done!


 Log: bpic2017



Applying Treatments:   0%|          | 1/31411 [00:00<1:18:28,  6.67it/s]

This is chunk number:  0
Case 1001114274
0       A_Create Application
1                A_Submitted
2             W_Handle leads
3             W_Handle leads
4     W_Complete application
               ...          
64    W_Validate application
65               O_Cancelled
66               O_Cancelled
67               O_Cancelled
68               O_Cancelled
Name: activity, Length: 69, dtype: object
[10, 18, 37, 48, 51]
[12, 20, 39, 50, 53]


Applying Treatments:   0%|          | 18/31411 [01:50<53:19:45,  6.12s/it]


KeyboardInterrupt: 

In [6]:
path_17 = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\bpic2017\\bpic2017_treatments.csv"
path_12 = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\bpic2012\\bpic2012_treatments.csv"

results_data3 = [pd.read_csv(path_12, sep=";"), pd.read_csv(path_17, sep=";")]

resource_max = 5
case_goal_max = 6

for log_name in logs:
    df = results_data3[logs.index(log_name)]

    # for every case_id group, check how whether "O_CREATED" occured more than once, and then check if the treatment is 0, if so, add to a counter
    # group = df.groupby(case_id_col)
    # counter = 0
    # treated_counter = 0
    # nr_treated_cases = 0
    # created_counter = 0
    # if log_name == 'bpic2017':
    #     activity_to_check_now = "O_Create Offer"
    # else:
    #     activity_to_check_now = "O_SELECTED"
    # for name, group in group:
    #     if group[activity_col].eq(activity_to_check_now).sum() > 1:
    #         created_counter += 1
    #     if group["treatment"].eq(1).sum() == 1:
    #         nr_treated_cases += 1
    #     if group[activity_col].eq(activity_to_check_now).sum() > 1 and group['treatment'].eq(1).sum() >= 1:
    #         treated_counter += 1
    #     if group[activity_col].eq(activity_to_check_now).sum() > 1 and group['treatment'].eq(0).all():
    #         print('lol')
    #         counter += 1

    # for each value in column 'resource', count in how many unique cases it appears, and get this in a separate df with columns 'resource' and 'case_count'
    resource_df = df.groupby('resource')[case_id_col].nunique().reset_index()
    resource_df.columns = ['resource', 'case_count']
    # sort
    resource_df = resource_df.sort_values(by='case_count', ascending=False)
    # print the top 10
    print(resource_df.head(resource_max))
    # then make sure that in the resource column of the original df, only the top 10 are specified, and all others are set to 'other'
    top_resources = resource_df.head(resource_max)['resource'].tolist()
    df['resource'] = df['resource'].apply(lambda x: x if x in top_resources else 'other')
    # print the unique values in the resource column
    print(df['resource'].unique())

    #drop the coluns 'unnamed:_0' and 'case_length'
    if 'case_length' in df.columns:
        df = df.drop(columns=['case_length'])
    if 'unnamed: 0' in df.columns:
        df = df.drop(columns=['unnamed: 0'])
    if 'end_time' in df.columns:
        df = df.drop(columns=['end_time'])
    if 'reg_date' in df.columns:
        df = df.drop(columns=['reg_date'])
    df = df.reset_index(drop=True)

    if 'case:loangoal' in df.columns:
        # if the value is 'Unknown', or 'Not specified', set it to 'Unknown'
        df['case:loangoal'] = df['case:loangoal'].apply(lambda x: 'Unknown' if x == 'Not speficied' else x)
        # Now do the same for the case loan goal column
        case_goal_df = df.groupby('case:loangoal')[case_id_col].nunique().reset_index()
        case_goal_df.columns = ['case_goal', 'case_count']
        print(case_goal_df)
        # sort
        case_goal_df = case_goal_df.sort_values(by='case_count', ascending=False)
        # print all unique values
        print(case_goal_df['case_goal'].unique())
        # print the top 10
        print(case_goal_df.head(case_goal_max))
        # then make sure that in the case:loangoal column of the original df, only the top 10 are specified, and all others are set to 'other'
        top_case_goals = case_goal_df.head(case_goal_max)['case_goal'].tolist()
        df['case:loangoal'] = df['case:loangoal'].apply(lambda x: x if x in top_case_goals else 'Other, see explanation')
        # print the unique values in the case:loangoal column
        print(df['case:loangoal'].unique())

    # check if the case_nr column is a string or not. It it is a string, take only the 'number' part of the string, and convert it to an integer
    if df[case_id_col].dtype == 'O':
        df[case_id_col] = df[case_id_col].apply(lambda x: int(x.split('_')[-1]))

    name = "%s_final.csv" % log_name

    # save
    import os
    results_dir = "C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\%s" % log_name
    df.to_csv(
        os.path.join(
            results_dir, name
        ),
        index=False,
        sep=";",
    )

   resource  case_count
0      res1       12688
22     res3        3513
42    res48        1689
34    res40        1665
21    res29        1642
['res1' 'res3' 'other' 'res48' 'res40' 'res29']
    resource  case_count
0       res1       23377
104     res6        6052
62     res21        5801
56     res16        4784
78     res36        4403
['res1' 'other' 'res21' 'res6' 'res36' 'res16']
                 case_goal  case_count
0                     Boat         201
1            Business goal          30
2                      Car        9307
3         Caravan / Camper         369
4       Debt restructuring           2
5   Existing loan takeover        5574
6     Extra spending limit         623
7         Home improvement        7646
8               Motorcycle         275
9   Other, see explanation        2976
10     Remaining debt home         835
11            Tax payments         152
12                 Unknown        3421
['Car' 'Home improvement' 'Existing loan takeover' 'Unknown'
 'O

In [4]:
import pandas as pd

df = pd.read_csv("C:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\ProCause\\data\\bpic2017\\bpic2017_final.csv", sep=";")