In [6]:
import pandas as pd
df_bpi= pd.read_csv("BPI_Challenge_2017.csv")
df_bpi['time'] = pd.to_datetime(df_bpi['time'], errors='coerce')


In [7]:
# Function to create prefixes and compute total time of prefix in seconds
def make_prefixes(dftest):
    dftest['prefix'] = ''  # Initialize the 'prefix' column
    dftest['total_time'] = None  # Initialize the 'total_time' column

    # Group the DataFrame by case_id
    grouped = dftest.groupby('case')

    # Iterate through each case group
    for case_id, group in grouped:
        prefix_list = []  # Initialize an empty list to track prefixes
        first_time = None  # To track the first activity time
        for index, row in group.iterrows():
            # Append the current event to the prefix list
            prefix_list.append(row['event'])

            # Track the time of the first activity in the prefix
            if first_time is None:
                first_time = row['time']

            # Calculate the total time difference (current time - first activity time)
            total_time = (row['time'] - first_time).total_seconds()  # in seconds

            # Assign the updated prefix and total time to the respective columns
            dftest.at[index, 'prefix'] = prefix_list.copy()
            dftest.at[index, 'total_time'] = total_time

    # Ensure the prefix column contains lists
    dftest['prefix'] = dftest['prefix'].apply(lambda x: list(x))
    
    # Optional: Drop any rows if required
    dftest = dftest[:-1]
    return dftest

# Call the function
df = make_prefixes(df_bpi)

# Display the result
df


Unnamed: 0,case,event,time,lifecycle:transition,ApplicationType,LoanGoal,RequestedAmount,MonthlyCost,org:resource,Selected,...,OfferID,FirstWithdrawalAmount,Action,Accepted,CreditScore,NumberOfTerms,EventOrigin,OfferedAmount,prefix,total_time
0,Application_652823628,A_Create Application,2016-01-01 10:51:15.304,COMPLETE,New credit,Existing loan takeover,20000.0,,User_1,,...,,,Created,,,,Application,,[A_Create Application],0.0
1,Application_652823628,A_Submitted,2016-01-01 10:51:15.352,COMPLETE,New credit,Existing loan takeover,20000.0,,User_1,,...,,,statechange,,,,Application,,"[A_Create Application, A_Submitted]",0.048
2,Application_652823628,W_Handle leads,2016-01-01 10:51:15.774,SCHEDULE,New credit,Existing loan takeover,20000.0,,User_1,,...,,,Created,,,,Workflow,,"[A_Create Application, A_Submitted, W_Handle l...",0.47
3,Application_652823628,W_Handle leads,2016-01-01 10:52:36.392,WITHDRAW,New credit,Existing loan takeover,20000.0,,User_1,,...,,,Deleted,,,,Workflow,,"[A_Create Application, A_Submitted, W_Handle l...",81.088
4,Application_652823628,W_Complete application,2016-01-01 10:52:36.403,SCHEDULE,New credit,Existing loan takeover,20000.0,,User_1,,...,,,Created,,,,Workflow,,"[A_Create Application, A_Submitted, W_Handle l...",81.099
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1202261,Application_1350494635,W_Call after offers,2017-01-02 20:29:25.989,SUSPEND,New credit,Home improvement,20000.0,,User_96,,...,,,Released,,,,Workflow,,"[A_Create Application, A_Submitted, W_Handle l...",165092.773
1202262,Application_1350494635,W_Call after offers,2017-01-06 07:33:02.212,ATE_ABORT,New credit,Home improvement,20000.0,,User_1,,...,,,Deleted,,,,Workflow,,"[A_Create Application, A_Submitted, W_Handle l...",464108.996
1202263,Application_1350494635,W_Call after offers,2017-01-06 07:33:02.221,SCHEDULE,New credit,Home improvement,20000.0,,User_1,,...,,,Created,,,,Workflow,,"[A_Create Application, A_Submitted, W_Handle l...",464109.005
1202264,Application_1350494635,A_Cancelled,2017-01-16 10:51:21.114,COMPLETE,New credit,Home improvement,20000.0,,User_28,,...,,,statechange,,,,Application,,"[A_Create Application, A_Submitted, W_Handle l...",1340007.898


In [8]:
df['prefix_length'] = df['prefix'].apply(len) #compute prefix length

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['prefix_length'] = df['prefix'].apply(len) #compute prefix length


In [9]:
case_traces = df.groupby('case')['event'].apply(list)

# Map the aggregated traces back to the original DataFrame
df['case_trace'] = df['case'].map(case_traces)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['case_trace'] = df['case'].map(case_traces)


In [11]:
df['A_Cancelled_in_prefix'] = df['prefix'].apply(lambda x: x.count('A_Cancelled')) #New column which checks if A_cancelled has happened in the prefix

df['A_Cancelled_occured'] = df['case_trace'].apply(lambda x: x.count('A_Cancelled')) #New col which checks if A_Cancelled has/will happen in the case

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['A_Cancelled_in_prefix'] = df['prefix'].apply(lambda x: x.count('A_Cancelled'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['A_Cancelled_occured'] = df['case_trace'].apply(lambda x: x.count('A_Cancelled'))


In [15]:
df = df[(df['A_Cancelled_in_prefix'] != 1)] #Remove rows where A_Cancelled has already happened

In [16]:
# Create new columns which are index encodings 
def index_encoding(df_testje):

    df_testje.reset_index(inplace=True)
    max_length = df_testje['prefix'].apply(len).max()  # Find the longest prefix
    expanded_prefix = pd.DataFrame(df_testje['prefix'].tolist(), columns=[f'event_{i+1}' for i in range(max_length)])

    # Concatenate the original dataframe with the expanded prefixes
    #df_testje = pd.concat([df_testje, expanded_prefix], axis=1)
    expanded_prefix

    result = df_testje.join(expanded_prefix)
    result.drop('index', axis=1)
    return result



In [17]:
df = index_encoding(df)
df

Unnamed: 0,index,case,event,time,lifecycle:transition,ApplicationType,LoanGoal,RequestedAmount,MonthlyCost,org:resource,...,event_171,event_172,event_173,event_174,event_175,event_176,event_177,event_178,event_179,event_180
0,0,Application_652823628,A_Create Application,2016-01-01 10:51:15.304,COMPLETE,New credit,Existing loan takeover,20000.0,,User_1,...,,,,,,,,,,
1,1,Application_652823628,A_Submitted,2016-01-01 10:51:15.352,COMPLETE,New credit,Existing loan takeover,20000.0,,User_1,...,,,,,,,,,,
2,2,Application_652823628,W_Handle leads,2016-01-01 10:51:15.774,SCHEDULE,New credit,Existing loan takeover,20000.0,,User_1,...,,,,,,,,,,
3,3,Application_652823628,W_Handle leads,2016-01-01 10:52:36.392,WITHDRAW,New credit,Existing loan takeover,20000.0,,User_1,...,,,,,,,,,,
4,4,Application_652823628,W_Complete application,2016-01-01 10:52:36.403,SCHEDULE,New credit,Existing loan takeover,20000.0,,User_1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168381,1202259,Application_1350494635,W_Call after offers,2017-01-02 20:27:20.472,START,New credit,Home improvement,20000.0,,User_96,...,,,,,,,,,,
1168382,1202260,Application_1350494635,A_Complete,2017-01-02 20:27:20.474,COMPLETE,New credit,Home improvement,20000.0,,User_96,...,,,,,,,,,,
1168383,1202261,Application_1350494635,W_Call after offers,2017-01-02 20:29:25.989,SUSPEND,New credit,Home improvement,20000.0,,User_96,...,,,,,,,,,,
1168384,1202262,Application_1350494635,W_Call after offers,2017-01-06 07:33:02.212,ATE_ABORT,New credit,Home improvement,20000.0,,User_1,...,,,,,,,,,,


In [19]:
df = df.drop('index', axis=1)

In [24]:
df_x.columns.to_list()

['case',
 'event',
 'time',
 'lifecycle:transition',
 'ApplicationType',
 'LoanGoal',
 'RequestedAmount',
 'MonthlyCost',
 'org:resource',
 'Selected',
 'EventID',
 'OfferID',
 'FirstWithdrawalAmount',
 'Action',
 'Accepted',
 'CreditScore',
 'NumberOfTerms',
 'EventOrigin',
 'OfferedAmount',
 'prefix',
 'total_time',
 'prefix_length',
 'case_trace',
 'A_Cancelled_in_prefix',
 'A_Cancelled_occured',
 'event_1',
 'event_2',
 'event_3',
 'event_4',
 'event_5',
 'event_6',
 'event_7',
 'event_8',
 'event_9',
 'event_10',
 'event_11',
 'event_12',
 'event_13',
 'event_14',
 'event_15',
 'event_16',
 'event_17',
 'event_18',
 'event_19',
 'event_20',
 'event_21',
 'event_22',
 'event_23',
 'event_24',
 'event_25',
 'event_26',
 'event_27',
 'event_28',
 'event_29',
 'event_30',
 'event_31',
 'event_32',
 'event_33',
 'event_34',
 'event_35',
 'event_36',
 'event_37',
 'event_38',
 'event_39',
 'event_40',
 'event_41',
 'event_42',
 'event_43',
 'event_44',
 'event_45',
 'event_46',
 'event_

In [32]:
df_x.isnull().sum(axis=0) #Check for Nan values in columns

lifecycle:transition          0
ApplicationType               0
LoanGoal                      0
RequestedAmount               0
org:resource                  0
                         ...   
event_176               1168376
event_177               1168378
event_178               1168380
event_179               1168382
event_180               1168384
Length: 193, dtype: int64

In [34]:
df_x = df.copy()
remove_these_cols = ['case', 'event', 'time', 'MonthlyCost', 'Selected', 'OfferID', 
                     'FirstWithdrawalAmount', 'Accepted', 'CreditScore', 'NumberOfTerms', 
                     'OfferedAmount','A_Cancelled_occured']
df_x = df_x.drop(remove_these_cols, axis=1)
df_y = df[['A_Cancelled_occured']]
df_y.replace({0: False, 1:True}, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_y.replace({0: False, 1:True}, inplace=True)


In [35]:
df_y

Unnamed: 0,A_Cancelled_occured
0,False
1,False
2,False
3,False
4,False
...,...
1168381,True
1168382,True
1168383,True
1168384,True


In [36]:
df_x

Unnamed: 0,lifecycle:transition,ApplicationType,LoanGoal,RequestedAmount,org:resource,EventID,Action,EventOrigin,prefix,total_time,...,event_171,event_172,event_173,event_174,event_175,event_176,event_177,event_178,event_179,event_180
0,COMPLETE,New credit,Existing loan takeover,20000.0,User_1,Application_652823628,Created,Application,[A_Create Application],0.0,...,,,,,,,,,,
1,COMPLETE,New credit,Existing loan takeover,20000.0,User_1,ApplState_1582051990,statechange,Application,"[A_Create Application, A_Submitted]",0.048,...,,,,,,,,,,
2,SCHEDULE,New credit,Existing loan takeover,20000.0,User_1,Workitem_1298499574,Created,Workflow,"[A_Create Application, A_Submitted, W_Handle l...",0.47,...,,,,,,,,,,
3,WITHDRAW,New credit,Existing loan takeover,20000.0,User_1,Workitem_1673366067,Deleted,Workflow,"[A_Create Application, A_Submitted, W_Handle l...",81.088,...,,,,,,,,,,
4,SCHEDULE,New credit,Existing loan takeover,20000.0,User_1,Workitem_1493664571,Created,Workflow,"[A_Create Application, A_Submitted, W_Handle l...",81.099,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1168381,START,New credit,Home improvement,20000.0,User_96,Workitem_358206591,Obtained,Workflow,"[A_Create Application, A_Submitted, W_Handle l...",164967.256,...,,,,,,,,,,
1168382,COMPLETE,New credit,Home improvement,20000.0,User_96,ApplState_1120616436,statechange,Application,"[A_Create Application, A_Submitted, W_Handle l...",164967.258,...,,,,,,,,,,
1168383,SUSPEND,New credit,Home improvement,20000.0,User_96,Workitem_146325658,Released,Workflow,"[A_Create Application, A_Submitted, W_Handle l...",165092.773,...,,,,,,,,,,
1168384,ATE_ABORT,New credit,Home improvement,20000.0,User_1,Workitem_1817549786,Deleted,Workflow,"[A_Create Application, A_Submitted, W_Handle l...",464108.996,...,,,,,,,,,,
