The data used for this notebook can be downloaded from https://www.kaggle.com/datasets/sst2023/kdd-cup-2015. After unzipping the archive, place all individual CSV files in a single 'KDD Cup 2015' folder for the code below to work. This is necessary because the Kaggle version artificially splits the data into train and test sets, whereas we want to work with the full data and create random train-test splits for cross-validation.

### Import libraries, load train and test CSVs, and merge

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Event logs: read both Kaggle halves and stack them into one frame.
df_train, df_test = (
    pd.read_csv(path)
    for path in ('KDD Cup 2015/log_train.csv', 'KDD Cup 2015/log_test.csv')
)
df = pd.concat((df_train, df_test), ignore_index=True)

# Enrollment records: same treatment as the logs.
df_enroll_train, df_enroll_test = (
    pd.read_csv(path)
    for path in ('KDD Cup 2015/enrollment_train.csv', 'KDD Cup 2015/enrollment_test.csv')
)
df_enroll = pd.concat((df_enroll_train, df_enroll_test), ignore_index=True)

# Course date table (joined on 'course_id' further down).
df_date = pd.read_csv('KDD Cup 2015/date.csv')

### Create basic event count and date features

In [None]:
# Per-event counters as (server counter, browser counter) pairs, listed in
# the order the counters appear in each enrollment record.
event_counters = {
    'navigate': ('nav_s', 'nav_b'),
    'access': ('acc_s', 'acc_b'),
    'problem': ('prob_s', 'prob_b'),
    'video': ('vid_s', 'vid_b'),
    'page_close': ('pc_s', 'pc_b'),
    'wiki': ('wik_s', 'wik_b'),
    'discussion': ('dis_s', 'dis_b'),
}
# Position of each source inside the counter pairs above.
source_slot = {'server': 0, 'browser': 1}

# enrollment_id -> record of event counts plus the distinct active dates.
enroll_dict = {}
for enroll_id, time, source, event in zip(df['enrollment_id'], df['time'], df['source'], df['event']):
    # The first 10 characters of the timestamp are the 'YYYY-MM-DD' date part.
    day = time[:10]

    stats = enroll_dict.get(enroll_id)
    if stats is None:
        # First row for this enrollment: start every counter at zero.
        stats = {'total': 0, 'server': 0, 'browser': 0}
        for server_key, browser_key in event_counters.values():
            stats[server_key] = 0
            stats[browser_key] = 0
        stats['day_count'] = 0
        stats['date_times'] = []
        enroll_dict[enroll_id] = stats

    stats['total'] += 1

    slot = source_slot.get(source)
    if slot is not None:
        # The per-source counter is keyed by the source name itself.
        stats[source] += 1
        counter_pair = event_counters.get(event)
        if counter_pair is not None:
            stats[counter_pair[slot]] += 1

    # Record each calendar date once, in order of first appearance.
    if day not in stats['date_times']:
        stats['date_times'].append(day)
        stats['day_count'] += 1

In [None]:
# Store, per enrollment, the number of days between its first and last
# active date.
for stats in enroll_dict.values():
    parsed_days = [datetime.strptime(day, '%Y-%m-%d') for day in stats['date_times']]
    stats['date_range'] = int((max(parsed_days) - min(parsed_days)).days)

In [None]:
# (output column, enroll_dict counter) pairs in output column order.
# The counters nav_b, vid_s, pc_s, wik_b and dis_b are intentionally left
# out: the original notebook marks each of them as "all zeros".
feature_columns = [
    ('total_count', 'total'),
    ('server_count', 'server'),
    ('browser_count', 'browser'),
    ('navigate_s', 'nav_s'),
    ('access_s', 'acc_s'),
    ('access_b', 'acc_b'),
    ('problem_s', 'prob_s'),
    ('problem_b', 'prob_b'),
    ('video_b', 'vid_b'),
    ('page_close_b', 'pc_b'),
    ('wiki_s', 'wik_s'),
    ('discussion_s', 'dis_s'),
    ('days_active_count', 'day_count'),
    ('date_range', 'date_range'),
]

# One row per enrollment, led by its id.
data = [
    {'enroll_id': e_id, **{column: stats[counter] for column, counter in feature_columns}}
    for e_id, stats in enroll_dict.items()
]

df_feature = pd.DataFrame(data)

### Add advanced date features (binary per day 1-30)

In [None]:
# Number of per-day activity flags (columns active_day_1 .. active_day_30),
# counted from each course's 'from' date.
n_days = 30

# Attach the course dates to every enrollment.
df_dates = pd.merge(df_enroll, df_date, how='left', on='course_id')

# enrollment_id -> course start date.
enrollment_from_dict = {}
for e_id, from_date in zip(df_dates['enrollment_id'], df_dates['from']):
    enrollment_from_dict[e_id] = datetime.strptime(from_date, '%Y-%m-%d')

# enrollment_id -> array with a 1 at every day offset that saw activity.
date_arr_dict = {}
for e_id in df_feature['enroll_id']:
    date_arr = np.zeros(n_days)
    from_date = enrollment_from_dict[e_id]
    for date_str in enroll_dict[e_id]['date_times']:
        date_object = datetime.strptime(date_str, '%Y-%m-%d')
        date_index = (date_object - from_date).days
        # Only offsets inside the window map to a column. Without this
        # check a negative offset (activity before the course start) would
        # wrap around and flag a day at the END of the window, and an
        # offset >= n_days would raise IndexError.
        if 0 <= date_index < n_days:
            date_arr[date_index] = 1

    date_arr_dict[e_id] = date_arr

# One column per day; day numbering in the column names is 1-based.
for i in range(n_days):
    feature_str = 'active_day_' + str(i + 1)
    df_feature[feature_str] = [date_arr_dict[e_id][i] for e_id in df_feature['enroll_id']]

### Merge feature df with labels

In [None]:
# The truth files are read with explicit column names.
truth_columns = ['enroll_id', 'dropout']
df_truth_train, df_truth_test = (
    pd.read_csv(path, names=truth_columns)
    for path in ('KDD Cup 2015/truth_train.csv', 'KDD Cup 2015/truth_test.csv')
)
df_truth = pd.concat((df_truth_train, df_truth_test), ignore_index=True)

# Left join keeps every labelled enrollment, including those that have no
# row in df_feature.
df_full = df_truth.merge(df_feature, how='left', on='enroll_id')

### Add features signaling earlier courses completed or dropped out

Note that dropout information becomes available only 10 days after the end of a course; see https://www.biendata.xyz/competition/kddcup2015/ for more details. For predictions to be fair, we can only include 'historical' data from courses whose 'to' date is more than 10 days before the end of the current course.

In [None]:
# enrollment_id -> user, course end date and dropout label (0 until the
# label from df_full is filled in below).
e_dict = {
    e_id: {'u_id': u_id, 'to_date': datetime.strptime(to_date, '%Y-%m-%d'), 'dropout': 0}
    for e_id, u_id, to_date in zip(df_dates['enrollment_id'], df_dates['username'], df_dates['to'])
}

for e_id, dropout in zip(df_full['enroll_id'], df_full['dropout']):
    e_dict[e_id]['dropout'] = dropout

# user -> parallel lists of course end dates and dropout labels over all of
# that user's labelled enrollments.
u_dict = {}
for e_id in df_full['enroll_id']:
    record = e_dict[e_id]
    history = u_dict.setdefault(record['u_id'], {'to_dates': [], 'dropouts': []})
    history['to_dates'].append(record['to_date'])
    history['dropouts'].append(record['dropout'])

prev_complete = np.zeros(len(df_full))
prev_dropout = np.zeros(len(df_full))
for idx, e_id in enumerate(df_full['enroll_id']):
    record = e_dict[e_id]
    history = u_dict[record['u_id']]

    for earlier_date, earlier_dropout in zip(history['to_dates'], history['dropouts']):
        # Only courses that ended more than 10 days before this course's
        # end date count as history.
        if (record['to_date'] - earlier_date).days <= 10:
            continue
        if earlier_dropout == 1:
            prev_dropout[idx] += 1
        else:
            prev_complete[idx] += 1

df_full['prev_complete'] = prev_complete.tolist()
df_full['prev_dropout'] = prev_dropout.tolist()

### Store data as CSV

In [None]:
# Write the labelled feature table to disk without the DataFrame index.
df_full.to_csv(path_or_buf='kdd_cup_2015_features.csv', index=False)