#### Imports

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns

from cairosvg import svg2png
from PIL import Image
from io import BytesIO
from tqdm import tqdm
# from sklearn.metrics import roc_auc_score

pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

KAGGLE = 0

#### Global Variables

In [2]:
TRAIN_CSV_PATH = 'train.feather'
QUESTIONS_CSV_PATH = 'questions.csv'
LECTURES_CSV_PATH = 'lectures.csv'
TEST_CSV_PATH = 'test.csv'
SAMPLE_CSV_PATH = 'example_sample_submission.csv'

if KAGGLE:
    TRAIN_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/train.csv'
    QUESTIONS_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/questions.csv'
    LECTURES_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/lectures.csv'
    TEST_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/test.csv'
    SAMPLE_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv'

COLUMN_TYPES = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}

In [3]:
# train_df = pd.read_csv(TRAIN_CSV_PATH, nrows=10**5, dtype=COLUMN_TYPES)
train_df = pd.read_feather(TRAIN_CSV_PATH)
train_df = train_df.iloc[:3 * (10 ** 5)]
questions_df = pd.read_csv(QUESTIONS_CSV_PATH)
lectures_df = pd.read_csv(LECTURES_CSV_PATH)

### Check for question specific features

In [4]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.00000,False
2,2,118363,115,128,0,0,0,1,55000.00000,False
3,3,131167,115,7860,0,3,0,1,19000.00000,False
4,4,137965,115,7922,0,4,1,1,11000.00000,False
...,...,...,...,...,...,...,...,...,...,...
299995,299995,410223,5615405,2946,0,6,0,0,39000.00000,False
299996,299996,528172,5615405,2594,0,7,2,1,27666.00000,False
299997,299997,528172,5615405,2593,0,7,3,1,27666.00000,False
299998,299998,528172,5615405,2595,0,7,3,0,27666.00000,False


In [5]:
questions_df[questions_df.tags.isna()] = questions_df[questions_df.tags.isna()].fillna("")
questions_df[questions_df.tags.isna()]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags


In [6]:
tag_counts = questions_df[['tags', 'question_id']].groupby('tags')['tags'].count()
tag_counts_list = list(tag_counts.sort_values().index.values)
questions_df['tag_count_wise_id'] = questions_df.apply(lambda row: tag_counts_list.index(row.tags), axis=1)

In [7]:
questions_df['number_of_tags'] = questions_df.apply(lambda row: len(row.tags.split(" ")), axis=1)


### The lecture specific data creation

In [8]:
# The lecture based dict for type of and part
lecture_dict = lectures_df.set_index("lecture_id").to_dict(orient="index")
lecture_comprehensive_type_of_dict = lectures_df.set_index('type_of').groupby(level=0).apply(lambda row: row.to_dict('list')).to_dict()
lecture_comprehensive_part_dict = lectures_df.set_index('part').groupby(level=0).apply(lambda row: row.to_dict('list')).to_dict()

In [9]:
for key in lecture_comprehensive_part_dict:
    lecture_comprehensive_part_dict[key]['tag'] = list(set(lecture_comprehensive_part_dict[key]['tag']))
    lecture_comprehensive_part_dict[key]['type_of'] = list(set(lecture_comprehensive_part_dict[key]['type_of']))

for key in lecture_comprehensive_type_of_dict:
    lecture_comprehensive_type_of_dict[key]['tag'] = list(set(lecture_comprehensive_type_of_dict[key]['tag']))
    lecture_comprehensive_type_of_dict[key]['part'] = list(set(lecture_comprehensive_type_of_dict[key]['part']))

### common function

In [10]:
def convert_timestamp_in_parts(millis):
    seconds = (millis / 1000)
    minutes = (seconds / 60)
    hours = (minutes / 60)
    days = (hours / 24)
    return days, hours, minutes, seconds

def get_timestamp_in_parts(row):
    millis = row.timestamp
    seconds = (millis / 1000)
    minutes = (seconds / 60)
    hours = (minutes / 60)
    days = (hours / 24)
    return days, hours

def get_prior_elasped_time_in_parts(row):
    prior_millis = row.prior_question_elapsed_time
    prior_seconds = (prior_millis / 1000)
    prior_minutes = (prior_seconds / 60)
    return prior_minutes

### Set User specific features

##### 1. Get timestamp as hours and minutes

In [11]:
train_df[['days', 'hours']] = [*train_df.apply(lambda row: get_timestamp_in_parts(row), axis=1)]
train_df['prior_minutes'] = train_df.apply(lambda row: get_prior_elasped_time_in_parts(row), axis=1)

##### 2. Find the intro rows.

In [13]:
starter_ids = train_df[(train_df.timestamp == 0)].task_container_id.unique()
train_df['intro_section'] = train_df.apply(lambda row: (row.task_container_id in starter_ids or row.task_container_id == 0), axis=1)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,prior_minutes,intro_section
0,0,0,115,5692,0,1,3,1,,,0.0,0.0,,True
1,1,56943,115,5716,0,2,2,1,37000.0,False,0.00066,0.01582,0.61667,False
2,2,118363,115,128,0,0,0,1,55000.0,False,0.00137,0.03288,0.91667,True
3,3,131167,115,7860,0,3,0,1,19000.0,False,0.00152,0.03644,0.31667,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,0.0016,0.03832,0.18333,False


##### 3. The actions after the lectures should be tagged with something.

In [14]:
question_dict = questions_df.set_index("question_id").to_dict(orient="index")

In [15]:
# https://en.wikipedia.org/wiki/Forgetting_curve
# https://psychology.stackexchange.com/questions/5199/which-equation-is-ebbinghauss-forgetting-curve-and-what-do-the-constants-repres
def apply_filter_based_on_previous_row(func):
    prev_row = {
        "has_seen_lecture": False,
        "previous_lecture_hours": 0,
        "previous_lecture": {},
        "user_id": ""
    }
    def wrapper(curr_row, **kwargs):
        content_id = curr_row['content_id']
        tag_of_prev_lecture = ""
        part_of_prev_lecture = ""
        has_seen_same_tag_as_lecture = False
        has_seen_same_part_as_lecture = False
        has_part_common_with_type_of = False
        has_tag_common_with_type_of = False
        has_tag_common_with_part_dict = False
        has_type_of_common_with_part_dict = False
        if prev_row['user_id'] == curr_row['user_id']:
            if curr_row['content_type_id'] != 0:
                prev_row['has_seen_lecture'] = True
                prev_row['previous_lecture_hours'] = curr_row['hours']
                prev_row['previous_lecture'] = lecture_dict[content_id]
            else:
                has_seen_same_tag_as_lecture = (str(prev_row['previous_lecture'].get("tag", "")) in str(question_dict[content_id]['tags']).split())
                has_seen_same_part_as_lecture = (prev_row['previous_lecture'].get("part") == question_dict[content_id]['part'])
                prev_type_of = prev_row['previous_lecture'].get("type_of")
                prev_part = prev_row['previous_lecture'].get("part")
                has_part_common_with_type_of = question_dict[content_id]['part'] in  lecture_comprehensive_type_of_dict.get(prev_type_of, {}).get('part', [])
                has_tag_common_with_type_of = bool(set(map(int, str(question_dict[content_id]['tags']).split())).intersection(lecture_comprehensive_type_of_dict.get(prev_type_of, {}).get('tag', [])))
                has_tag_common_with_part_dict = bool(set(map(int, str(question_dict[content_id]['tags']).split())).intersection(lecture_comprehensive_part_dict.get(prev_part, {}).get('tag', [])))
                has_type_of_common_with_part_dict = prev_type_of in lecture_comprehensive_part_dict[question_dict[content_id]['part']]['type_of']
                tag_of_prev_lecture = prev_row['previous_lecture'].get('tag', '')
                part_of_prev_lecture = prev_row['previous_lecture'].get('part')
        else:
            prev_row['user_id'] = curr_row['user_id']
            if curr_row['content_type_id'] != 0:
                prev_row['has_seen_lecture'] = True
                prev_row['previous_lecture_hours'] = curr_row['hours']
                prev_row['previous_lecture'] = lecture_dict[content_id]
            else:
                prev_row['has_seen_lecture'] = False
                prev_row['previous_lecture_hours'] = 0
                prev_row['previous_lecture'] = {}

        timestamp_difference = curr_row['hours'] - prev_row['previous_lecture_hours']
        retention = func(timestamp_difference)
        return retention, tag_of_prev_lecture, part_of_prev_lecture, prev_row['has_seen_lecture'], has_seen_same_tag_as_lecture, has_seen_same_part_as_lecture, has_part_common_with_type_of, has_tag_common_with_type_of, has_tag_common_with_part_dict, has_type_of_common_with_part_dict
    return wrapper

@apply_filter_based_on_previous_row
def running_retention(timestamp_difference):
    retention = 1.48 / ((1.25 * timestamp_difference) + 1.48)
    return retention

train_df["retention"], train_df["previous_lecture_tag"], train_df["previous_lecture_part"], train_df["has_seen_lecture_before"], train_df["has_seen_same_tag_as_lecture"], train_df["has_seen_same_part_as_lecture"], train_df["has_part_common_with_type_of"], train_df["has_tag_common_with_type_of"], train_df["has_tag_common_with_part_dict"], train_df["has_type_of_common_with_part_dict"] = zip(*train_df.apply(running_retention, axis=1))

In [16]:
train_df = train_df[train_df['content_type_id'] == 0]

##### 4. Now lets find out the lag time

In [17]:
# lag time
train_df['event_time'] = train_df[['user_id', 'timestamp']].groupby('user_id')['timestamp'].diff()
train_df['shift_event_time'] = train_df[['event_time', 'user_id']].groupby('user_id').event_time.shift(-1)
train_df['shift_elapsed_time'] = train_df[['prior_question_elapsed_time', 'user_id']].groupby('user_id').prior_question_elapsed_time.shift(-1)
train_df['event_lag_time'] = train_df['shift_event_time'] - train_df['shift_elapsed_time']

In [18]:
train_df_elapsed_time_groupby = train_df[['content_id', 'shift_elapsed_time']].groupby(['content_id']).shift_elapsed_time
individual_question_min_time_dict = train_df_elapsed_time_groupby.min().to_dict()
individual_question_mean_time_dict = train_df_elapsed_time_groupby.mean().to_dict()

del train_df_elapsed_time_groupby

In [19]:
# def get_lag_time(content_id, shift_event_time):
#     question_min_time = individual_question_min_time_dict[content_id]
#     return shift_event_time - question_min_time

# train_df['lag_time'] = train_df[train_df['content_type_id'] == 0].apply(lambda row: get_lag_time(row.content_id, row.shift_event_time), axis=1)

def get_prior_elapsed_time_difference(content_id, shift_elapsed_time):
    question_min_time = individual_question_min_time_dict[content_id]
    return shift_elapsed_time - question_min_time

train_df['lag_time'] = train_df.apply(lambda row: get_prior_elapsed_time_difference(row.content_id, row.shift_elapsed_time), axis=1)

In [20]:
def has_elapsed_time_greater_than_average_time(content_id, shift_elapsed_time):
    question_min_time = individual_question_mean_time_dict[content_id]
    return shift_elapsed_time > question_min_time

train_df['question_answered_late'] = train_df.apply(lambda row: has_elapsed_time_greater_than_average_time(row.content_id, row.shift_elapsed_time), axis=1)

In [21]:
average_question_timestamp_difference_dict = train_df[(train_df['shift_event_time'] < 3600000)][['content_id', 'shift_event_time']].groupby('content_id').shift_event_time.mean().to_dict()

In [22]:
def has_event_time_greater_than_average(content_id, event_time):
    average_question_time = average_question_timestamp_difference_dict.get(content_id, 0)
    return event_time > average_question_time

train_df['event_time_greater_than_average'] = train_df.apply(lambda row: has_event_time_greater_than_average(row.content_id, row.shift_event_time), axis=1)

##### 5. lets find the average time took for each question

In [23]:
# The average time per question can be considered based on the every candidate or based on the current candidate previous question answering time.

In [24]:
user_average_time_to_elapsed_dict = train_df.groupby("user_id").prior_question_elapsed_time.mean().to_dict()

In [25]:
train_df['question_took_more_than_average_user_time'] = train_df.apply(lambda row: row.prior_question_elapsed_time > user_average_time_to_elapsed_dict[row.user_id], axis=1)

In [26]:
# check average question answering time
average_question_prior_question_elapsed_time_dict = train_df.groupby(["content_id"]).prior_question_elapsed_time.mean().to_dict()

In [27]:
train_df.sort_values(['user_id', 'timestamp'], inplace=True)
bundle_time_relation_dict = {}

prev_row = {
    "user_id": "",
    "bundle_id": "",
    "previous_bundle_elapsed_time": ""
}


def get_question_time_taken(curr_row):
    current_bundle_id = question_dict[curr_row.content_id]['bundle_id']
    if prev_row['user_id'] == curr_row['user_id']:
        if prev_row['bundle_id'] == current_bundle_id:
            time_taken_for_question = prev_row['previous_bundle_elapsed_time']
        else:
            time_taken_for_question = curr_row['prior_question_elapsed_time']
            prev_row['previous_bundle_elapsed_time'] = curr_row['prior_question_elapsed_time']
            prev_row['bundle_id'] = current_bundle_id
    else:
        prev_row['bundle_id'] = current_bundle_id
        prev_row['previous_bundle_elapsed_time'] = curr_row['prior_question_elapsed_time']
        time_taken_for_question = 0
    prev_row['user_id'] = curr_row.user_id
    return time_taken_for_question


for index, row in train_df.iloc[::-1].iterrows():
    if row.content_id in bundle_time_relation_dict and bundle_time_relation_dict:
        bundle_time_relation_dict[row.content_id]['question_time_list'].append(get_question_time_taken(row))
    else:
        bundle_time_relation_dict[row.content_id] = {
            "question_time_list": [get_question_time_taken(row)]
        }

In [28]:
for content_id in bundle_time_relation_dict.keys():
    bundle_time_relation_dict[content_id]['minimum_time'] = min(bundle_time_relation_dict[content_id]['question_time_list'])
    bundle_time_relation_dict[content_id]['average_time'] = np.mean(bundle_time_relation_dict[content_id]['question_time_list'])

In [29]:
train_df['has_more_than_average_bundle_time'] = train_df.apply(lambda row: row.shift_elapsed_time < bundle_time_relation_dict[row.content_id]['average_time'], axis=1)

def get_bundle_lag_time(row):
    if row.shift_elapsed_time > bundle_time_relation_dict[row.content_id]['minimum_time']:
        return row.shift_elapsed_time - bundle_time_relation_dict[row.content_id]['minimum_time']
    return 0

train_df['bundle_lag_time'] = train_df.apply(lambda row: get_bundle_lag_time(row), axis=1)

##### 6. Find the toughest questions

In [30]:
# get all the complete tags of the question
questions_df['tags'] = questions_df['tags'].astype(str)

tags = [value.split() for value in questions_df[questions_df.tags != "nan"].tags.values]
tags = [item for elem in tags for item in elem]
tags = set(tags)
tags = list(tags)
print(f'There are {len(tags)} different tags')

There are 188 different tags


In [31]:
# Find the wrong and right question based on correctly answered
correct = train_df.groupby(["content_id", 'answered_correctly'], as_index=False).size()
correct = correct.pivot(index= "content_id", columns='answered_correctly', values='size')
correct.columns = ['wrong', 'right']
correct = correct.fillna(0)
correct[['wrong', 'right']] = correct[['wrong', 'right']].astype(int)
questions_df = questions_df.merge(correct, left_on = "question_id", right_on = "content_id", how = "left")
questions_df['percentage_correct'] = questions_df.right / (questions_df.right + questions_df.wrong)
questions_df.head()

# questions_df.drop(["wrong_x", "right_x", "wrong_y", "right_y"], axis=1, inplace=True)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,tag_count_wise_id,number_of_tags,wrong,right,percentage_correct
0,0,0,0,1,51 131 162 38,1055,4,1.0,23.0,0.95833
1,1,1,1,1,131 36 81,963,3,1.0,16.0,0.94118
2,2,2,0,1,131 101 162 92,574,4,58.0,77.0,0.57037
3,3,3,0,1,131 149 162 29,651,4,14.0,53.0,0.79104
4,4,4,3,1,131 5 162 38,910,4,42.0,47.0,0.52809


In [32]:
# TODO: This seems to be wrong please check again
train_df['question_bundle_id'] = train_df.apply(lambda row: question_dict[row.content_id].get('bundle_id'), axis=1)
correct = train_df.groupby(["question_bundle_id", 'answered_correctly'], as_index=False).size()
correct = correct.pivot(index= "question_bundle_id", columns='answered_correctly', values='size')
correct.columns = ['bundle_wrong', 'bundle_right']
correct = correct.fillna(0)
correct[['bundle_wrong', 'bundle_right']] = correct[['bundle_wrong', 'bundle_right']].astype(int)

questions_df = questions_df.merge(correct, left_on = "bundle_id", right_on = "question_bundle_id", how = "left")
questions_df['task_percentage_correct'] = questions_df.bundle_right / (questions_df.bundle_right + questions_df.bundle_wrong)
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,tag_count_wise_id,number_of_tags,wrong,right,percentage_correct,bundle_wrong,bundle_right,task_percentage_correct
0,0,0,0,1,51 131 162 38,1055,4,1.0,23.0,0.95833,1.0,23.0,0.95833
1,1,1,1,1,131 36 81,963,3,1.0,16.0,0.94118,1.0,16.0,0.94118
2,2,2,0,1,131 101 162 92,574,4,58.0,77.0,0.57037,58.0,77.0,0.57037
3,3,3,0,1,131 149 162 29,651,4,14.0,53.0,0.79104,14.0,53.0,0.79104
4,4,4,3,1,131 5 162 38,910,4,42.0,47.0,0.52809,42.0,47.0,0.52809


In [33]:
train_df.groupby(["task_container_id", 'answered_correctly'], as_index=False).count()

Unnamed: 0,task_container_id,answered_correctly,row_id,timestamp,user_id,content_id,content_type_id,user_answer,prior_question_elapsed_time,prior_question_had_explanation,...,shift_event_time,shift_elapsed_time,event_lag_time,lag_time,question_answered_late,event_time_greater_than_average,question_took_more_than_average_user_time,has_more_than_average_bundle_time,bundle_lag_time,question_bundle_id
0,0,0,306,306,306,306,306,306,5,5,...,306,306,306,306,306,306,306,306,306,306
1,0,1,707,707,707,707,707,707,12,12,...,706,706,706,706,707,707,707,707,707,707
2,1,0,485,485,485,485,485,485,482,482,...,484,484,484,484,485,485,485,485,485,485
3,1,1,527,527,527,527,527,527,522,522,...,527,527,527,527,527,527,527,527,527,527
4,2,0,575,575,575,575,575,575,575,575,...,575,575,575,575,575,575,575,575,575,575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10511,5626,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
10512,5627,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
10513,5628,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
10514,5629,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [34]:
train_df[train_df['task_container_id'] == 1]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,shift_event_time,shift_elapsed_time,event_lag_time,lag_time,question_answered_late,event_time_greater_than_average,question_took_more_than_average_user_time,has_more_than_average_bundle_time,bundle_lag_time,question_bundle_id
0,0,0,115,5692,0,1,3,1,,,...,56943.00000,37000.00000,19943.00000,35000.00000,True,False,False,False,37000.00000,5692
47,47,32683,124,7876,0,1,0,0,26000.00000,False,...,29317.00000,29000.00000,317.00000,28000.00000,True,False,True,False,29000.00000,7876
77,77,21592,2746,758,0,1,0,0,28000.00000,False,...,27477.00000,17000.00000,10477.00000,9000.00000,False,False,True,True,17000.00000,758
97,97,39828,5382,3944,0,1,1,0,24000.00000,False,...,92361.00000,35000.00000,57361.00000,34000.00000,True,False,False,False,35000.00000,3944
225,225,38769,8623,4750,0,1,1,1,16000.00000,False,...,34090.00000,33000.00000,1090.00000,20000.00000,True,False,False,False,22000.00000,4750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299239,299239,26145,5596216,7876,0,1,1,0,36000.00000,False,...,28197.00000,21000.00000,7197.00000,20000.00000,False,False,True,False,21000.00000,7876
299887,299887,27646,5603990,5466,0,1,1,0,22000.00000,False,...,38426.00000,19000.00000,19426.00000,18000.00000,True,False,False,False,16000.00000,5466
299914,299914,23857,5612057,7876,0,1,3,1,12000.00000,False,...,36786.00000,20000.00000,16786.00000,19000.00000,False,False,False,False,20000.00000,7876
299944,299944,24090,5613226,7876,0,1,0,0,20000.00000,False,...,23667.00000,22000.00000,1667.00000,21000.00000,False,False,False,False,22000.00000,7876


In [35]:
questions_df[questions_df.bundle_id == 1]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,tag_count_wise_id,number_of_tags,wrong,right,percentage_correct,bundle_wrong,bundle_right,task_percentage_correct
1,1,1,1,1,131 36 81,963,3,1.0,16.0,0.94118,1.0,16.0,0.94118


In [36]:
# Find the wrong and right question tags based on correctly answered
tags_list = [value.split() for value in questions_df.tags.values]
questions_df['tags_str'] = questions_df['tags']
questions_df['tags'] = tags_list
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,tag_count_wise_id,number_of_tags,wrong,right,percentage_correct,bundle_wrong,bundle_right,task_percentage_correct,tags_str
0,0,0,0,1,"[51, 131, 162, 38]",1055,4,1.0,23.0,0.95833,1.0,23.0,0.95833,51 131 162 38
1,1,1,1,1,"[131, 36, 81]",963,3,1.0,16.0,0.94118,1.0,16.0,0.94118,131 36 81
2,2,2,0,1,"[131, 101, 162, 92]",574,4,58.0,77.0,0.57037,58.0,77.0,0.57037,131 101 162 92
3,3,3,0,1,"[131, 149, 162, 29]",651,4,14.0,53.0,0.79104,14.0,53.0,0.79104,131 149 162 29
4,4,4,3,1,"[131, 5, 162, 38]",910,4,42.0,47.0,0.52809,42.0,47.0,0.52809,131 5 162 38


In [37]:
tags_df = pd.DataFrame()
for x in range(len(tags)):
    df = questions_df[questions_df.tags.apply(lambda l: tags[x] in l)]
    df1 = df.agg({'wrong': ['sum'], 'right': ['sum']})
    df1['total_questions'] = df1.wrong + df1.right
    df1['question_ids_with_tag'] = len(df)
    df1['tag'] = tags[x]
    df1 = df1.set_index('tag')
    tags_df = tags_df.append(df1)

tags_df[['wrong', 'right', 'total_questions']] = tags_df[['wrong', 'right', 'total_questions']].astype(int)
tags_df['percent_correct'] = tags_df.right / tags_df.total_questions
tags_df = tags_df.sort_values(by = "percent_correct")

tags_df.head()

Unnamed: 0_level_0,wrong,right,total_questions,question_ids_with_tag,percent_correct
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
24,450,317,767,17,0.4133
19,437,361,798,57,0.45238
23,654,620,1274,11,0.48666
151,712,679,1391,16,0.48814
11,163,157,320,15,0.49062


In [38]:
# Question dict
questions_dict = questions_df[['question_id', 'percentage_correct', 'tags']].set_index('question_id').to_dict(orient='index')

In [39]:
# TODO: please check the values that is used in each question they might be wrong
def is_question_above_average_answering(row):
    percentage_correct = question_dict[row.content_id].get('percentage_correct')
    if percentage_correct:
        return percentage_correct > 50
    return False

def is_all_tags_above_average_answering(row):
    tags_answer_percentages = []
    tags = questions_dict[row.content_id].get('tags')
    if not tags:
        return False
    tags = tags
    for tag in tags:
        tags_answer_percentages.append(tags_df.loc[tag].percent_correct)
    overall_tag_percentage = sum(tags_answer_percentages) / len(tags)
    return overall_tag_percentage > 0.5

train_df['question_has_above_average_correctness'] = train_df.apply(lambda row: is_question_above_average_answering(row), axis=1)
train_df['tag_has_above_average_correctness'] = train_df.apply(lambda row: is_all_tags_above_average_answering(row), axis=1)

##### 6. toughtest questions relates to the lag time and the elapsed time

In [40]:
check_answered_correctly = (train_df.answered_correctly == 1)

In [42]:
lag_time_answered_correctly_mean_dict = train_df[check_answered_correctly][['content_id', 'lag_time']].groupby(['content_id']).lag_time.mean().to_dict()
train_df['has_above_average_lag_time_for_the_question'] = train_df.apply(lambda row: row.lag_time > lag_time_answered_correctly_mean_dict.get(row.content_id, 0), axis=1)

In [43]:
shift_elapsed_time_answered_correctly_mean_dict = train_df[check_answered_correctly][['content_id', 'shift_elapsed_time']].groupby(['content_id']).shift_elapsed_time.mean().to_dict()
train_df['has_above_average_shift_elpased_time_for_the_question'] = train_df.apply(lambda row: row.shift_elapsed_time > shift_elapsed_time_answered_correctly_mean_dict.get(row.content_id, 0), axis=1)

In [44]:
# 1. Lag time based on the min/average question time - Done
# 2. Check if the average question time is greater than or not - Done
# 3. The timestamp is greater or not - Done
# 4. check if the top users and top interactions has any relation - Must be run on full dataset
# 5. Find the toughest question - Done
# 6. Find if the toughest question relates to the lag time - Done
# 7. Find if the toughest question relates to the elapsed time - Done


##### 7. The prior time mean for each user should be averaged to check if the user too more than usual

In [50]:
# TODO: Do the following by setting the time rather than setting the flag
# BUG: The comparison seems to be wrong 
elapsed_time_mean_dict = train_df[check_answered_correctly][['shift_elapsed_time', 'user_id']].groupby('user_id').shift_elapsed_time.mean().to_dict()
train_df['has_above_user_average_time_to_answer'] = train_df[['user_id', 'shift_elapsed_time']].apply(lambda row: elapsed_time_mean_dict.get(row.user_id, 0) > row.shift_elapsed_time, axis=1)

In [46]:
event_time_mean = train_df[check_answered_correctly][['event_time', 'user_id']].groupby('user_id').event_time.mean()
train_df['has_above_user_average_time_for_event'] = train_df[['user_id', 'event_time']].apply(lambda row: event_time_mean[row.user_id] > row.event_time, axis=1)

In [47]:
shift_event_time_mean = train_df[check_answered_correctly][['shift_event_time', 'user_id']].groupby('user_id').shift_event_time.mean()
train_df['has_above_user_average_time_for_event'] = train_df[['user_id', 'shift_event_time']].apply(lambda row: shift_event_time_mean[row.user_id] > row.shift_event_time, axis=1)

##### 8. cum correctness of the answers made by the user

In [None]:
cum = train_df.groupby('user_id')['answered_correctly'].agg(['cumsum', 'cumcount'])
train_df['user_cum_correctness'] = cum['cumsum'] / cum['cumcount']
train_df['user_correct_cumsum'] = cum['cumsum']
train_df['user_correct_cumcount'] = cum['cumcount']

train_df['user_correct_cumsum'].fillna(0, inplace=True)
train_df.user_cum_correctness = train_df.user_cum_correctness.astype('float16')
train_df.user_correct_cumcount = train_df.user_correct_cumcount.astype('int16')
train_df.user_correct_cumsum = train_df.user_correct_cumsum.astype('int16')

user_agg = train_df.groupby('user_id')['answered_correctly'].agg(['sum', 'count'])
user_agg = user_agg.astype('int16')
train_df['user_correctness'] = cum['sum'] / cum['count']
train_df['user_correct_sum'] = cum['sum']
train_df['user_correct_count'] = cum['count']

train_df['user_correct_sum'].fillna(0, inplace=True)
train_df.user_correctness = train_df.user_correctness.astype('float16')
train_df.user_correct_count = train_df.user_correct_count.astype('int16')
train_df.user_correct_sum = train_df.user_correct_sum.astype('int16')

##### 9. cum of prior question had seen explanation 

In [None]:
train_df['has_seen_question_explanation'] = train_df.groupby('user_id').prior_question_had_explanation.shift(-1)

In [None]:
cum = train_df.groupby('user_id')['has_seen_question_explanation'].agg(['cumsum', 'cumcount'])
train_df['explanation_mean'] = cum['cumsum'] / cum['cumcount']
train_df['explanation_cumsum'] = cum['cumsum'] 
train_df.drop(columns=['lag'], inplace=True)

train_df['explanation_mean'].fillna(0, inplace=True)
train_df['explanation_cumsum'].fillna(0, inplace=True)
train_df.explanation_mean = train_df.explanation_mean.astype('float16')
train_df.explanation_cumsum = train_df.explanation_cumsum.astype('int16')

##### 10. The attempt feature

In [None]:
train_df["attempt_no"] = 1
train_df.attempt_no = train_df.attempt_no.astype('int8')
train_df["attempt_no"] = train_df[["user_id", "content_id", "attempt_no"]].groupby(["user_id", "content_id"])["attempt_no"].cumsum()

In [None]:
attempt_no_agg = train_df.groupby(["user_id","content_id"])["attempt_no"].agg(['sum'])
attempt_no_agg = attempt_no_agg.astype('int8')

##### 11. The aggregate of the answered correctly with the user_id, content_id, task_container_id

In [None]:
content_agg = train_df.groupby('content_id')['answered_correctly'].agg(['sum', 'count','var'])
content_agg = content_agg.astype('float32')
train_df['content_count'] = train_df['content_id'].map(content_agg['count']).astype('int32')
train_df['content_sum'] = train_df['content_id'].map(content_agg['sum']).astype('int32')
train_df['content_correctness'] = train_df['content_id'].map(content_agg['sum'] / content_agg['count'])
train_df.content_correctness = train_df.content_correctness.astype('float16')

task_container_agg = train_df.groupby('task_container_id')['answered_correctly'].agg(['sum', 'count','var'])
task_container_agg = task_container_agg.astype('float32')
train_df['task_container_sum'] = train_df['task_container_id'].map(task_container_agg['sum']).astype('int32')
train_df['task_container_std'] = train_df['task_container_id'].map(task_container_agg['var']).astype('float16')
train_df['task_container_correctness'] = train_df['task_container_id'].map(task_container_agg['sum'] / task_container_agg['count'])
train_df.task_container_correctness=train_df.task_container_correctness.astype('float16')

##### 12. get harmonic mean of the content

In [None]:
train_df['hmean_user_content_accuracy'] = 2 * (
    (train_df['user_correctness'] * train_df['content_correctness']) /
    (train_df['user_correctness'] + train_df['content_correctness'])
)

### Training 

In [None]:
train_df = train_df.join(questions_df.set_index("question_id"), 'content_id', how='left')

In [None]:
X_val = train_df.groupby('user_id').tail(5)
X_train = train_df[~train_df.index.isin(X_val.index)]
y_train = X_train.answered_correctly
y_val = X_val.answered_correctly

In [None]:
features = ['content_id',
       'task_container_id',
       'days', 'hours', 'prior_minutes', 'intro_section', 'retention',
       'has_seen_lecture_before', 'has_seen_same_tag_as_lecture',
       'has_seen_same_part_as_lecture', 'has_part_common_with_type_of',
       'has_tag_common_with_type_of', 'has_tag_common_with_part_dict',
       'has_type_of_common_with_part_dict', 'shift_event_time',
       'shift_elapsed_time', 'event_lag_time', 'lag_time',
       'question_answered_late', 'event_time_greater_than_average',
       'question_took_more_than_average_user_time',
       'has_more_than_average_bundle_time', 'bundle_lag_time',
       'question_has_above_average_correctness',
       'tag_has_above_average_correctness', 'bundle_id',
       'part', 'has_seen_question_explanation', 'percentage_correct']

categorical_feature = ['intro_section', 'has_seen_lecture_before', 'has_seen_same_tag_as_lecture',
       'has_seen_same_part_as_lecture', 'has_part_common_with_type_of',
       'has_tag_common_with_type_of', 'has_tag_common_with_part_dict',
       'has_type_of_common_with_part_dict', 'question_answered_late', 'event_time_greater_than_average',
       'question_took_more_than_average_user_time',
       'has_more_than_average_bundle_time', 'question_has_above_average_correctness',
       'tag_has_above_average_correctness',
       'part', 'has_seen_question_explanation']
categorical_feature_idxs = []
for feature_name in categorical_feature:
    try:
        categorical_feature_idxs.append(features.index(feature_name))
    except:
        pass

In [None]:
X_train = X_train[features]
X_val = X_val[features]

In [None]:
X_train.has_seen_question_explanation = X_train.has_seen_question_explanation.fillna(False).astype(np.bool)
X_val.has_seen_question_explanation = X_val.has_seen_question_explanation.fillna(False).astype(np.bool)
X_train.info()

In [None]:
lgbm_params = {
    'objective': 'binary',
    'metric': ['auc'],
}

In [None]:
train_data = lgb.Dataset(
    data = X_train,
    label = y_train,
    categorical_feature = None,
)

val_data = lgb.Dataset(
    data = X_val,
    label = y_val,
    categorical_feature = None,
)

In [None]:
del X_train, y_train, X_val, y_val
import gc
gc.collect()

In [None]:
def train():
    evals_result = {}
    model = lgb.train(
        params = lgbm_params,
        train_set = train_data,
        valid_sets = [val_data],
        num_boost_round = 2500,
        verbose_eval = 10,
        evals_result = evals_result,
        early_stopping_rounds = 10,
        categorical_feature = categorical_feature_idxs,
        feature_name = features,
    )

    # save model
    model.save_model(f'model_v1_2500.lgb')
    
    return model, evals_result
    
model, evals_result = train()

In [None]:
def plot_history(evals_result):
    for metric in ['auc']:
        plt.figure(figsize=(20,8))
        
        for key in evals_result.keys():
            history_len = len(evals_result.get(key)[metric])
            history = evals_result.get(key)[metric]
            x_axis = np.arange(1, history_len + 1)
            plt.plot(x_axis, history, label=key)
        
        x_ticks = list(filter(lambda e: (e % (history_len // 100 * 10) == 0) or e == 1, x_axis))
        plt.xticks(x_ticks, fontsize=12)
        plt.yticks(fontsize=12)

        plt.title(f'{metric.upper()} History of training', fontsize=18);
        plt.xlabel('EPOCH', fontsize=16)
        plt.ylabel(metric.upper(), fontsize=16)
        
        if metric in ['auc']:
            plt.legend(loc='upper left', fontsize=14)
        else:
            plt.legend(loc='upper right', fontsize=14)
        plt.grid()
        plt.show()

plot_history(evals_result)

In [None]:
# plot the feature importance in terms of gain and split
def show_feature_importances(model, importance_type, max_num_features=10**10):
    feature_importances = pd.DataFrame()
    feature_importances['feature'] = features
    feature_importances['value'] = pd.DataFrame(model.feature_importance(importance_type))
    feature_importances = feature_importances.sort_values(by='value', ascending=False) # sort feature importance
    feature_importances.to_csv(f'feature_importances_{importance_type}.csv') # write feature importance to csv
    feature_importances = feature_importances[:max_num_features] # only show max_num_features
    
    plt.figure(figsize=(20, 8))
    plt.xlim([0, feature_importances.value.max()*1.1])
    plt.title(f'Feature {importance_type}', fontsize=18);
    sns.barplot(data=feature_importances, x='value', y='feature', palette='rocket');
    for idx, v in enumerate(feature_importances.value):
        plt.text(v, idx, "  {:.2e}".format(v))

show_feature_importances(model, 'gain')
show_feature_importances(model, 'split')

In [None]:
# show tree and save as png
def save_tree_diagraph(model):
    tree_digraph = lgb.create_tree_digraph(model, show_info=['split_gain', 'internal_count'])

    tree_png = svg2png(tree_digraph._repr_svg_(), output_width=3840)
    tree_png = Image.open(BytesIO(tree_png))

    tree_png.save('create_tree_digraph.png')

    display(tree_png)
    
save_tree_diagraph(model)

In [None]:
# remove train and validation data to free memory before prediction phase
del train_data
gc.collect()

In [None]:
# updates the user data
def update_user_data(state, features_questions_df, prev_test_df):
    for user_id, content_id, answered_correctly in prev_test_df[['user_id', 'content_id', 'answered_correctly']].values:
        # update user features
        state[user_id]['answered_correctly_user'] += answered_correctly
        state[user_id]['answered_user'] += 1
        state[user_id]['mean_user_accuracy'] = state[user_id]['answered_correctly_user'] / state[user_id]['answered_user']

In [None]:
def get_user_data(state, test_df):
    # updated data
    attempt, mean_user_accuracy, answered_correctly_user, answered_user = [], [], [], []
    
    for idx, (user_id, content_id) in test_df[['user_id', 'content_id']].iterrows():
        # check if user exists
        if user_id in state:
            # check if user already answered the question, if so update it to a maximum of 4
            if content_id in state[user_id]['user_content_attempts']:
                state[user_id]['user_content_attempts'][content_id] = min(4, state[user_id]['user_content_attempts'][content_id] + 1)
            # if user did not answered the question already, set the number of attempts to 0
            else:
                state[user_id]['user_content_attempts'][content_id] = 0
        
        # else create user with default values
        else:
            dict_keys = ['mean_user_accuracy', 'answered_correctly_user', 'answered_user', 'user_content_attempts']
            dict_default_vals = [0.680, 0, 0, dict(zip([content_id],[0]))]
            state[user_id] = dict(zip(dict_keys, dict_default_vals))
            
        # add user data to lists
        attempt.append(state[user_id]['user_content_attempts'][content_id])
        mean_user_accuracy.append(state[user_id]['mean_user_accuracy'])
        answered_correctly_user.append(state[user_id]['answered_correctly_user'])
        answered_user.append(state[user_id]['answered_user'])
    
    return attempt, mean_user_accuracy, answered_correctly_user, answered_user

### prediction

In [None]:
import riiideducation

env = riiideducation.make_env()
iter_test = env.iter_test()