#### Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

KAGGLE = 0

#### Global Variables

In [2]:
# Input file locations: the Kaggle competition directory when KAGGLE is truthy,
# otherwise files in the current working directory.
if KAGGLE:
    TRAIN_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/train.csv'
    QUESTIONS_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/questions.csv'
    LECTURES_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/lectures.csv'
    TEST_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/test.csv'
    SAMPLE_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv'
else:
    TRAIN_CSV_PATH = 'train.feather'
    QUESTIONS_CSV_PATH = 'questions.csv'
    LECTURES_CSV_PATH = 'lectures.csv'
    TEST_CSV_PATH = 'test.csv'
    SAMPLE_CSV_PATH = 'example_sample_submission.csv'

# Column name -> dtype mapping for the training table (usable as the `dtype`
# argument of pd.read_csv).
COLUMN_TYPES = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)


In [3]:
# Number of training rows this exploration works on.
N_TRAIN_ROWS = 1000

# The local training file is a feather file while the Kaggle path points at
# the raw CSV, so choose the reader from the file extension instead of always
# calling read_feather (which cannot parse the CSV).
if TRAIN_CSV_PATH.endswith('.feather'):
    train_df = pd.read_feather(TRAIN_CSV_PATH)
else:
    train_df = pd.read_csv(TRAIN_CSV_PATH, nrows=N_TRAIN_ROWS, dtype=COLUMN_TYPES)

# Take an independent copy of the sample: later cells add columns to train_df,
# which should not happen on a slice of the loaded frame.
train_df = train_df.iloc[:N_TRAIN_ROWS].copy()
questions_df = pd.read_csv(QUESTIONS_CSV_PATH)
lectures_df = pd.read_csv(LECTURES_CSV_PATH)

#### Check for user specific features

In [4]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.00000,False
2,2,118363,115,128,0,0,0,1,55000.00000,False
3,3,131167,115,7860,0,3,0,1,19000.00000,False
4,4,137965,115,7922,0,4,1,1,11000.00000,False
...,...,...,...,...,...,...,...,...,...,...
995,995,15092368574,13134,2334,0,297,3,0,22333.00000,True
996,996,15092368574,13134,2333,0,297,0,0,22333.00000,True
997,997,15092493419,13134,2182,0,298,3,0,31666.00000,True
998,998,15092493419,13134,2184,0,298,0,1,31666.00000,True


##### 1. Get timestamp as hours and minutes

In [5]:
print("user ids: ", train_df.user_id.unique(), ", user count: ", len(train_df.user_id.unique()))

user ids:  [  115   124  2746  5382  8623  8701 12741 13134] , user count:  8


In [6]:
# Count the interactions of every user and order users from fewest to most.
df = train_df['user_id'].value_counts().reset_index()
df.columns = ['user_id', 'count']
df = df.sort_values(by='count')

# First row = user with the fewest interactions, last row = user with the most.
print("number of users with less interactions: {} \n".format(df.head(1)))
print("number of users with more interactions: {} \n".format(df.tail(1)))

number of users with less interactions:    user_id  count
7     8701     17 

number of users with more interactions:    user_id  count
0    13134    376 



In [7]:
student_id = 115

In [8]:
# Rows of the selected student only. Copy the filtered rows so that columns
# added to feature_df later are written to an independent frame rather than to
# a slice of train_df (which triggers pandas' SettingWithCopyWarning).
feature_df = train_df[train_df['user_id'] == student_id].copy()

In [9]:
def convert_timestamp_in_parts(millis):
    """Express a millisecond value in days, hours and minutes.

    Every returned number is the *whole* duration converted to that unit
    (90 minutes -> 1.5 hours), not a days/hours/minutes breakdown.

    Returns:
        Tuple ``(days, hours, minutes)``.
    """
    as_minutes = millis / 1000 / 60
    as_hours = as_minutes / 60
    as_days = as_hours / 24
    return as_days, as_hours, as_minutes

In [10]:
def get_timestamp_in_parts(row):
    """Convert ``row.timestamp`` (milliseconds) to days, hours and minutes.

    Each value is the full timestamp expressed in that unit, not a
    days/hours/minutes breakdown.

    Returns:
        Tuple ``(days, hours, minutes)``.
    """
    as_minutes = row.timestamp / 1000 / 60
    as_hours = as_minutes / 60
    as_days = as_hours / 24
    return as_days, as_hours, as_minutes

def get_prior_elasped_time_in_parts(row):
    """Convert ``row.prior_question_elapsed_time`` (milliseconds) to minutes and seconds.

    Both values are the full duration expressed in that unit.

    Returns:
        Tuple ``(prior_minutes, prior_seconds)``.
    """
    as_seconds = row.prior_question_elapsed_time / 1000
    as_minutes = as_seconds / 60
    return as_minutes, as_seconds

# Add the derived time columns to the single-student frame; every row yields
# one tuple, and the list of tuples is spread over the target columns.
feature_df[['days', 'hours', 'minutes']] = list(feature_df.apply(get_timestamp_in_parts, axis=1))
feature_df[['prior_minutes', 'prior_seconds']] = list(feature_df.apply(get_prior_elasped_time_in_parts, axis=1))

In [11]:
# Same derived time columns as above, this time for the whole training sample.
train_df[['days', 'hours', 'minutes']] = list(train_df.apply(get_timestamp_in_parts, axis=1))
train_df[['prior_minutes', 'prior_seconds']] = list(train_df.apply(get_prior_elasped_time_in_parts, axis=1))

In [12]:
feature_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds
0,0,0,115,5692,0,1,3,1,,,0.0,0.0,0.0,,
1,1,56943,115,5716,0,2,2,1,37000.0,False,0.00066,0.01582,0.94905,0.61667,37.0
2,2,118363,115,128,0,0,0,1,55000.0,False,0.00137,0.03288,1.97272,0.91667,55.0
3,3,131167,115,7860,0,3,0,1,19000.0,False,0.00152,0.03644,2.18612,0.31667,19.0
4,4,137965,115,7922,0,4,1,1,11000.0,False,0.0016,0.03832,2.29942,0.18333,11.0
5,5,157063,115,156,0,5,2,1,5000.0,False,0.00182,0.04363,2.61772,0.08333,5.0
6,6,176092,115,51,0,6,0,1,17000.0,False,0.00204,0.04891,2.93487,0.28333,17.0
7,7,194190,115,50,0,7,3,1,17000.0,False,0.00225,0.05394,3.2365,0.28333,17.0
8,8,212463,115,7896,0,8,2,1,16000.0,False,0.00246,0.05902,3.54105,0.26667,16.0
9,9,230983,115,7863,0,9,0,1,16000.0,False,0.00267,0.06416,3.84972,0.26667,16.0


##### 2. Find the intro rows.

In [13]:
# Lecture rows (answered_correctly == -1) whose prior_question_had_explanation is False never carry an elapsed time (the explanation flag is always False for lectures).
train_df[(train_df.answered_correctly == -1) &  (train_df.prior_question_had_explanation==False)].count()

row_id                            18
timestamp                         18
user_id                           18
content_id                        18
content_type_id                   18
task_container_id                 18
user_answer                       18
answered_correctly                18
prior_question_elapsed_time        0
prior_question_had_explanation    18
days                              18
hours                             18
minutes                           18
prior_minutes                      0
prior_seconds                      0
dtype: int64

In [14]:
train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==1)].count()
# There are 80 rows related to the intro lectures (presumably counted on the full dataset; this 1000-row sample contains none).

row_id                            0
timestamp                         0
user_id                           0
content_id                        0
content_type_id                   0
task_container_id                 0
user_answer                       0
answered_correctly                0
prior_question_elapsed_time       0
prior_question_had_explanation    0
days                              0
hours                             0
minutes                           0
prior_minutes                     0
prior_seconds                     0
dtype: int64

In [15]:
len(train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==0)].user_id.unique())

8

In [16]:
len(train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==1)].user_id.unique())

0

In [17]:
# The unique user count is 393656
# The count when added will be 393576 + 80

In [18]:
# Some of the intro rows have an explanation.
len(train_df[~(train_df.timestamp == 0) & (train_df.prior_question_had_explanation.isnull())].user_id.unique())

0

In [19]:
# Most of the elapsed times are null for the intro sections.
len(train_df[~(train_df.timestamp == 0) & (train_df.prior_question_elapsed_time.isnull())].user_id.unique())


5

In [20]:
dropped_duplicates = train_df.drop_duplicates(subset=['user_id'])
dropped_duplicates

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds
0,0,0,115,5692,0,1,3,1,,,0.0,0.0,0.0,,
46,46,0,124,7900,0,0,0,1,,,0.0,0.0,0.0,,
76,76,0,2746,5273,0,0,1,0,,,0.0,0.0,0.0,,
96,96,0,5382,5000,0,0,0,1,,,0.0,0.0,0.0,,
224,224,0,8623,3915,0,0,3,1,,,0.0,0.0,0.0,,
336,336,0,8701,3901,0,0,2,1,,,0.0,0.0,0.0,,
353,353,0,12741,5145,0,0,3,0,,,0.0,0.0,0.0,,
624,624,0,13134,3926,0,0,3,1,,,0.0,0.0,0.0,,


In [21]:
len(train_df[(train_df.timestamp == 0)].user_id.unique())
# This proves that the timestamp 0 is the beginning for all the students

8

In [22]:
# Flag a row as part of the intro section when it is the user's very first
# interaction (timestamp 0).
train_df['intro_section'] = train_df['timestamp'].eq(0)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds,intro_section
0,0,0,115,5692,0,1,3,1,,,0.0,0.0,0.0,,,True
1,1,56943,115,5716,0,2,2,1,37000.0,False,0.00066,0.01582,0.94905,0.61667,37.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False,0.00137,0.03288,1.97272,0.91667,55.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False,0.00152,0.03644,2.18612,0.31667,19.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,0.0016,0.03832,2.29942,0.18333,11.0,False


In [23]:
lectures_df.type_of.unique()

array(['concept', 'solving question', 'intention', 'starter'],
      dtype=object)

In [24]:
lectures_df[lectures_df.type_of=='starter']

Unnamed: 0,lecture_id,tag,part,type_of
54,4385,181,5,starter
261,21169,151,5,starter
362,28569,27,6,starter


In [25]:
lectures_df[lectures_df.type_of=='intention']

Unnamed: 0,lecture_id,tag,part,type_of
38,3153,62,2,intention
65,5752,6,2,intention
76,6808,129,2,intention
103,8976,62,2,intention
109,9554,69,2,intention
174,14325,69,2,intention
349,27699,129,2,intention


In [26]:
starter_ids = train_df[(train_df.timestamp == 0)].task_container_id.unique()

In [27]:
questions_df[questions_df.bundle_id.isin(starter_ids)]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81


In [28]:
questions_df[questions_df.part == 1]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
...,...,...,...,...,...
12930,12930,12930,0,1,9 10 92
12931,12931,12931,2,1,131 104 92
12932,12932,12932,0,1,131 187 92
12933,12933,12933,3,1,9 10 92


In [29]:
train_df[(train_df.timestamp != 0) & (train_df.task_container_id.isin(starter_ids))]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds,intro_section
2,2,118363,115,128,0,0,0,1,55000.0,False,0.00137,0.03288,1.97272,0.91667,55.0,False
47,47,32683,124,7876,0,1,0,0,26000.0,False,0.00038,0.00908,0.54472,0.43333,26.0,False
77,77,21592,2746,758,0,1,0,0,28000.0,False,0.00025,0.006,0.35987,0.46667,28.0,False
97,97,39828,5382,3944,0,1,1,0,24000.0,False,0.00046,0.01106,0.6638,0.4,24.0,False
225,225,38769,8623,4750,0,1,1,1,16000.0,False,0.00045,0.01077,0.64615,0.26667,16.0,False
337,337,17833,8701,6671,0,1,0,1,13000.0,False,0.00021,0.00495,0.29722,0.21667,13.0,False
354,354,22273,12741,9691,0,1,3,1,13000.0,False,0.00026,0.00619,0.37122,0.21667,13.0,False
625,625,23840,13134,564,0,1,1,0,22000.0,False,0.00028,0.00662,0.39733,0.36667,22.0,False


In [30]:
# This shows that there are way more intro sections.

In [31]:
# Redefine the intro flag: a row belongs to the intro section when its task
# container is one of the containers seen at timestamp 0 (starter_ids).
train_df['intro_section'] = train_df['task_container_id'].isin(starter_ids)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds,intro_section
0,0,0,115,5692,0,1,3,1,,,0.0,0.0,0.0,,,True
1,1,56943,115,5716,0,2,2,1,37000.0,False,0.00066,0.01582,0.94905,0.61667,37.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False,0.00137,0.03288,1.97272,0.91667,55.0,True
3,3,131167,115,7860,0,3,0,1,19000.0,False,0.00152,0.03644,2.18612,0.31667,19.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,0.0016,0.03832,2.29942,0.18333,11.0,False


##### 3. The actions after the lectures should be tagged with something.

In [32]:
train_df[(train_df['user_id']==student_id) & (train_df['row_id'] >= 54177)][:100]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds,intro_section


In [33]:
# Things to do here about lectures making a correct answer more likely:
# we have to combine the questions and lectures to do something like this.
# 1. The rows which come after a lecture should be valued based on the tag or the type of the question.
# Example: if the user watched a lecture of type "concept" and the question is of some other type, the value given will not be correct, so we have to come up with something clever to check this.
# 2. Time is an important factor to look out for: the answer depends on the time at which the lecture was seen and the time at which the question is answered.

In [34]:
# First we will check how questions and lectures relate.

In [35]:
lectures_df[lectures_df.type_of=="starter"].tag.unique()

array([181, 151,  27])

In [36]:
questions_df[questions_df.tags.str.contains(r"^27", na=False)]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
6484,6484,6481,1,6,27
6488,6488,6485,3,6,27 162
6492,6492,6489,3,6,27
6496,6496,6493,0,6,27
6498,6498,6497,3,6,27
...,...,...,...,...,...
11182,11182,11179,3,6,27
11616,11616,11615,0,6,27
11621,11621,11619,2,6,27
11626,11626,11623,1,6,27


In [37]:
questions_df

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
...,...,...,...,...,...
13518,13518,13518,3,5,14
13519,13519,13519,3,5,8
13520,13520,13520,2,5,73
13521,13521,13521,0,5,125


In [38]:
lectures_df

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question
...,...,...,...,...
413,32535,8,5,solving question
414,32570,113,3,solving question
415,32604,24,6,concept
416,32625,142,2,concept


In [39]:
feature_df = train_df.join(questions_df.set_index("question_id"), 'content_id', how='left')

In [40]:
# lecture_id -> {column: value} for the remaining columns of that lecture row.
lecture_dict = lectures_df.set_index("lecture_id").to_dict(orient="index")
# type_of value -> {column: [values of every lecture with that type_of]}.
lecture_comprehensive_type_of_dict = lectures_df.set_index('type_of').groupby(level=0).apply(lambda row: row.to_dict('list')).to_dict()
# part value -> {column: [values of every lecture in that part]}.
lecture_comprehensive_part_dict = lectures_df.set_index('part').groupby(level=0).apply(lambda row: row.to_dict('list')).to_dict()

In [41]:
question_dict = questions_df.set_index("question_id").to_dict(orient="index")


In [42]:
# Collapse the per-part value lists to their distinct values.
for part_info in lecture_comprehensive_part_dict.values():
    for field in ('tag', 'type_of'):
        part_info[field] = list(set(part_info[field]))

# Same for the per-type_of value lists.
for type_of_info in lecture_comprehensive_type_of_dict.values():
    for field in ('tag', 'part'):
        type_of_info[field] = list(set(type_of_info[field]))

In [43]:
for key in lecture_comprehensive_part_dict:
    print(lecture_comprehensive_part_dict[key]['type_of'])

['concept', 'solving question']
['concept', 'intention', 'solving question']
['concept', 'solving question']
['concept', 'solving question']
['concept', 'starter', 'solving question']
['concept', 'starter', 'solving question']
['concept', 'solving question']


In [44]:
questions_dict = questions_df.set_index('question_id').to_dict(orient='index')

In [45]:
question_dict[13249]

{'bundle_id': 13247, 'correct_answer': 1, 'part': 3, 'tags': '136 81 92'}

In [46]:
# Is the part of question 13249 one of the parts that have 'starter' lectures?
# Compare the question's `part` value: the question's whole attribute dict can
# never be an element of the list of part numbers.
question_dict[13249]['part'] in lecture_comprehensive_type_of_dict['starter']['part']

False

In [47]:
def apply_filter_based_on_previous_row(func):
    """Wrap a retention formula so it can be applied row by row to the interaction log.

    The returned callable is stateful: it remembers the most recent lecture of
    the user whose rows are currently being processed, so rows must be fed in
    order, user after user. It reads the module-level lookups ``lecture_dict``,
    ``question_dict``, ``lecture_comprehensive_type_of_dict`` and
    ``lecture_comprehensive_part_dict``.

    Args:
        func: callable receiving the difference between the current row's
            ``hours`` and the ``hours`` of the remembered lecture (0 when no
            lecture is remembered) and returning a retention value.

    Returns:
        ``wrapper(curr_row, **kwargs)`` returning the 8-tuple
        ``(retention, has_seen_lecture, has_seen_same_tag_as_lecture,
        has_seen_same_part_as_lecture, has_part_common_with_type_of,
        has_tag_common_with_type_of, has_tag_common_with_part_dict,
        has_type_of_common_with_part_dict)``. The six comparison flags are only
        computed for a question row that follows an earlier row of the same
        user; in every other case they are False.
    """
    prev_row = {
        "has_seen_lecture": False,
        "previous_lecture_hours": 0,
        "previous_lecture": {},
        "user_id": ""
    }

    def remember_lecture(curr_row):
        # Record the lecture on this row as the user's most recent lecture.
        prev_row['has_seen_lecture'] = True
        prev_row['previous_lecture_hours'] = curr_row['hours']
        prev_row['previous_lecture'] = lecture_dict[curr_row['content_id']]

    def wrapper(curr_row, **kwargs):
        content_id = curr_row['content_id']
        has_seen_same_tag_as_lecture = False
        has_seen_same_part_as_lecture = False
        has_part_common_with_type_of = False
        has_tag_common_with_type_of = False
        has_tag_common_with_part_dict = False
        has_type_of_common_with_part_dict = False

        same_user = prev_row['user_id'] == curr_row['user_id']
        is_lecture = curr_row['content_type_id'] != 0
        if not same_user:
            prev_row['user_id'] = curr_row['user_id']

        if is_lecture:
            remember_lecture(curr_row)
        elif same_user:
            question = question_dict[content_id]
            previous_lecture = prev_row['previous_lecture']
            prev_type_of = previous_lecture.get("type_of")
            prev_part = previous_lecture.get("part")
            type_of_info = lecture_comprehensive_type_of_dict.get(prev_type_of, {})

            # A missing tags value stringifies to 'nan'; keep only numeric
            # tokens so int() is never called on it.
            tag_tokens = str(question['tags']).split()
            question_tags = {int(token) for token in tag_tokens if token.isdecimal()}

            has_seen_same_tag_as_lecture = str(previous_lecture.get("tag", "")) in tag_tokens
            has_seen_same_part_as_lecture = prev_part == question['part']
            has_part_common_with_type_of = question['part'] in type_of_info.get('part', [])
            has_tag_common_with_type_of = bool(question_tags.intersection(type_of_info.get('tag', [])))
            has_tag_common_with_part_dict = bool(
                question_tags.intersection(lecture_comprehensive_part_dict.get(prev_part, {}).get('tag', []))
            )
            # .get() keeps a question part without any lecture from raising KeyError.
            has_type_of_common_with_part_dict = (
                prev_type_of in lecture_comprehensive_part_dict.get(question['part'], {}).get('type_of', [])
            )
        else:
            # First row of a new user is a question: forget the previous user's lecture.
            prev_row['has_seen_lecture'] = False
            prev_row['previous_lecture_hours'] = 0
            prev_row['previous_lecture'] = {}

        timestamp_difference = curr_row['hours'] - prev_row['previous_lecture_hours']
        retention = func(timestamp_difference)
        return retention, prev_row['has_seen_lecture'], has_seen_same_tag_as_lecture, has_seen_same_part_as_lecture, has_part_common_with_type_of, has_tag_common_with_type_of, has_tag_common_with_part_dict, has_type_of_common_with_part_dict
    return wrapper

@apply_filter_based_on_previous_row
def running_retention(timestamp_difference):
    """Retention score that decays from 1 as the hours since the last lecture grow.

    NOTE(review): the constants 1.48 and 1.25 are undocumented; they resemble
    a forgetting-curve fit -- confirm their origin.
    """
    scale = 1.48
    return scale / ((1.25 * timestamp_difference) + scale)


# Run the stateful row function over the whole log and spread the per-row
# tuples over one column per tuple position.
(
    train_df["retention"],
    train_df["has_seen_lecture_before"],
    train_df["has_seen_same_tag_as_lecture"],
    train_df["has_seen_same_part_as_lecture"],
    train_df["has_part_common_with_type_of"],
    train_df["has_tag_common_with_type_of"],
    train_df["has_tag_common_with_part_dict"],
    train_df["has_type_of_common_with_part_dict"],
) = zip(*train_df.apply(running_retention, axis=1))

In [48]:
train_df[770:800]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,prior_seconds,intro_section,retention,has_seen_lecture_before,has_seen_same_tag_as_lecture,has_seen_same_part_as_lecture,has_part_common_with_type_of,has_tag_common_with_type_of,has_tag_common_with_part_dict,has_type_of_common_with_part_dict
770,770,15014038675,13134,4683,0,139,1,1,19000.0,True,...,19.0,False,0.00206,True,False,False,True,True,False,True
771,771,15014061350,13134,5719,0,140,2,1,5000.0,True,...,5.0,False,0.00206,True,False,False,True,True,False,True
772,772,15015426698,13134,5905,0,141,0,1,9000.0,True,...,9.0,False,0.00206,True,False,False,True,True,False,True
773,773,15016111296,13134,10002,0,142,1,1,16000.0,True,...,16.0,False,0.00206,True,False,False,True,True,False,True
774,774,15016762442,13134,8165,0,143,1,1,22000.0,True,...,22.0,False,0.00206,True,False,False,True,True,False,True
775,775,15016774763,13134,4521,0,144,0,1,15000.0,True,...,15.0,False,0.00206,True,False,False,True,True,False,True
776,776,15016796241,13134,8786,0,145,3,1,4000.0,True,...,4.0,False,0.00206,True,False,False,True,True,False,True
777,777,15016826052,13134,9047,0,146,3,1,13000.0,True,...,13.0,False,0.00206,True,False,False,True,True,False,True
778,778,15016850331,13134,6004,0,147,0,1,18000.0,True,...,18.0,False,0.00206,True,False,False,True,True,False,True
779,779,15016872315,13134,5255,0,148,0,1,12000.0,True,...,12.0,False,0.00206,True,False,False,True,True,False,True


##### 4. Let's find the average time taken for each question

In [49]:
# The average time per question can be computed either over every candidate or over the current candidate's previous answering times.

In [50]:
user_average_time_to_elapsed_dict = train_df[train_df['content_type_id']==0].groupby("user_id").prior_question_elapsed_time.mean().to_dict()

In [51]:
train_df['question_took_more_than_average_user_time'] = train_df[train_df['content_type_id'] == 0].apply(lambda row: row.prior_question_elapsed_time > user_average_time_to_elapsed_dict[row.user_id], axis=1)

In [52]:
# check average question answering time
average_question_prior_question_elapsed_time_dict = train_df[train_df['content_type_id']==0].groupby(["content_id"]).prior_question_elapsed_time.mean().to_dict()

In [53]:
# Order the log by user and, within a user, chronologically.
train_df = train_df.sort_values(by=['user_id', 'timestamp'])

# content_id -> list of times collected for that question.
bundle_time_relation_dict = dict()

# Mutable state carried between successive get_question_time_taken calls.
prev_row = dict(
    user_id="",
    bundle_id="",
    previous_bundle_elapsed_time="",
)
def get_question_time_taken(curr_row):
    """Return the elapsed time attributed to the question on ``curr_row``.

    Uses and updates the module-level ``prev_row`` state and reads the bundle
    of the question from ``question_dict``:

    * first row seen for a user -> ``None`` (state is initialised from the row);
    * same user, same bundle as the previous call -> the elapsed time stored
      for that bundle;
    * same user, different bundle -> this row's ``prior_question_elapsed_time``,
      which also becomes the stored value for the new bundle.
    """
    bundle = question_dict[curr_row.content_id]['bundle_id']
    elapsed = curr_row['prior_question_elapsed_time']

    if prev_row['user_id'] != curr_row['user_id']:
        prev_row['bundle_id'] = bundle
        prev_row['previous_bundle_elapsed_time'] = elapsed
        result = None
    elif prev_row['bundle_id'] == bundle:
        result = prev_row['previous_bundle_elapsed_time']
    else:
        result = elapsed
        prev_row['previous_bundle_elapsed_time'] = elapsed
        prev_row['bundle_id'] = bundle

    prev_row['user_id'] = curr_row.user_id
    return result


# Walk the question rows from the last one to the first and collect, per
# content_id, the time reported by get_question_time_taken.
for index, row in train_df[train_df['content_type_id'] == 0].iloc[::-1].iterrows():
    bundle_time_relation_dict.setdefault(row.content_id, []).append(get_question_time_taken(row))

##### 5. Now let's find the lag time

In [54]:
# lag time
# event_time: timestamp difference to the same user's previous question row
# (lecture rows are excluded and therefore stay NaN).
train_df['event_time'] = (
    train_df.loc[train_df['content_type_id'] == 0, ['user_id', 'timestamp']]
    .groupby('user_id')['timestamp']
    .diff()
)
# Pull the values of the following row up onto the current row.
train_df['shift_event_time'] = train_df['event_time'].shift(-1)
train_df['shift_elapsed_time'] = train_df['prior_question_elapsed_time'].shift(-1)
# event_lag_time (question rows only): shifted event time minus shifted elapsed time.
train_df['event_lag_time'] = (
    train_df.loc[train_df['content_type_id'] == 0, 'shift_event_time']
    - train_df.loc[train_df['content_type_id'] == 0, 'shift_elapsed_time']
)

In [55]:
# Per-question minimum and mean of shift_elapsed_time over the question rows.
train_df_elapsed_time_groupby = (
    train_df.loc[train_df['content_type_id'] == 0, ['content_id', 'shift_elapsed_time']]
    .groupby('content_id')['shift_elapsed_time']
)
individual_question_min_time_dict = train_df_elapsed_time_groupby.min().to_dict()
individual_question_mean_time_dict = train_df_elapsed_time_groupby.mean().to_dict()

# The grouped object is only needed to build the two lookups.
del train_df_elapsed_time_groupby

In [56]:
# def get_lag_time(content_id, shift_event_time):
#     question_min_time = individual_question_min_time_dict[content_id]
#     return shift_event_time - question_min_time

# train_df['lag_time'] = train_df[train_df['content_type_id'] == 0].apply(lambda row: get_lag_time(row.content_id, row.shift_event_time), axis=1)
def get_prior_elapsed_time_difference(content_id, shift_elapsed_time):
    """Return how far ``shift_elapsed_time`` lies above the minimum recorded for the question.

    The minimum is looked up in ``individual_question_min_time_dict``; an
    unknown ``content_id`` raises ``KeyError``.
    """
    return shift_elapsed_time - individual_question_min_time_dict[content_id]

# lag_time (question rows only): shifted elapsed time minus the question's minimum.
train_df['lag_time'] = train_df.loc[train_df['content_type_id'] == 0].apply(
    lambda question_row: get_prior_elapsed_time_difference(question_row.content_id, question_row.shift_elapsed_time),
    axis=1,
)

In [57]:
def has_elapsed_time_greater_than_average_time(content_id, shift_elapsed_time):
    """Tell whether ``shift_elapsed_time`` exceeds the mean recorded for the question.

    The mean is looked up in ``individual_question_mean_time_dict``; an unknown
    ``content_id`` raises ``KeyError``.
    """
    question_mean_time = individual_question_mean_time_dict[content_id]
    return shift_elapsed_time > question_mean_time

# answered_late (question rows only): shifted elapsed time above the question's mean.
train_df['answered_late'] = train_df.loc[train_df['content_type_id'] == 0].apply(
    lambda question_row: has_elapsed_time_greater_than_average_time(question_row.content_id, question_row.shift_elapsed_time),
    axis=1,
)

In [58]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,has_tag_common_with_type_of,has_tag_common_with_part_dict,has_type_of_common_with_part_dict,question_took_more_than_average_user_time,event_time,shift_event_time,shift_elapsed_time,event_lag_time,lag_time,answered_late
0,0,0,115,5692,0,1,3,1,,,...,False,False,False,False,,56943.00000,37000.00000,19943.00000,0.00000,False
1,1,56943,115,5716,0,2,2,1,37000.00000,False,...,False,False,False,True,56943.00000,61420.00000,55000.00000,6420.00000,0.00000,False
2,2,118363,115,128,0,0,0,1,55000.00000,False,...,False,False,False,True,61420.00000,12804.00000,19000.00000,-6196.00000,0.00000,False
3,3,131167,115,7860,0,3,0,1,19000.00000,False,...,False,False,False,False,12804.00000,6798.00000,11000.00000,-4202.00000,0.00000,False
4,4,137965,115,7922,0,4,1,1,11000.00000,False,...,False,False,False,False,6798.00000,19098.00000,5000.00000,14098.00000,0.00000,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,15092368574,13134,2334,0,297,3,0,22333.00000,True,...,True,False,True,True,0.00000,0.00000,22333.00000,-22333.00000,0.00000,False
996,996,15092368574,13134,2333,0,297,0,0,22333.00000,True,...,True,False,True,True,0.00000,124845.00000,31666.00000,93179.00000,0.00000,False
997,997,15092493419,13134,2182,0,298,3,0,31666.00000,True,...,True,False,True,True,124845.00000,0.00000,31666.00000,-31666.00000,0.00000,False
998,998,15092493419,13134,2184,0,298,0,1,31666.00000,True,...,True,False,True,True,0.00000,0.00000,31666.00000,-31666.00000,0.00000,False


In [59]:
average_question_timestamp_difference_dict = train_df[(train_df['content_type_id'] == 0) & (train_df['shift_event_time'] < 3600000)][['content_id', 'shift_event_time']].groupby('content_id').shift_event_time.mean().to_dict()

In [60]:
def has_event_time_greater_than_average(content_id, event_time):
    """Tell whether ``event_time`` exceeds the average stored for the question.

    Averages come from ``average_question_timestamp_difference_dict``; a
    question without an entry is compared against 0.
    """
    return event_time > average_question_timestamp_difference_dict.get(content_id, 0)

# Flag question rows whose shifted event time is above the question's average.
train_df['event_time_greater_than_average'] = train_df.loc[train_df['content_type_id'] == 0].apply(
    lambda question_row: has_event_time_greater_than_average(question_row.content_id, question_row.shift_event_time),
    axis=1,
)

In [61]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,has_tag_common_with_part_dict,has_type_of_common_with_part_dict,question_took_more_than_average_user_time,event_time,shift_event_time,shift_elapsed_time,event_lag_time,lag_time,answered_late,event_time_greater_than_average
0,0,0,115,5692,0,1,3,1,,,...,False,False,False,,56943.00000,37000.00000,19943.00000,0.00000,False,False
1,1,56943,115,5716,0,2,2,1,37000.00000,False,...,False,False,True,56943.00000,61420.00000,55000.00000,6420.00000,0.00000,False,False
2,2,118363,115,128,0,0,0,1,55000.00000,False,...,False,False,True,61420.00000,12804.00000,19000.00000,-6196.00000,0.00000,False,False
3,3,131167,115,7860,0,3,0,1,19000.00000,False,...,False,False,False,12804.00000,6798.00000,11000.00000,-4202.00000,0.00000,False,False
4,4,137965,115,7922,0,4,1,1,11000.00000,False,...,False,False,False,6798.00000,19098.00000,5000.00000,14098.00000,0.00000,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,15092368574,13134,2334,0,297,3,0,22333.00000,True,...,False,True,True,0.00000,0.00000,22333.00000,-22333.00000,0.00000,False,False
996,996,15092368574,13134,2333,0,297,0,0,22333.00000,True,...,False,True,True,0.00000,124845.00000,31666.00000,93179.00000,0.00000,False,False
997,997,15092493419,13134,2182,0,298,3,0,31666.00000,True,...,False,True,True,124845.00000,0.00000,31666.00000,-31666.00000,0.00000,False,False
998,998,15092493419,13134,2184,0,298,0,1,31666.00000,True,...,False,True,True,0.00000,0.00000,31666.00000,-31666.00000,0.00000,False,False


##### 6. Find the toughest questions

In [62]:
# get all the complete tags of the question
# Work with the tags as strings; a missing value becomes the string "nan".
questions_df['tags'] = questions_df['tags'].astype(str)

# Distinct tag tokens over every question that has tags.
tags = list({
    tag
    for value in questions_df[questions_df.tags != "nan"].tags.values
    for tag in value.split()
})
print(f'There are {len(tags)} different tags')

There are 188 different tags


In [63]:
# Find the wrong and right question based on correctly answered
# Count wrong (0) and right (1) answers per question; rows with
# answered_correctly == -1 are excluded.
correct = (
    train_df[train_df.answered_correctly != -1]
    .groupby(["content_id", "answered_correctly"])
    .size()
    .unstack(fill_value=0)
    # Guarantee both outcome columns, in a fixed order, even when the sample
    # contains only right or only wrong answers.
    .reindex(columns=[0, 1], fill_value=0)
    .rename(columns={0: 'wrong', 1: 'right'})
    .astype(int)
)
correct.columns.name = None

# Drop the result columns of an earlier run first, so re-executing this cell
# does not produce wrong_x / wrong_y style duplicates.
questions_df = (
    questions_df
    .drop(columns=['wrong', 'right', 'percentage_correct'], errors='ignore')
    .merge(correct, left_on="question_id", right_on="content_id", how="left")
)
# Questions never answered in the sample keep NaN counts and a NaN percentage.
questions_df['percentage_correct'] = questions_df.right / (questions_df.right + questions_df.wrong)
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,wrong,right,percentage_correct
0,0,0,0,1,51 131 162 38,,,
1,1,1,1,1,131 36 81,,,
2,2,2,0,1,131 101 162 92,1.0,0.0,0.0
3,3,3,0,1,131 149 162 29,,,
4,4,4,3,1,131 5 162 38,1.0,0.0,0.0


In [64]:
# Find the wrong and right question tags based on correctly answered
# Keep the original space-separated string in tags_str and turn tags into a
# list of tag tokens per question.
questions_df['tags_str'] = questions_df['tags']
tags_list = questions_df['tags_str'].map(str.split).tolist()
questions_df['tags'] = tags_list
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,wrong,right,percentage_correct,tags_str
0,0,0,0,1,"[51, 131, 162, 38]",,,,51 131 162 38
1,1,1,1,1,"[131, 36, 81]",,,,131 36 81
2,2,2,0,1,"[131, 101, 162, 92]",1.0,0.0,0.0,131 101 162 92
3,3,3,0,1,"[131, 149, 162, 29]",,,,131 149 162 29
4,4,4,3,1,"[131, 5, 162, 38]",1.0,0.0,0.0,131 5 162 38


In [65]:
# Aggregate the per-question wrong/right counts for every tag. The rows are
# collected in a list and the frame is built once: growing a DataFrame with
# DataFrame.append inside the loop is quadratic, and append was removed in
# pandas 2.0.
tag_records = []
for tag in tags:
    questions_with_tag = questions_df[questions_df.tags.apply(lambda tag_list: tag in tag_list)]
    wrong_total = questions_with_tag.wrong.sum()
    right_total = questions_with_tag.right.sum()
    tag_records.append({
        'tag': tag,
        'wrong': wrong_total,
        'right': right_total,
        'total_questions': wrong_total + right_total,
        'question_ids_with_tag': len(questions_with_tag),
    })

tags_df = pd.DataFrame(
    tag_records,
    columns=['tag', 'wrong', 'right', 'total_questions', 'question_ids_with_tag'],
).set_index('tag')

tags_df[['wrong', 'right', 'total_questions']] = tags_df[['wrong', 'right', 'total_questions']].astype(int)
# Tags without any answered question divide 0 by 0 and get NaN.
tags_df['percent_correct'] = tags_df.right / tags_df.total_questions
tags_df = tags_df.sort_values(by="percent_correct")

tags_df.head()

Unnamed: 0_level_0,wrong,right,total_questions,question_ids_with_tag,percent_correct
tag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
124,4,0,4,12,0.0
123,1,0,1,29,0.0
134,2,0,2,65,0.0
31,1,0,1,10,0.0
170,3,0,3,25,0.0


In [66]:
def is_question_above_average_answering(row):
    """Tell whether the question on ``row`` was answered correctly more than half the time.

    Looks up ``percentage_correct`` for ``row.content_id`` in ``questions_df``.

    Returns:
        True when the stored percentage is above 0.5; False when the row is a
        lecture (``content_type_id`` other than 0), the question is unknown, or
        its percentage is missing (NaN).
    """
    # Lecture ids share the number range of question ids, so a lecture row
    # must not be looked up in the question table.
    if getattr(row, 'content_type_id', 0) != 0:
        return False
    percentage_correct = questions_df.loc[
        questions_df["question_id"] == row.content_id, "percentage_correct"
    ].values
    # Explicit length check: truth-testing a NumPy array is deprecated for
    # empty arrays and ambiguous for longer ones.
    if len(percentage_correct) == 0:
        return False
    return bool(percentage_correct[0] > 0.5)

def is_all_tags_above_average_answering(row, questions=None, tag_stats=None):
    """Tell whether the mean ``percent_correct`` of the question's tags is above 0.5.

    Parameters
    ----------
    row : object with a ``content_id`` attribute
        Typically one row of ``train_df`` as passed by ``DataFrame.apply(axis=1)``.
    questions : pandas.DataFrame, optional
        Frame with ``question_id`` and a ``tags`` column holding a list of tags
        per question. Defaults to the notebook-level ``questions_df``.
    tag_stats : pandas.DataFrame, optional
        Frame indexed by tag with a ``percent_correct`` column.
        Defaults to the notebook-level ``tags_df``.

    Returns
    -------
    bool
        ``True`` when the average of the tags' ``percent_correct`` exceeds 0.5.
        ``False`` when no question matches, the question has no tags, or any
        tag's ``percent_correct`` is NaN (the average is then NaN).

    Raises
    ------
    KeyError
        If one of the question's tags is not present in ``tag_stats``.
    """
    if questions is None:
        questions = questions_df
    if tag_stats is None:
        tag_stats = tags_df
    matches = questions.loc[questions["question_id"] == row.content_id, "tags"]
    # Explicit emptiness checks instead of truth-testing a NumPy array, which
    # is ambiguous (deprecated for empty arrays).
    if matches.empty:
        return False
    question_tags = matches.iloc[0]
    if len(question_tags) == 0:
        return False
    tag_percentages = [tag_stats.loc[tag, "percent_correct"] for tag in question_tags]
    overall_tag_percentage = sum(tag_percentages) / len(tag_percentages)
    return bool(overall_tag_percentage > 0.5)

# Add the two boolean feature columns by evaluating each helper once per row.
# NOTE(review): every row of train_df is looked up by content_id, including
# rows whose content_type_id is not 0 — confirm that is intended.
train_df['question_has_above_average_correctness'] = train_df.apply(
    is_question_above_average_answering, axis=1
)
train_df['tag_has_above_average_correctness'] = train_df.apply(
    is_all_tags_above_average_answering, axis=1
)

In [67]:
# Display train_df to inspect the feature columns computed so far.
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,question_took_more_than_average_user_time,event_time,shift_event_time,shift_elapsed_time,event_lag_time,lag_time,answered_late,event_time_greater_than_average,question_has_above_average_correctness,tag_has_above_average_correctness
0,0,0,115,5692,0,1,3,1,,,...,False,,56943.00000,37000.00000,19943.00000,0.00000,False,False,True,True
1,1,56943,115,5716,0,2,2,1,37000.00000,False,...,True,56943.00000,61420.00000,55000.00000,6420.00000,0.00000,False,False,True,True
2,2,118363,115,128,0,0,0,1,55000.00000,False,...,True,61420.00000,12804.00000,19000.00000,-6196.00000,0.00000,False,False,True,True
3,3,131167,115,7860,0,3,0,1,19000.00000,False,...,False,12804.00000,6798.00000,11000.00000,-4202.00000,0.00000,False,False,True,True
4,4,137965,115,7922,0,4,1,1,11000.00000,False,...,False,6798.00000,19098.00000,5000.00000,14098.00000,0.00000,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,15092368574,13134,2334,0,297,3,0,22333.00000,True,...,True,0.00000,0.00000,22333.00000,-22333.00000,0.00000,False,False,False,True
996,996,15092368574,13134,2333,0,297,0,0,22333.00000,True,...,True,0.00000,124845.00000,31666.00000,93179.00000,0.00000,False,False,False,True
997,997,15092493419,13134,2182,0,298,3,0,31666.00000,True,...,True,124845.00000,0.00000,31666.00000,-31666.00000,0.00000,False,False,False,True
998,998,15092493419,13134,2184,0,298,0,1,31666.00000,True,...,True,0.00000,0.00000,31666.00000,-31666.00000,0.00000,False,False,True,True


##### 6. How the toughest questions relate to the lag time and the elapsed time

In [68]:
# Per question (content_type_id == 0 rows only): total lag time and the number
# of rows flagged as having above-average correctness.
(
    train_df
    .loc[
        lambda frame: frame['content_type_id'] == 0,
        ['content_id', 'lag_time', 'question_has_above_average_correctness'],
    ]
    .groupby(['content_id'])
    .sum()
)
# A graph still needs to be drawn here to get more detail.

Unnamed: 0_level_0,lag_time,question_has_above_average_correctness
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,0.00000,0
4,0.00000,0
6,0.00000,1
9,0.00000,0
18,1000.00000,2
...,...,...
12550,0.00000,0
12551,0.00000,1
12576,0.00000,1
12577,0.00000,1


In [69]:
# Per question (content_type_id == 0 rows only): total shifted elapsed time and
# the number of rows flagged as having above-average correctness.
(
    train_df
    .loc[
        lambda frame: frame['content_type_id'] == 0,
        ['content_id', 'shift_elapsed_time', 'question_has_above_average_correctness'],
    ]
    .groupby(['content_id'])
    .sum()
)

Unnamed: 0_level_0,shift_elapsed_time,question_has_above_average_correctness
content_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,26000.00000,0
4,20000.00000,0
6,22000.00000,1
9,0.00000,0
18,51000.00000,2
...,...,...
12550,32000.00000,0
12551,32000.00000,1
12576,34666.00000,1
12577,25000.00000,1


In [70]:
# 1. Lag time based on the min/average question time - Done
# 2. Check if the average question time is greater than or not - Done
# 3. The timestamp is greater or not - Done
# 4. Check whether the top users and the top interactions are related - Must be run on the full dataset
# 5. Find the toughest question - Done
# 6. Find if the toughest question relates to the lag time
# 7. Find if the toughest question relates to the elapsed time


##### 7. Check the questions asked by the AI system and the questions that are asked repeatedly

In [85]:
# Integer version of the `days` column (fractional part truncated toward zero).
# A vectorized cast gives the same values as calling int() on every row via
# apply(axis=1), without building a Series object per row.
train_df['days_int'] = train_df['days'].astype('int64')

In [109]:
# Last day on which each (user_id, content_id) pair was seen: {pair: {"day": int}}.
previous_data = {}


def check_if_same_day_same_question_asked(row):
    """Report whether this user already saw this content on the same day.

    Reads ``row.user_id``, ``row.content_id`` and ``row.days_int`` and keeps
    the most recent day per (user, content) pair in ``previous_data``.
    The result depends on the order in which rows are passed in.

    Returns ``True`` only when the pair was seen before and the stored day
    equals ``row.days_int``; otherwise records the day and returns ``False``.
    """
    pair = (row.user_id, row.content_id)
    last_seen = previous_data.get(pair)

    # First encounter of this pair: remember the day.
    if last_seen is None:
        previous_data[pair] = {"day": row.days_int}
        return False

    if last_seen.get("day") == row.days_int:
        return True

    # Seen before, but on another day: move the marker to the current day.
    last_seen["day"] = row.days_int
    return False

# Flag question rows (content_type_id == 0) where the same user got the same
# content again on the same day. Other rows are left as NaN through index
# alignment.
# The column is assigned on train_df itself: assigning through
# `train_df[mask]['col'] = ...` only writes to a temporary copy, so the
# column would never show up in train_df.
train_df['same_day_question_asked'] = (
    train_df.loc[train_df['content_type_id'] == 0, ['content_id', 'user_id', 'days_int']]
    .apply(check_if_same_day_same_question_asked, axis=1)
)

In [120]:
# Every day on which each (user_id, content_id) pair was seen: {pair: {"days": [int, ...]}}.
previous_days_data = {}


def check_if_repeated_question_asked(row):
    """Report whether this user has been given this content before.

    Reads ``row.user_id``, ``row.content_id`` and ``row.days_int`` and appends
    the day to the pair's history in ``previous_days_data``. The result
    depends on the order in which rows are passed in.

    Returns ``False`` for the first occurrence of a (user, content) pair and
    ``True`` for every later one.
    """
    pair = (row.user_id, row.content_id)
    history = previous_days_data.get(pair)
    seen_before = history is not None

    if seen_before:
        history["days"].append(row.days_int)
    else:
        previous_days_data[pair] = {"days": [row.days_int]}
    return seen_before

# Flag question rows (content_type_id == 0) whose (user, content) pair has
# occurred earlier in the frame. Rows that are not selected get NaN through
# index alignment.
train_df['repeated_question'] = (
    train_df.loc[train_df['content_type_id'] == 0, ['content_id', 'user_id', 'days_int']]
    .apply(check_if_repeated_question_asked, axis=1)
)