#### Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)

KAGGLE = 0

#### Global Variables

In [2]:
TRAIN_CSV_PATH = 'train.feather'
QUESTIONS_CSV_PATH = 'questions.csv'
LECTURES_CSV_PATH = 'lectures.csv'
TEST_CSV_PATH = 'test.csv'
SAMPLE_CSV_PATH = 'example_sample_submission.csv'

if KAGGLE:
    TRAIN_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/train.csv'
    QUESTIONS_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/questions.csv'
    LECTURES_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/lectures.csv'
    TEST_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/test.csv'
    SAMPLE_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv'

COLUMN_TYPES = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}


In [7]:
# train_df = pd.read_csv(TRAIN_CSV_PATH, nrows=10**5, dtype=COLUMN_TYPES)
train_df = pd.read_feather(TRAIN_CSV_PATH)
questions_df = pd.read_csv(QUESTIONS_CSV_PATH)
lectures_df = pd.read_csv(LECTURES_CSV_PATH)

#### Check for user specific features

In [8]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False
...,...,...,...,...,...,...,...,...,...,...
101230327,101230327,428564420,2147482888,3586,0,22,0,1,18000.0,True
101230328,101230328,428585000,2147482888,6341,0,23,3,1,14000.0,True
101230329,101230329,428613475,2147482888,4212,0,24,3,1,14000.0,True
101230330,101230330,428649406,2147482888,6343,0,25,1,0,22000.0,True


##### 1. Get timestamp as hours and minutes

In [9]:
print("user ids: ", train_df.user_id.unique(), ", user count: ", len(train_df.user_id.unique()))

user ids:  [       115        124       2746 ... 2147481750 2147482216 2147482888] , user count:  393656


In [10]:
df = train_df['user_id'].value_counts().reset_index()
df.columns = [
    'user_id', 
    'count'
]
df = df.sort_values('count')
print("number of users with less interactions: {} \n".format(df[:1]))
print("number of users with more interactions: {} \n".format(df[-1:]))

number of users with less interactions:           user_id  count
393655  607601423      1 

number of users with more interactions:      user_id  count
0  801103753  17917 



In [11]:
student_id = 1283420

In [12]:
feature_df = train_df[train_df['user_id']==student_id]

In [13]:
def get_timestamp_in_parts(row):
    millis = row.timestamp
    seconds = (millis / 1000)
    minutes = (seconds / 60)
    hours = (minutes / 60)
    days = (hours / 24)
    return days, hours, minutes

def get_prior_elasped_time_in_parts(row):
    prior_millis = row.prior_question_elapsed_time
    prior_seconds = (prior_millis / 1000)
    prior_minutes = (prior_seconds / 60)
    return prior_minutes, prior_seconds

feature_df[['days', 'hours', 'minutes']] = [*feature_df.apply(lambda row: get_timestamp_in_parts(row), axis=1)]
feature_df[['prior_minutes', 'prior_seconds']] = [*feature_df.apply(lambda row: get_prior_elasped_time_in_parts(row), axis=1)]

In [14]:
feature_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds
54058,54058,0,1283420,3822,0,0,1,1,,,0.000000,0.000000,0.000000,,
54059,54059,29795,1283420,638,0,1,0,1,11000.0,False,0.000345,0.008276,0.496583,0.183333,11.000
54060,54060,91982,1283420,4231,0,2,0,0,28000.0,False,0.001065,0.025551,1.533033,0.466667,28.000
54061,54061,117342,1283420,7978,0,3,3,0,60000.0,False,0.001358,0.032595,1.955700,1.000000,60.000
54062,54062,145733,1283420,8225,0,4,1,1,23000.0,False,0.001687,0.040481,2.428883,0.383333,23.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61622,61622,22364257029,1283420,2041,0,5260,0,1,16000.0,True,258.845567,6212.293619,372737.617150,0.266667,16.000
61623,61623,22364257029,1283420,2039,0,5260,3,1,16000.0,True,258.845567,6212.293619,372737.617150,0.266667,16.000
61624,61624,22373215518,1283420,3401,0,5261,1,1,26333.0,True,258.949254,6214.782088,372886.925300,0.438883,26.333
61625,61625,22373215518,1283420,3400,0,5261,2,0,26333.0,True,258.949254,6214.782088,372886.925300,0.438883,26.333


##### 2. Find the intro rows.

In [17]:
# There are no actions which are having elapsed time when the action is lecture and the explanation is false (it will be always false for lectures)
train_df[(train_df.answered_correctly == -1) &  (train_df.prior_question_had_explanation==False)].count()

row_id                            1959032
timestamp                         1959032
user_id                           1959032
content_id                        1959032
content_type_id                   1959032
task_container_id                 1959032
user_answer                       1959032
answered_correctly                1959032
prior_question_elapsed_time             0
prior_question_had_explanation    1959032
dtype: int64

In [18]:
train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==1)].count()
# There are 80 rows related to the intro lecutures

row_id                            80
timestamp                         80
user_id                           80
content_id                        80
content_type_id                   80
task_container_id                 80
user_answer                       80
answered_correctly                80
prior_question_elapsed_time        0
prior_question_had_explanation    80
dtype: int64

In [21]:
len(train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==0)].user_id.unique())

393576

In [28]:
len(train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==1)].user_id.unique())

80

In [None]:
# The unique user count is 393656
# The count when added will be 393576 + 80

In [22]:
# some of the intro are having the explanation
len(train_df[~(train_df.timestamp == 0) & (train_df.prior_question_had_explanation.isnull())].user_id.unique())

65

In [24]:
# most of the elapsed time is null for intro sections
len(train_df[~(train_df.timestamp == 0) & (train_df.prior_question_elapsed_time.isnull())].user_id.unique())


149605

In [25]:
dropped_duplicates = train_df.drop_duplicates(subset=['user_id'])
dropped_duplicates

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
46,46,0,124,7900,0,0,0,1,,
76,76,0,2746,5273,0,0,1,0,,
96,96,0,5382,5000,0,0,0,1,,
224,224,0,8623,3915,0,0,3,1,,
...,...,...,...,...,...,...,...,...,...,...
101228989,101228989,0,2147470770,7900,0,0,0,1,,
101229217,101229217,0,2147470777,7900,0,0,2,0,,
101229975,101229975,0,2147481750,4137,0,0,0,0,,
101230025,101230025,0,2147482216,3748,0,0,1,0,,


In [27]:
len(train_df[(train_df.timestamp == 0)].user_id.unique())
# This proves that the timestamp 0 is the beginning for all the students

393656

In [31]:
train_df['intro_section'] = np.where(train_df.timestamp == 0, True, False)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section
0,0,0,115,5692,0,1,3,1,,,True
1,1,56943,115,5716,0,2,2,1,37000.0,False,False
2,2,118363,115,128,0,0,0,1,55000.0,False,False
3,3,131167,115,7860,0,3,0,1,19000.0,False,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,False


##### 3. The actions after the lectures should be tagged with something.

In [49]:
train_df[(train_df['user_id']==student_id) & (train_df['row_id'] >= 54177)][:100]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section
54177,54177,1107546182,1283420,17124,1,89,-1,-1,,False,False
54178,54178,1107572322,1283420,5347,0,90,2,1,37000.0,True,False
54179,54179,1107613497,1283420,5330,0,91,0,1,12000.0,True,False
54180,54180,1107657638,1283420,3990,0,88,1,1,26000.0,True,False
54181,54181,1107714985,1283420,9150,0,92,1,0,21000.0,True,False
54182,54182,1107794203,1283420,3717,0,93,2,0,7000.0,True,False
54183,54183,1107909885,1283420,5166,0,94,1,1,26000.0,True,False
54184,54184,1108052965,1283420,5672,0,95,0,1,35000.0,True,False
54185,54185,1108186344,1283420,8958,0,96,3,1,59000.0,True,False
54186,54186,1108306940,1283420,4474,0,97,1,1,65000.0,True,False


In [53]:
lectures_df.type_of.unique()

array(['concept', 'solving question', 'intention', 'starter'],
      dtype=object)

In [61]:
lectures_df[lectures_df.type_of=='starter']

Unnamed: 0,lecture_id,tag,part,type_of
54,4385,181,5,starter
261,21169,151,5,starter
362,28569,27,6,starter


In [72]:
starter_ids = train_df[(train_df.timestamp == 0)].task_container_id.unique()

In [73]:
questions_df[questions_df.question_id.isin(starter_ids)]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
5,5,5,2,1,131 149 162 81
6,6,6,2,1,10 94 162 92
7,7,7,0,1,61 110 162 29
8,8,8,3,1,131 13 162 92
9,9,9,3,1,10 164 81


In [74]:
questions_df[questions_df.part == 1]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
...,...,...,...,...,...
12930,12930,12930,0,1,9 10 92
12931,12931,12931,2,1,131 104 92
12932,12932,12932,0,1,131 187 92
12933,12933,12933,3,1,9 10 92


In [76]:
train_df[(train_df.timestamp != 0) & (train_df.task_container_id.isin(starter_ids))]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section
1,1,56943,115,5716,0,2,2,1,37000.0,False,False
2,2,118363,115,128,0,0,0,1,55000.0,False,False
3,3,131167,115,7860,0,3,0,1,19000.0,False,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,False
5,5,157063,115,156,0,5,2,1,5000.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...
101230317,101230317,254505467,2147482888,3642,0,12,1,0,33000.0,True,False
101230318,101230318,254538722,2147482888,6012,0,13,3,1,37000.0,True,False
101230320,101230320,428190226,2147482888,4496,0,15,0,1,25000.0,True,False
101230321,101230321,428225923,2147482888,4425,0,16,1,0,26000.0,True,False


In [None]:
# This shows that there a way more intro sections.

In [78]:
train_df['intro_section'] = np.where(train_df.task_container_id.isin(starter_ids), True, False)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section
0,0,0,115,5692,0,1,3,1,,,True
1,1,56943,115,5716,0,2,2,1,37000.0,False,True
2,2,118363,115,128,0,0,0,1,55000.0,False,True
3,3,131167,115,7860,0,3,0,1,19000.0,False,True
4,4,137965,115,7922,0,4,1,1,11000.0,False,True
