#### Imports

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 100)

KAGGLE = 0

#### Global Variables

In [2]:
TRAIN_CSV_PATH = 'train.feather'
QUESTIONS_CSV_PATH = 'questions.csv'
LECTURES_CSV_PATH = 'lectures.csv'
TEST_CSV_PATH = 'test.csv'
SAMPLE_CSV_PATH = 'example_sample_submission.csv'

if KAGGLE:
    TRAIN_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/train.csv'
    QUESTIONS_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/questions.csv'
    LECTURES_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/lectures.csv'
    TEST_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/test.csv'
    SAMPLE_CSV_PATH = '/kaggle/input/riiid-test-answer-prediction/example_sample_submission.csv'

COLUMN_TYPES = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}


In [3]:
# train_df = pd.read_csv(TRAIN_CSV_PATH, nrows=10**5, dtype=COLUMN_TYPES)
train_df = pd.read_feather(TRAIN_CSV_PATH)
train_df = train_df.iloc[:1000]
questions_df = pd.read_csv(QUESTIONS_CSV_PATH)
lectures_df = pd.read_csv(LECTURES_CSV_PATH)

#### Check for user specific features

In [4]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False
3,3,131167,115,7860,0,3,0,1,19000.0,False
4,4,137965,115,7922,0,4,1,1,11000.0,False
...,...,...,...,...,...,...,...,...,...,...
995,995,15092368574,13134,2334,0,297,3,0,22333.0,True
996,996,15092368574,13134,2333,0,297,0,0,22333.0,True
997,997,15092493419,13134,2182,0,298,3,0,31666.0,True
998,998,15092493419,13134,2184,0,298,0,1,31666.0,True


##### 1. Get timestamp as hours and minutes

In [5]:
print("user ids: ", train_df.user_id.unique(), ", user count: ", len(train_df.user_id.unique()))

user ids:  [  115   124  2746  5382  8623  8701 12741 13134] , user count:  8


In [6]:
df = train_df['user_id'].value_counts().reset_index()
df.columns = [
    'user_id', 
    'count'
]
df = df.sort_values('count')
print("number of users with less interactions: {} \n".format(df[:1]))
print("number of users with more interactions: {} \n".format(df[-1:]))

number of users with less interactions:    user_id  count
7     8701     17 

number of users with more interactions:    user_id  count
0    13134    376 



In [7]:
student_id = 115

In [8]:
feature_df = train_df[train_df['user_id']==student_id]

In [9]:
def get_timestamp_in_parts(row):
    millis = row.timestamp
    seconds = (millis / 1000)
    minutes = (seconds / 60)
    hours = (minutes / 60)
    days = (hours / 24)
    return days, hours, minutes

def get_prior_elasped_time_in_parts(row):
    prior_millis = row.prior_question_elapsed_time
    prior_seconds = (prior_millis / 1000)
    prior_minutes = (prior_seconds / 60)
    return prior_minutes, prior_seconds

feature_df[['days', 'hours', 'minutes']] = [*feature_df.apply(lambda row: get_timestamp_in_parts(row), axis=1)]
feature_df[['prior_minutes', 'prior_seconds']] = [*feature_df.apply(lambda row: get_prior_elasped_time_in_parts(row), axis=1)]

In [59]:
train_df[['days', 'hours', 'minutes']] = [*train_df.apply(lambda row: get_timestamp_in_parts(row), axis=1)]
train_df[['prior_minutes', 'prior_seconds']] = [*train_df.apply(lambda row: get_prior_elasped_time_in_parts(row), axis=1)]

In [10]:
feature_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,days,hours,minutes,prior_minutes,prior_seconds
0,0,0,115,5692,0,1,3,1,,,0.0,0.0,0.0,,
1,1,56943,115,5716,0,2,2,1,37000.0,False,0.000659,0.015817,0.94905,0.616667,37.0
2,2,118363,115,128,0,0,0,1,55000.0,False,0.00137,0.032879,1.972717,0.916667,55.0
3,3,131167,115,7860,0,3,0,1,19000.0,False,0.001518,0.036435,2.186117,0.316667,19.0
4,4,137965,115,7922,0,4,1,1,11000.0,False,0.001597,0.038324,2.299417,0.183333,11.0
5,5,157063,115,156,0,5,2,1,5000.0,False,0.001818,0.043629,2.617717,0.083333,5.0
6,6,176092,115,51,0,6,0,1,17000.0,False,0.002038,0.048914,2.934867,0.283333,17.0
7,7,194190,115,50,0,7,3,1,17000.0,False,0.002248,0.053942,3.2365,0.283333,17.0
8,8,212463,115,7896,0,8,2,1,16000.0,False,0.002459,0.059017,3.54105,0.266667,16.0
9,9,230983,115,7863,0,9,0,1,16000.0,False,0.002673,0.064162,3.849717,0.266667,16.0


##### 2. Find the intro rows.

In [11]:
# There are no actions which are having elapsed time when the action is lecture and the explanation is false (it will be always false for lectures)
train_df[(train_df.answered_correctly == -1) &  (train_df.prior_question_had_explanation==False)].count()

row_id                            18
timestamp                         18
user_id                           18
content_id                        18
content_type_id                   18
task_container_id                 18
user_answer                       18
answered_correctly                18
prior_question_elapsed_time        0
prior_question_had_explanation    18
dtype: int64

In [12]:
train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==1)].count()
# There are 80 rows related to the intro lecutures

row_id                            0
timestamp                         0
user_id                           0
content_id                        0
content_type_id                   0
task_container_id                 0
user_answer                       0
answered_correctly                0
prior_question_elapsed_time       0
prior_question_had_explanation    0
dtype: int64

In [13]:
len(train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==0)].user_id.unique())

8

In [14]:
len(train_df[(train_df.timestamp == 0) &  (train_df.content_type_id==1)].user_id.unique())

0

In [15]:
# The unique user count is 393656
# The count when added will be 393576 + 80

In [16]:
# some of the intro are having the explanation
len(train_df[~(train_df.timestamp == 0) & (train_df.prior_question_had_explanation.isnull())].user_id.unique())

0

In [17]:
# most of the elapsed time is null for intro sections
len(train_df[~(train_df.timestamp == 0) & (train_df.prior_question_elapsed_time.isnull())].user_id.unique())


5

In [18]:
dropped_duplicates = train_df.drop_duplicates(subset=['user_id'])
dropped_duplicates

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
46,46,0,124,7900,0,0,0,1,,
76,76,0,2746,5273,0,0,1,0,,
96,96,0,5382,5000,0,0,0,1,,
224,224,0,8623,3915,0,0,3,1,,
336,336,0,8701,3901,0,0,2,1,,
353,353,0,12741,5145,0,0,3,0,,
624,624,0,13134,3926,0,0,3,1,,


In [19]:
len(train_df[(train_df.timestamp == 0)].user_id.unique())
# This proves that the timestamp 0 is the beginning for all the students

8

In [20]:
train_df['intro_section'] = np.where(train_df.timestamp == 0, True, False)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section
0,0,0,115,5692,0,1,3,1,,,True
1,1,56943,115,5716,0,2,2,1,37000.0,False,False
2,2,118363,115,128,0,0,0,1,55000.0,False,False
3,3,131167,115,7860,0,3,0,1,19000.0,False,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,False


In [21]:
lectures_df.type_of.unique()

array(['concept', 'solving question', 'intention', 'starter'],
      dtype=object)

In [22]:
lectures_df[lectures_df.type_of=='starter']

Unnamed: 0,lecture_id,tag,part,type_of
54,4385,181,5,starter
261,21169,151,5,starter
362,28569,27,6,starter


In [23]:
lectures_df[lectures_df.type_of=='intention']

Unnamed: 0,lecture_id,tag,part,type_of
38,3153,62,2,intention
65,5752,6,2,intention
76,6808,129,2,intention
103,8976,62,2,intention
109,9554,69,2,intention
174,14325,69,2,intention
349,27699,129,2,intention


In [24]:
starter_ids = train_df[(train_df.timestamp == 0)].task_container_id.unique()

In [25]:
questions_df[questions_df.question_id.isin(starter_ids)]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81


In [26]:
questions_df[questions_df.part == 1]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
...,...,...,...,...,...
12930,12930,12930,0,1,9 10 92
12931,12931,12931,2,1,131 104 92
12932,12932,12932,0,1,131 187 92
12933,12933,12933,3,1,9 10 92


In [27]:
train_df[(train_df.timestamp != 0) & (train_df.task_container_id.isin(starter_ids))]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section
2,2,118363,115,128,0,0,0,1,55000.0,False,False
47,47,32683,124,7876,0,1,0,0,26000.0,False,False
77,77,21592,2746,758,0,1,0,0,28000.0,False,False
97,97,39828,5382,3944,0,1,1,0,24000.0,False,False
225,225,38769,8623,4750,0,1,1,1,16000.0,False,False
337,337,17833,8701,6671,0,1,0,1,13000.0,False,False
354,354,22273,12741,9691,0,1,3,1,13000.0,False,False
625,625,23840,13134,564,0,1,1,0,22000.0,False,False


In [28]:
# This shows that there a way more intro sections.

In [29]:
train_df['intro_section'] = np.where(train_df.task_container_id.isin(starter_ids), True, False)
train_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section
0,0,0,115,5692,0,1,3,1,,,True
1,1,56943,115,5716,0,2,2,1,37000.0,False,False
2,2,118363,115,128,0,0,0,1,55000.0,False,True
3,3,131167,115,7860,0,3,0,1,19000.0,False,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,False


##### 3. The actions after the lectures should be tagged with something.

In [30]:
train_df[(train_df['user_id']==student_id) & (train_df['row_id'] >= 54177)][:100]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,intro_section


In [31]:
# Things to do here with the issue with the lecture things being more positive towards answering
# we have to combine the question and lectures to do something like this
# 1. There rows which come after the lecture should be values based on the tag or the type of question
# example: if the user is having the lecture on concept and the question is based on some other type then the value given will not be correct so we have to come up with some thing brillent to check this.
# 2. Time is a great factor to look out for, the time the lecture is seen and the time at which it is answer will be depentent

In [32]:
# First we will check what all question and lecture relate

In [33]:
lectures_df[lectures_df.type_of=="starter"].tag.unique()

array([181, 151,  27])

In [34]:
questions_df[questions_df.tags.str.contains(r"^27", na=False)]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
6484,6484,6481,1,6,27
6488,6488,6485,3,6,27 162
6492,6492,6489,3,6,27
6496,6496,6493,0,6,27
6498,6498,6497,3,6,27
...,...,...,...,...,...
11182,11182,11179,3,6,27
11616,11616,11615,0,6,27
11621,11621,11619,2,6,27
11626,11626,11623,1,6,27


In [35]:
questions_df

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38
...,...,...,...,...,...
13518,13518,13518,3,5,14
13519,13519,13519,3,5,8
13520,13520,13520,2,5,73
13521,13521,13521,0,5,125


In [36]:
lectures_df

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question
...,...,...,...,...
413,32535,8,5,solving question
414,32570,113,3,solving question
415,32604,24,6,concept
416,32625,142,2,concept


In [37]:
feature_df = train_df.join(questions_df.set_index("question_id"), 'content_id', how='left')

In [279]:
lecture_dict = lectures_df.set_index("lecture_id").to_dict(orient="index")
lecture_comprehensive_type_of_dict = lectures_df.set_index('type_of').groupby(level=0).apply(lambda row: row.to_dict('list')).to_dict()
lecture_comprehensive_part_dict = lectures_df.set_index('part').groupby(level=0).apply(lambda row: row.to_dict('list')).to_dict()

In [290]:
question_dict = questions_df.set_index("question_id").to_dict(orient="index")


In [283]:
for key in lecture_comprehensive_part_dict:
    lecture_comprehensive_part_dict[key]['tag'] = list(set(lecture_comprehensive_part_dict[key]['tag']))
    lecture_comprehensive_part_dict[key]['type_of'] = list(set(lecture_comprehensive_part_dict[key]['type_of']))

for key in lecture_comprehensive_type_of_dict:
    lecture_comprehensive_type_of_dict[key]['tag'] = list(set(lecture_comprehensive_type_of_dict[key]['tag']))
    lecture_comprehensive_type_of_dict[key]['part'] = list(set(lecture_comprehensive_type_of_dict[key]['part']))

In [285]:
for key in lecture_comprehensive_part_dict:
    print(lecture_comprehensive_part_dict[key]['type_of'])

['concept', 'solving question']
['concept', 'solving question', 'intention']
['solving question', 'concept']
['concept', 'solving question']
['concept', 'starter', 'solving question']
['concept', 'starter', 'solving question']
['concept', 'solving question']


In [277]:
questions_dict = questions_df.set_index('question_id').to_dict(orient='index')

In [294]:
question_dict[13249]

{'bundle_id': 13247,
 'correct_answer': 1,
 'part': 3,
 'tags': '136 81 92',
 'has_related_tag': False}

In [297]:
question_dict[13249] in lecture_comprehensive_type_of_dict['starter']['part']

False

In [315]:
def apply_filter_based_on_previous_row(func):
    prev_row = {
        "has_seen_lecture": False,
        "previous_lecture_hours": 0,
        "previous_lecture": {},
        "user_id": ""
    }
    def wrapper(curr_row, **kwargs):
        content_id = curr_row['content_id']
        has_seen_same_tag_as_lecture = False
        has_seen_same_part_as_lecture = False
        has_part_common_with_type_of = False
        has_tag_common_with_type_of = False
        has_tag_common_with_part_dict = False
        has_type_of_common_with_part_dict = False
        if prev_row['user_id'] == curr_row['user_id']:
            if curr_row['content_type_id'] != 0:
                prev_row['has_seen_lecture'] = True
                prev_row['previous_lecture_hours'] = curr_row['hours']
                prev_row['previous_lecture'] = lecture_dict[content_id]
            else:
                has_seen_same_tag_as_lecture = (str(prev_row['previous_lecture'].get("tag", "")) in str(question_dict[content_id]['tags']).split())
                has_seen_same_part_as_lecture = (prev_row['previous_lecture'].get("part") == question_dict[content_id]['part'])
                prev_type_of = prev_row['previous_lecture'].get("type_of")
                prev_part = prev_row['previous_lecture'].get("part")
                has_part_common_with_type_of = question_dict[content_id]['part'] in  lecture_comprehensive_type_of_dict.get(prev_type_of, {}).get('part', [])
                has_tag_common_with_type_of = bool(set(map(int, str(question_dict[content_id]['tags']).split())).intersection(lecture_comprehensive_type_of_dict.get(prev_type_of, {}).get('tag', [])))
                has_tag_common_with_part_dict = bool(set(map(int, str(question_dict[content_id]['tags']).split())).intersection(lecture_comprehensive_part_dict.get(prev_part, {}).get('tag', [])))
                has_type_of_common_with_part_dict = prev_type_of in lecture_comprehensive_part_dict[question_dict[content_id]['part']]['type_of']
        else:
            prev_row['user_id'] = curr_row['user_id']
            if curr_row['content_type_id'] != 0:
                prev_row['has_seen_lecture'] = True
                prev_row['previous_lecture_hours'] = curr_row['hours']
                prev_row['previous_lecture'] = lecture_dict[content_id]
            else:
                prev_row['has_seen_lecture'] = False
                prev_row['previous_lecture_hours'] = 0
                prev_row['previous_lecture'] = {}

        timestamp_difference = curr_row['hours'] - prev_row['previous_lecture_hours']
        retention = func(timestamp_difference)
        return retention, prev_row['has_seen_lecture'], has_seen_same_tag_as_lecture, has_seen_same_part_as_lecture, has_part_common_with_type_of, has_tag_common_with_type_of, has_tag_common_with_part_dict, has_type_of_common_with_part_dict
    return wrapper

@apply_filter_based_on_previous_row
def running_retention(timestamp_difference):
    retention = 1.48 / ((1.25 * timestamp_difference) + 1.48)
    return retention

train_df["retention"], train_df["has_seen_lecture_before"], train_df["has_seen_same_tag_as_lecture"], train_df["has_seen_same_part_as_lecture"], train_df["has_part_common_with_type_of"], train_df["has_tag_common_with_type_of"], train_df["has_tag_common_with_part_dict"], train_df["has_type_of_common_with_part_dict"] = zip(*train_df.apply(running_retention, axis=1))

In [316]:
train_df

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,prior_minutes,prior_seconds,retention,has_seen_lecture_before,has_seen_same_tag_as_lecture,has_seen_same_part_as_lecture,has_part_common_with_type_of,has_tag_common_with_type_of,has_tag_common_with_part_dict,has_type_of_common_with_part_dict
0,0,0,115,5692,0,1,3,1,,,...,,,1.000000,False,False,False,False,False,False,False
1,1,56943,115,5716,0,2,2,1,37000.0,False,...,0.616667,37.000,0.986817,False,False,False,False,False,False,False
2,2,118363,115,128,0,0,0,1,55000.0,False,...,0.916667,55.000,0.972981,False,False,False,False,False,False,False
3,3,131167,115,7860,0,3,0,1,19000.0,False,...,0.316667,19.000,0.970146,False,False,False,False,False,False,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,...,0.183333,11.000,0.968647,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,995,15092368574,13134,2334,0,297,3,0,22333.0,True,...,0.372217,22.333,0.056078,True,False,False,True,True,False,True
996,996,15092368574,13134,2333,0,297,0,0,22333.0,True,...,0.372217,22.333,0.056078,True,False,False,True,True,False,True
997,997,15092493419,13134,2182,0,298,3,0,31666.0,True,...,0.527767,31.666,0.055986,True,False,False,True,True,False,True
998,998,15092493419,13134,2184,0,298,0,1,31666.0,True,...,0.527767,31.666,0.055986,True,False,False,True,True,False,True


In [317]:
train_df[770:800]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,prior_minutes,prior_seconds,retention,has_seen_lecture_before,has_seen_same_tag_as_lecture,has_seen_same_part_as_lecture,has_part_common_with_type_of,has_tag_common_with_type_of,has_tag_common_with_part_dict,has_type_of_common_with_part_dict
770,770,15014038675,13134,4683,0,139,1,1,19000.0,True,...,0.316667,19.0,0.00206,True,False,False,True,True,False,True
771,771,15014061350,13134,5719,0,140,2,1,5000.0,True,...,0.083333,5.0,0.00206,True,False,False,True,True,False,True
772,772,15015426698,13134,5905,0,141,0,1,9000.0,True,...,0.15,9.0,0.002058,True,False,False,True,True,False,True
773,773,15016111296,13134,10002,0,142,1,1,16000.0,True,...,0.266667,16.0,0.002058,True,False,False,True,True,False,True
774,774,15016762442,13134,8165,0,143,1,1,22000.0,True,...,0.366667,22.0,0.002057,True,False,False,True,True,False,True
775,775,15016774763,13134,4521,0,144,0,1,15000.0,True,...,0.25,15.0,0.002057,True,False,False,True,True,False,True
776,776,15016796241,13134,8786,0,145,3,1,4000.0,True,...,0.066667,4.0,0.002057,True,False,False,True,True,False,True
777,777,15016826052,13134,9047,0,146,3,1,13000.0,True,...,0.216667,13.0,0.002057,True,False,False,True,True,False,True
778,778,15016850331,13134,6004,0,147,0,1,18000.0,True,...,0.3,18.0,0.002057,True,False,False,True,True,False,True
779,779,15016872315,13134,5255,0,148,0,1,12000.0,True,...,0.2,12.0,0.002057,True,False,False,True,True,False,True


In [323]:
train_df[(train_df['has_type_of_common_with_part_dict']!=True) & (train_df['content_type_id']==0)]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,prior_minutes,prior_seconds,retention,has_seen_lecture_before,has_seen_same_tag_as_lecture,has_seen_same_part_as_lecture,has_part_common_with_type_of,has_tag_common_with_type_of,has_tag_common_with_part_dict,has_type_of_common_with_part_dict
0,0,0,115,5692,0,1,3,1,,,...,,,1.000000,False,False,False,False,False,False,False
1,1,56943,115,5716,0,2,2,1,37000.0,False,...,0.616667,37.0,0.986817,False,False,False,False,False,False,False
2,2,118363,115,128,0,0,0,1,55000.0,False,...,0.916667,55.0,0.972981,False,False,False,False,False,False,False
3,3,131167,115,7860,0,3,0,1,19000.0,False,...,0.316667,19.0,0.970146,False,False,False,False,False,False,False
4,4,137965,115,7922,0,4,1,1,11000.0,False,...,0.183333,11.0,0.968647,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,685,12948517301,13134,868,0,54,0,1,12000.0,True,...,0.200000,12.0,0.000329,False,False,False,False,False,False,False
686,686,12948521851,13134,868,0,53,3,0,7000.0,True,...,0.116667,7.0,0.000329,False,False,False,False,False,False,False
687,687,12948556717,13134,1136,0,55,0,1,23000.0,True,...,0.383333,23.0,0.000329,False,False,False,False,False,False,False
688,688,12948599939,13134,758,0,56,0,0,18000.0,True,...,0.300000,18.0,0.000329,False,False,False,False,False,False,False


In [68]:
lectures_ids = train_df[train_df['content_type_id']!=0].content_id.unique()

In [74]:
lecture_tags = lectures_df[lectures_df.lecture_id.isin(lectures_ids)].tag.unique()
#tags.str.contains(r"^27", na=False)

In [106]:
lecture_tags[0]

62

In [124]:
questions_df['has_related_tag'] = questions_df.apply(lambda row: any([ str(tag) in str(row.tags).split() for tag in list(lecture_tags)]), axis=1)

In [125]:
questions_df[questions_df['has_related_tag']]

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,has_related_tag
9,9,9,3,1,10 164 81,True
16,16,16,3,1,131 99 92,True
31,31,31,1,1,131 40 162 81,True
37,37,37,1,1,131 40 81,True
45,45,45,1,1,10 164 162 81,True
...,...,...,...,...,...,...
13087,13087,13085,3,4,161 67 81,True
13089,13089,13088,0,4,161 12 81,True
13092,13092,13091,2,4,161 67 92,True
13385,13385,13385,0,5,52,True


In [309]:
train_df[train_df['content_type_id']!=0]

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,...,prior_minutes,prior_seconds,retention,has_seen_lecture_before,has_seen_same_tag_as_lecture,has_seen_same_part_as_lecture,has_part_common_with_type_of,has_tag_common_with_type_of,has_tag_common_with_part_dict,has_type_of_common_with_part_dict
89,89,653762,2746,6808,1,14,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
117,117,10183847,5382,16736,1,21,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
212,212,1424348597,5382,30207,1,104,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
216,216,1425557777,5382,18545,1,121,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
295,295,405813029,8623,10540,1,59,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
302,302,406265768,8623,25843,1,66,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
323,323,577424049,8623,29544,1,87,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
409,409,3083871588,12741,20307,1,33,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
441,441,3263276452,12741,10688,1,53,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
486,486,3268387671,12741,10217,1,66,-1,-1,,False,...,,,1.0,True,False,False,False,False,False,False
