#### Imports

In [None]:
import pandas as pd
# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100

# Truthy when the notebook runs on Kaggle; the CSV path constants defined
# later are then switched to the competition's /kaggle/input directory.
KAGGLE = 0

#### Global Variables

In [None]:
# Location of the competition files: bare file names (current working
# directory) by default, the Kaggle input directory when KAGGLE is truthy.
_DATA_DIR = '/kaggle/input/riiid-test-answer-prediction/' if KAGGLE else ''

TRAIN_CSV_PATH = _DATA_DIR + 'train.csv'
QUESTIONS_CSV_PATH = _DATA_DIR + 'questions.csv'
LECTURES_CSV_PATH = _DATA_DIR + 'lectures.csv'
TEST_CSV_PATH = _DATA_DIR + 'test.csv'
SAMPLE_CSV_PATH = _DATA_DIR + 'example_sample_submission.csv'

# Explicit per-column dtypes passed to pd.read_csv for the training file.
COLUMN_TYPES = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}


In [None]:
# Only the first 100,000 rows of the training file are loaded, using the
# dtypes from COLUMN_TYPES; the lecture and question tables are read whole.
train_df = pd.read_csv(
    TRAIN_CSV_PATH,
    dtype=COLUMN_TYPES,
    nrows=100_000,
)
lectures_df = pd.read_csv(LECTURES_CSV_PATH)
questions_df = pd.read_csv(QUESTIONS_CSV_PATH)

In [None]:
# Empty frame, presumably meant to collect engineered features.
# NOTE(review): the later cells assign to `feature_df` (singular); this
# `features_df` is not referenced again in the visible part of the notebook.
features_df = pd.DataFrame()

#### Check for user-specific features

In [None]:
# Display the rows of the training file that were loaded into train_df.
train_df

In [None]:
# Distinct user ids among the loaded rows, and how many of them there are.
unique_user_ids = train_df.user_id.unique()
print("user ids: ", unique_user_ids, ", user count: ", len(unique_user_ids))

In [None]:
# Interactions per user: one row per user_id with the number of rows it has
# in train_df, sorted from least to most active. The two columns are named
# explicitly via rename_axis / reset_index(name=...).
df = (
    train_df['user_id']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='count')
    .sort_values('count')
)

# After the ascending sort the first row is the least active user and the
# last row the most active one; each print shows that single row
# (user_id and count), so the labels describe a user, not a number of users.
print("user with the fewest interactions:\n{}\n".format(df.head(1)))
print("user with the most interactions:\n{}\n".format(df.tail(1)))

In [None]:
# user_id of the single student whose rows are inspected in the next cells.
student_id = 1_283_420

In [None]:
# Display every loaded row whose user_id equals the selected student.
train_df.loc[train_df['user_id'].eq(student_id)]

In [None]:
# Rows of the selected student, taken as an independent copy. New columns
# are assigned to feature_df later on; without .copy() those assignments
# would target a boolean-mask slice of train_df and make pandas emit
# SettingWithCopyWarning.
feature_df = train_df[train_df['user_id'] == student_id].copy()

In [None]:
# Raise the display cap to 10,000 rows for the cells that follow.
pd.options.display.max_rows = 10_000

In [None]:
def get_timestamp_in_parts(row):
    """Express ``row.timestamp`` in days, hours, minutes and seconds.

    The timestamp is treated as a millisecond count. Every returned value
    is the *whole* duration converted to that unit (e.g. one day yields
    24 hours and 1440 minutes), not a days/hours/minutes/seconds breakdown.

    Args:
        row: Any object exposing a numeric ``timestamp`` attribute, such as
            a DataFrame row passed by ``DataFrame.apply(..., axis=1)``.

    Returns:
        A tuple ``(days, hours, minutes, seconds)``.
    """
    # Start from seconds and walk up the units, dividing step by step.
    parts = [row.timestamp / 1000]
    for units_per_next in (60, 60, 24):  # -> minutes, hours, days
        parts.append(parts[-1] / units_per_next)
    # parts is ordered seconds..days; callers expect days first.
    return tuple(reversed(parts))

def get_prior_elasped_time_in_parts(row):
    """Express ``row.prior_question_elapsed_time`` in hours, minutes, seconds.

    The elapsed time is treated as a millisecond count. Every returned value
    is the whole duration converted to that unit, not an h/m/s breakdown.

    Args:
        row: Any object exposing a numeric ``prior_question_elapsed_time``
            attribute, such as a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        A tuple ``(prior_hours, prior_minutes, prior_seconds)``.
    """
    elapsed_seconds = row.prior_question_elapsed_time / 1000
    elapsed_minutes = elapsed_seconds / 60
    return elapsed_minutes / 60, elapsed_minutes, elapsed_seconds

# Add the timestamp expressed in days/hours/minutes/seconds as new columns.
# result_type='expand' is required: the helper returns a tuple per row, and
# without it apply() yields a single Series of tuples that cannot be spread
# over several columns. The column key is a flat list with exactly one name
# per returned value, so the existing 'answered_correctly' column is left
# untouched.
feature_df[['days', 'hours', 'minutes', 'seconds']] = feature_df.apply(
    get_timestamp_in_parts, axis=1, result_type='expand'
)

# Same for the elapsed time of the prior question (hours/minutes/seconds).
feature_df[['prior_hours', 'prior_minutes', 'prior_seconds']] = feature_df.apply(
    get_prior_elasped_time_in_parts, axis=1, result_type='expand'
)

In [None]:
# Display the questions_df row(s) whose question_id is 57.
questions_df.loc[questions_df['question_id'].eq(57)]