# TakeItEZ -- submission
### HyunJae Pi, hyunpi@brandeis.edu, 12/10/2020

In [None]:
import riiideducation
import numpy as np
import pandas as pd
import lightgbm as lgb
import datatable as dt

import gc
gc.enable()
from sklearn.model_selection import train_test_split

import pickle
import matplotlib.pyplot as plt

# Preprocess data

In [None]:
%%time

train = dt.fread('../input/riiid-test-answer-prediction/train.csv').to_pandas()

In [None]:
%%time

train = train.astype({'row_id' : 'int32',
                      'timestamp': 'int64',
                      'user_id': 'int32',
                      'content_id': 'int16',
                      'content_type_id': 'int8',
                      'task_container_id': 'int16',
                      'user_answer': 'int8',
                      'answered_correctly': 'int8',
                      'prior_question_elapsed_time': 'float32',
                      'prior_question_had_explanation': 'bool',
                      })

In [None]:
train.info()

In [None]:
train.drop(['row_id', 'task_container_id', 'user_answer'], axis=1, inplace=True)

In [None]:
train['prior_question_elapsed_time'].fillna(0, inplace = True)

In [None]:
%%time

# user dataframe
user_df = train[train.answered_correctly != -1].groupby('user_id').agg({'answered_correctly': ['count', 'mean'], 'prior_question_had_explanation': ['mean']}).reset_index()
user_df.columns = ['user_id', 'user_n_questions_answered', 'user_mean_accuracy','user_boolean_mean_prior_question_had_explanation'] 

user_df['user_n_questions_answered'] = np.log(user_df['user_n_questions_answered'])

user_lect = train.groupby(["user_id", "answered_correctly"]).size().unstack()
user_lect.columns = ['Lecture', 'Wrong', 'Right'] # -1, 0, 1
user_lect['Lecture'] = user_lect['Lecture'].fillna(0)
user_lect = user_lect.astype('Int64')
user_lect['user_watched_lecture'] = np.where(user_lect.Lecture > 0, 1, 0)
user_lect = user_lect.reset_index()
user_lect = user_lect[['user_id', 'user_watched_lecture']]

user_df = user_df.merge(user_lect, on = "user_id", how = "left")
del user_lect
gc.collect()

In [None]:
# Distribution of log(user_n_questions_answered) over users.
n_answers = user_df['user_n_questions_answered']

fig, ax = plt.subplots(figsize=(12, 6))
n_answers.plot.hist(bins=100, ax=ax)
ax.set_xlabel('log(user_n_question_answered)')
plt.show()

In [None]:
# Persist the per-user features as CSV (index column omitted).
user_df.to_csv('user_df.csv', index=False)

In [None]:
%%time

# content dataframe
content_df = train[train.answered_correctly != -1].groupby('content_id').agg({'answered_correctly': ['count', 'mean']}).reset_index()
content_df.columns = ['content_id', 'content_n_questions', 'content_mean_accuracy']

In [None]:
content_df.to_csv('content_df.csv', index=False)

In [None]:
%%time

# questions.csv
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv')
questions_df['num_tags'] = questions_df['tags'].apply(lambda x:len(x.split()) if pd.notna(x) else 0)
questions_df = questions_df[['question_id','part','num_tags']]
questions_df.columns = ['content_id','part','num_tags'] # changed the column names to merge it later

In [None]:
questions_df.to_csv('questions_df.csv', index=False)

# Merge data

In [None]:
%%time

user_df = dt.fread('../input/dataframes/user_df.csv').to_pandas()
content_df = dt.fread('../input/dataframes/content_df.csv').to_pandas()
questions_df = dt.fread('../input/dataframes/questions_df.csv').to_pandas()

In [None]:
%%time

train = train.merge(user_df, on = "user_id", how = "left")
del user_df

In [None]:
%%time
train = train.astype({'user_n_questions_answered' : 'int8',  
                      'user_mean_accuracy' : 'float32',
                      'user_boolean_mean_prior_question_had_explanation' : 'float32',
                      'user_watched_lecture' : 'bool',
                    })

In [None]:
%%time

train = train.merge(content_df, on = "content_id", how = "left")
del content_df

In [None]:
train['content_n_questions'].fillna(0, inplace = True)
train['content_mean_accuracy'].fillna(0.5, inplace = True)
train = train.astype({'content_n_questions' : 'int16',
                      'content_mean_accuracy' : 'float32',    
                    })

In [None]:
%%time

train = train.merge(questions_df, on = "content_id", how = "left")
del questions_df

In [None]:
gc.collect()

In [None]:
train['part'].fillna(0, inplace = True)
train['num_tags'].fillna(0, inplace = True)

In [None]:
train = train.astype({'part' : 'category',
                      'num_tags' : 'int8',
                    })

In [None]:
%%time
# Rebind `train` to a frame unpickled from disk.
# NOTE(review): this discards the `train` built by the cells above; presumably
# the pickle is a checkpoint of that same state from an earlier run -- confirm.
train = pd.read_pickle('../input/train-tmp/train_tmp.pkl.gzip')

In [None]:
# user engagement: True for rows whose timestamp does not exceed `one_month`.
# 31536000000 equals 365 * 24 * 3600 * 1000 (presumably a year in milliseconds).
one_month = 31536000000/12
train['user_engagement'] = ~train['timestamp'].gt(one_month)
# The raw timestamp is not needed once the flag exists.
train = train.drop(columns='timestamp')

In [None]:
train.part = train.part.astype('category') 
gc.collect()

In [None]:
%%time

train.to_pickle('preprocessed_train_v06.pkl.gzip')

In [None]:
# %%time
# # smaller dataset (one eighth of the shuffled rows)
# train = train.sample(frac=1).reset_index(drop=True) # shuffle data
# idx = int(len(train)/8)
# train[0:idx].to_pickle('preprocessed_train_v07_small.pkl.gzip')

In [None]:
# from IPython.display import FileLink
# FileLink(r'./preprocessed_train_v07.pkl.gzip')
# FileLink(r'./preprocessed_train_v07_small.pkl.gzip')

## LightGBM model

In [None]:
import riiideducation
import numpy as np
import pandas as pd
import lightgbm as lgb
import datatable as dt

import gc
gc.enable()
from sklearn.model_selection import train_test_split

import pickle
import matplotlib.pyplot as plt

In [None]:
%%time

# Load the preprocessed training frame; the commented-out line loads the
# smaller sample instead.
#train = pd.read_pickle('../input/preprocessed-train-v07/preprocessed_train_v07_small.pkl.gzip')
train = pd.read_pickle('../input/preprocessed-train-v07/preprocessed_train_v07.pkl.gzip')

In [None]:
# 12/10a -- to avoid a memory error, used only half the data -- failed: timeout error after >9 hr of running
# 12/10b -- use 1/3 of the data

# Keep only the leading third of the rows (positional slice).
idx = len(train) // 3
#print(idx)
train = train.iloc[:idx]
#print(len(train))
train.info()

In [None]:
%%time

# shuffle & remove lectures
train = train.sample(frac=1).reset_index(drop=True) # shuffle data
train = train[train.answered_correctly != -1] # remove lectures = -1

In [None]:
# part was set to 'category' type and caused an error
train['part'] = train['part'].astype('int8')

In [None]:
# Model input columns, in the order they are passed to LightGBM.
features = [
    # context carried over from the previous question
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    # per-user aggregates
    'user_log_n_questions_answered',
    'user_mean_accuracy',
    'user_boolean_mean_prior_question_had_explanation',
    'user_watched_lecture',
    # per-content aggregate and question metadata
    'content_mean_accuracy',
    'part',
    'num_tags',
    # timestamp-derived flag
    'user_engagement',
]

In [None]:
X = train[features]
y = train['answered_correctly']

In [None]:
del train
gc.collect()

In [None]:
%%time

# train vs. validation dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [None]:
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature = ['part', 'user_engagement', 'user_watched_lecture', 'prior_question_had_explanation'])
lgb_test = lgb.Dataset(X_test, y_test, categorical_feature = ['part', 'user_engagement', 'user_watched_lecture', 'prior_question_had_explanation'])

del X_train, y_train, X_test, y_test
gc.collect()

In [None]:
del X, y
gc.collect()

In [None]:
# LightGBM training parameters: binary objective evaluated with AUC,
# gradient-boosted trees, fixed seed and learning rate.
params = dict(
    objective='binary',
    metric='auc',
    seed=123,
    learning_rate=0.1,
    boosting_type='gbdt',
)

In [None]:
%%time

clf = lgb.train(
    params, lgb_train,
    valid_sets=[lgb_train, lgb_test],
    verbose_eval=50,
    num_boost_round=10000,
    early_stopping_rounds=10,
)

In [None]:
# Plot the trained model's feature importances and save the figure as a PDF.
ax = lgb.plot_importance(clf)
importance_fig = ax.figure
importance_fig.savefig('feature_importance_v07b.pdf')
plt.show()

## Save a trained model

In [None]:
# save a trained model
fname = 'saved_model_v07b.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() passed straight to pickle.dump() is never closed explicitly.
with open(fname, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# # load a saved model
# fname = '../input/saved-models/saved_model.sav'
# clf = pickle.load(open(fname, 'rb'))

## Load the saved model & Submit a prediction

In [None]:
# import riiideducation
# import numpy as np
# import pandas as pd
# import datatable as dt
# import pickle

In [None]:
%%time

# load a saved model
# fname = '../input/saved-models/saved_model_v07b.sav'
# clf = pickle.load(open(fname, 'rb'))

user_df = dt.fread('../input/dataframes/user_df.csv').to_pandas()
content_df = dt.fread('../input/dataframes/content_df.csv').to_pandas()
questions_df = dt.fread('../input/dataframes/questions_df.csv').to_pandas()

# load env
env = riiideducation.make_env()
iter_test = env.iter_test()

In [None]:
# Columns passed to clf.predict(), in this order.
features = [
    # context carried over from the previous question
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    # per-user aggregates
    'user_log_n_questions_answered',
    'user_mean_accuracy',
    'user_boolean_mean_prior_question_had_explanation',
    'user_watched_lecture',
    # per-content aggregate and question metadata
    'content_mean_accuracy',
    'part',
    'num_tags',
    # timestamp-derived flag
    'user_engagement',
]

In [None]:
# Threshold for the user_engagement flag and fallback accuracies for ids that
# have no row in the feature tables.
one_month = 31536000000/12
val_content_mean_accuracy = content_df['content_mean_accuracy'].mean()
val_user_mean_accuracy = user_df['user_mean_accuracy'].mean()

In [None]:
# Score every batch served by the competition API and submit the predictions.
for (test_df, sample_prediction_df) in iter_test:
    # Attach the precomputed user / content / question features.
    test_df = test_df.merge(user_df, on = "user_id", how = "left")
    test_df = test_df.merge(content_df, on = "content_id", how = "left")
    test_df = test_df.merge(questions_df, on = "content_id", how = "left")

    # rename to the feature name listed in `features`
    test_df.rename(columns = {'user_n_questions_answered' : 'user_log_n_questions_answered'}, inplace=True)

    # user_engagement
    test_df['user_engagement'] = np.where(test_df.timestamp > one_month, False, True)

    # fill NaNs with numbers
    # Each filled column is assigned back: fillna(inplace=True) on test_df[...]
    # acts on a temporary under pandas Copy-on-Write and leaves NaNs in place.
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(0)
    test_df['content_mean_accuracy'] = test_df['content_mean_accuracy'].fillna(val_content_mean_accuracy)
    test_df['user_mean_accuracy'] = test_df['user_mean_accuracy'].fillna(val_user_mean_accuracy)
    # Cast explicitly so the column never stays object dtype after the fill
    # (newer pandas no longer downcasts object columns in fillna).
    test_df['user_watched_lecture'] = test_df['user_watched_lecture'].fillna(False).astype('bool')
    ### use 0 for <NA> instead of False
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(0).astype('bool')

#    test_df['prior_question_had_explanation'] = label_enc.fit_transform(test_df['prior_question_had_explanation'])

    # Predict for all rows, but submit only question rows (content_type_id == 0).
    test_df['answered_correctly'] =  clf.predict(test_df[features])
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])