In [0]:
# %% [code]

import numpy as np
import pandas as pd

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go

from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

import optuna
from optuna.samplers import TPESampler

import gc
from sklearn.model_selection import train_test_split
import riiideducation

# True when running inside a Kaggle kernel, False for a local checkout;
# decides which data paths are used and whether the competition env is created.
kaggle = True

# %% [markdown]
# # Select data locations (Kaggle or local)

# %% [code]


# Resolve where the CSV files live. Only the Kaggle branch creates the
# competition environment object (`env`) that the prediction loop uses later.
if not kaggle:
    train_location = 'data/train.csv'
    question_location = 'data/questions.csv'
else:
    import riiideducation
    env = riiideducation.make_env()
    train_location = '/kaggle/input/riiid-test-answer-prediction/train.csv'
    question_location = '/kaggle/input/riiid-test-answer-prediction/questions.csv'

# %% [markdown]
# # Load the training and question data

# %% [code]
# Column -> dtype map for the training CSV. Its keys also select which columns
# are read; narrow dtypes keep the resulting frame small in memory.
used_data_types_dict = {
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'answered_correctly': 'int8',
    # float32 instead of float16: float16 cannot represent anything above
    # 65504, so larger elapsed times would be stored as inf and later be
    # overwritten by the inf -> NaN -> 0.5 clean-up, while inference would
    # still see the raw values.
    # NOTE(review): the inf handling further down suggests such values do
    # occur in the data - confirm against the dataset description.
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
}

# %% [code]

# Read only the columns named in the dtype map, typed as declared there.
# index_col=0 turns the first of the parsed columns into the index.
read_options = dict(
    usecols=used_data_types_dict.keys(),
    dtype=used_data_types_dict,
    index_col=0,
)
train_df = pd.read_csv(train_location, **read_options)

# %% [code]
# Drop rows whose label is -1.
# NOTE(review): presumably -1 marks rows without a gradable answer - confirm.
train_df = train_df.loc[train_df['answered_correctly'].ne(-1)]

# %% [code]
# Question metadata, read with pandas' default dtypes.
question_df = pd.read_csv(filepath_or_buffer=question_location)

# %% [code]

# Per-user summary statistics of the answered_correctly label.
grouped_by_user_df = train_df.groupby('user_id')
user_answers_df = grouped_by_user_df.agg(
    {'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}
).copy()
user_answers_df.columns = [
    'mean_user_accuracy',
    'questions_answered',
    'std_user_accuracy',
    'median_user_accuracy',
    'skew_user_accuracy',
]

# Shrink the statistics to compact dtypes; the median is stored as an integer.
user_float_columns = ["mean_user_accuracy", "std_user_accuracy", "skew_user_accuracy"]
user_answers_df["questions_answered"] = user_answers_df["questions_answered"].astype('int16')
user_answers_df[user_float_columns] = user_answers_df[user_float_columns].astype('float32')
user_answers_df["median_user_accuracy"] = user_answers_df["median_user_accuracy"].astype('int_')
print(user_answers_df.dtypes)
user_answers_df

# %% [code]
# The groupby object is no longer needed once the aggregates exist.
del grouped_by_user_df
gc.collect()

# %% [code]
# Per-content summary statistics of the answered_correctly label.
grouped_by_content_df = train_df.groupby('content_id')
content_answers_df = grouped_by_content_df.agg(
    {'answered_correctly': ['mean', 'count', 'std', 'median', 'skew']}
).copy()
content_answers_df.columns = [
    'mean_accuracy',
    'question_asked',
    'std_accuracy',
    'median_accuracy',
    'skew_accuracy',
]

# Shrink the statistics to compact dtypes; the median is stored as an integer.
content_float_columns = ["mean_accuracy", "std_accuracy", "skew_accuracy"]
content_answers_df["question_asked"] = content_answers_df["question_asked"].astype('int32')
content_answers_df[content_float_columns] = content_answers_df[content_float_columns].astype('float32')
content_answers_df["median_accuracy"] = content_answers_df["median_accuracy"].astype('int_')
content_answers_df

# %% [code]
# The groupby object is no longer needed once the aggregates exist.
del grouped_by_content_df
gc.collect()

# %% [code]
# Attach the per-user statistics to every training row.
train_df = pd.merge(train_df, user_answers_df, how='left', on='user_id')
gc.collect()

# %% [code]
# Print a compact summary (including deep memory usage) of each frame in play.
for frame in (train_df, question_df, user_answers_df, content_answers_df):
    frame.info(verbose=False, memory_usage="deep")

# %% [code]
gc.collect(0)

# %% [code]
# Attach the per-content statistics to every training row.
train_df = pd.merge(train_df, content_answers_df, how='left', on='content_id')
gc.collect()

# %% [code]
# Fraction of missing entries per column, before any filling.
print('Part of missing values for every column')
missing_share = train_df.isnull().sum() / len(train_df)
print(missing_share)

# %% [code]
# A missing explanation flag is treated as "no explanation"; the nullable
# column is then converted to a plain bool.
train_df['prior_question_had_explanation'] = train_df['prior_question_had_explanation'].fillna(value=False).astype(bool)

# Turn +/-inf into NaN first, so that a single fillna pass replaces both the
# infinities and the genuinely missing values with 0.5. This yields the same
# frame as fill -> replace -> fill, with one fewer full copy of the data.
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0.5)

# %% [code]
# Fraction of missing entries per column, re-checked after filling.
print('Part of missing values for every column')
missing_share = train_df.isnull().sum() / len(train_df)
print(missing_share)

# %% [code]
# Ask pandas to shrink each column to the smallest integer dtype that can
# hold its values; columns it cannot downcast come back unchanged.
for column_name in train_df.columns:
    train_df[column_name] = pd.to_numeric(train_df[column_name], downcast='integer')
gc.collect(0)

# %% [code]
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
# Model used for the predictions. Other estimators are left commented out:
#regressor = RandomForestRegressor(n_estimators=5, max_depth=5, random_state=0)
#regressor = linear_model.Ridge(alpha=.5)
regressor = linear_model.LinearRegression()

# Feature columns, in the order they are handed to the model.
Xcol = [
    'mean_user_accuracy', 'questions_answered', 'std_user_accuracy',
    'median_user_accuracy', 'skew_user_accuracy',
    'mean_accuracy', 'question_asked', 'std_accuracy', 'median_accuracy',
    'prior_question_elapsed_time', 'prior_question_had_explanation',
    'skew_accuracy',
]

# Label column the model is fitted against.
target = 'answered_correctly'

# %% [code]
# Keep only the feature columns plus the label; everything else is dropped.
train_df = train_df[[*Xcol, target]]

# %% [code]
train_df.info(memory_usage="deep")

# Report mean and total memory (in MB) of the float, int and object columns.
BYTES_PER_MB = 1024 ** 2
for dtype_name in ['float', 'int', 'object']:
    selected_dtype = train_df.select_dtypes(include=[dtype_name])
    usage_bytes = selected_dtype.memory_usage(deep=True)
    mean_usage_mb = usage_bytes.mean() / BYTES_PER_MB
    sum_usage_mb = usage_bytes.sum() / BYTES_PER_MB
    print("Average memory usage for {} columns: {:03.2f} MB, sum is {}".format(dtype_name, mean_usage_mb, sum_usage_mb))

# %% [code]
gc.collect()

# %% [code]
#X_train, X_test, y_train, y_test = train_test_split(train_df[Xcol], train_df[target], test_size=0.1, random_state=0)



# Positional 90/10 split: the first nine tenths of the rows go to features_df,
# the remaining tenth stays in train_df and is what the model is fitted on.
split_at = int(9 / 10 * len(train_df))
features_df = train_df.iloc[:split_at]
train_df = train_df.iloc[split_at:]

# %% [code]


# Replace any +/-inf by NaN and fill the gaps with 0.5 on the slice that is
# used for fitting.
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0.5)

# %% [code]
# Fit the model on the feature columns against the label column.
regressor.fit(X=train_df[Xcol], y=train_df[target])

# %% [code]
# Hold-out evaluation. X_test / y_test are built here because nothing earlier
# in the notebook defines them (the train_test_split call is commented out).
# The model was fitted on train_df (the last tenth of the rows), so the rows
# in features_df were not used for fitting and can serve as the test set.
# A fixed-seed random sample keeps the prediction step cheap and repeatable.
# NOTE: the per-user / per-content statistics were computed over all rows,
# including these, so the scores are likely optimistic.
EVAL_SAMPLE_SIZE = 1_000_000
eval_df = features_df.sample(n=min(EVAL_SAMPLE_SIZE, len(features_df)), random_state=0)
X_test = eval_df[Xcol]
y_test = eval_df[target]
del eval_df

y_pred = regressor.predict(X_test)
mean_squared_error = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error)
print('Root Mean Squared Error:', np.sqrt(mean_squared_error))

# %% [code]
# Release the fitting slice; the per-user and per-content tables are kept
# because the inference loop below still merges them into each test batch.
del train_df
gc.collect(0)

# %% [code]
# Iterator over the test batches. `env` only exists when `kaggle` is True.
iter_test = env.iter_test()

# %% [code]


for (test_df, sample_prediction_df) in iter_test:
    # Only rows with content_type_id == 0 are scored and submitted.
    test_df = test_df.loc[test_df['content_type_id'] == 0]

    # Attach the statistics learned from the training data; users or contents
    # not present in those tables get NaN here and are filled below.
    test_df = test_df.merge(user_answers_df, how='left', on='user_id')
    test_df = test_df.merge(content_answers_df, how='left', on='content_id')
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value=False).astype(bool)

    # Same clean-up as for training (+/-inf -> NaN -> 0.5), restricted to the
    # model inputs. Casting to float64 gives the model one homogeneous numeric
    # frame instead of an object array built from mixed bool/float columns.
    features = (
        test_df[Xcol]
        .astype('float64')
        .replace([np.inf, -np.inf], np.nan)
        .fillna(0.5)
    )

    # A batch without any scorable row would make predict() raise on an empty
    # input, so submit an empty prediction column in that case.
    if features.empty:
        predictions = np.empty(0, dtype='float64')
    else:
        predictions = regressor.predict(features)
    test_df['answered_correctly'] = predictions

    # Submit exactly once per batch; rows were already filtered above.
    env.predict(test_df[['row_id', 'answered_correctly']])

# No extra env.predict() call after the loop: every batch, including the last
# one, has already been submitted inside the loop.

# %% [code]
