In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
# Print the full path of every file found under the competition input directory.
dataset_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input/riiid-test-answer-prediction')
    for name in names
)
for path in dataset_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import riiideducation

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
import lightgbm as lgb
from lightgbm import LGBMClassifier
import eli5



%matplotlib inline
# Red/blue seaborn palette, used as the colormap of the correlation heatmap.
colorMap1 = sns.color_palette("RdBu_r")
# Name of the reversed "Blues" colormap, intended for count plots and similar charts.
colorMap2 = 'Blues_r'

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

In [None]:
# Root of the competition dataset. Every input path is derived from this one
# absolute location so that no cell depends on the current working directory
# (previously one path was absolute and the other three were relative).
DATA_DIR = "/kaggle/input/riiid-test-answer-prediction"

train_path = DATA_DIR + "/train.csv"
questions_path = DATA_DIR + "/questions.csv"
lectures_path = DATA_DIR + "/lectures.csv"
# NOTE: despite its name, `test` holds the *path* of the example test file.
test = DATA_DIR + "/example_test.csv"

# **DATA EXPLORATION & EDA**

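We load only the first 10,123,033 rows of `train.csv` (roughly 10% of the full training file) to stay within the kernel's memory limit.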

In [None]:
%%time

# Load a prefix of the training file with explicit, compact dtypes.
# The nullable 'boolean' dtype lets prior_question_had_explanation carry missing values.
train = pd.read_csv(
    train_path,
    low_memory=False,
    nrows=10123033,  # only this many leading rows are read
    dtype=dict(
        row_id='int64',
        timestamp='int64',
        user_id='int32',
        content_id='int16',
        content_type_id='int8',
        task_container_id='int16',
        user_answer='int8',
        answered_correctly='int8',
        prior_question_elapsed_time='float32',
        prior_question_had_explanation='boolean',
    ),
)
train

In [None]:
# Report (rows, columns) of the loaded sample.
print("Train shape: {}".format(train.shape))

In [None]:
# Preview the first ten rows of the loaded sample.
train.head(10)

In [None]:
# Number of missing values per column before any cleaning.
print(train.isna().sum())

In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): presumably this also removes lecture rows (their prior_question_*
# fields are expected to be null) — confirm against the dataset description.
train=train.dropna()

In [None]:
# Shape after the incomplete rows were dropped.
print(train.shape)

In [None]:
# Re-check the per-column missing-value counts; after dropna() each should be zero.
print(train.isnull().sum())

In [None]:
# Memory footprint in bytes of the index and of each column.
train.memory_usage()

In [None]:
# row_id and timestamp are not used as features or grouping keys further down.
# Reassigning (instead of inplace=True) and ignoring already-missing columns
# keeps this cell safe to re-run without a KeyError.
train = train.drop(columns=['row_id', 'timestamp'], errors='ignore')

In [None]:
# Summary statistics of the numeric columns, rendered with a 'Blues' background gradient.
train.describe().style.background_gradient(cmap='Blues')

In [None]:
# Same summary statistics, transposed so that each column of `train` becomes a row.
train.describe().transpose()

In [None]:
# Summary statistics of the target column alone.
train['answered_correctly'].describe()

In [None]:
# answered_correctly is a discrete label, so draw one bar per class.
# sns.distplot is deprecated in seaborn and additionally fits a KDE, which is
# both slow on this many rows and not meaningful for a discrete target.
sns.countplot(x=train['answered_correctly']);

In [None]:
# Histogram of the target values drawn with pandas' built-in plotting.
train['answered_correctly'].hist()

In [None]:
# Columns included in the correlation analysis (target first).
feature = [
    'answered_correctly',
    'user_id',
    'content_id',
    'task_container_id',
    'user_answer',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
]
df = train.loc[:, feature]

In [None]:
# Correlation of every analysed column with the target, strongest positive first.
target_corr = df.corr()["answered_correctly"]
corr_with_target = target_corr.sort_values(ascending=False)

# Bar chart of those correlations; the target's self-correlation is left out.
plt.figure(figsize=(14,7))
corr_with_target.drop(labels="answered_correctly").plot.bar()
plt.show()

In [None]:
# Annotated heatmap of the full correlation matrix.
# Only one figure is created: the extra plt.figure(figsize=(25,20)) that used to
# precede this one was never drawn on and only produced an empty figure.
sns.set(font_scale=1)
plt.figure(figsize=(16,8))
sns.heatmap(df.corr(), annot=True, cmap=colorMap1);

In [None]:
# Keep the correlation matrix and list the target's correlations in descending order.
correlation_matrix = df.corr()
correlation_matrix.loc[:, "answered_correctly"].sort_values(ascending=False)

# Features

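Because of the RAM limit on Kaggle kernels, only a part of the data may be used for feature engineering. The sampling multiplier in the next cell is currently set to 1, so all rows are kept and only their order is shuffled.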

In [None]:
# Number of rows to draw; the multiplier is the kept fraction (1 keeps every row).
n = int(len(train) * 1)
# Fixed seed so the drawn rows (and their shuffled order) are reproducible.
train_n = train.sample(n=n, random_state=42)
n

In [None]:
# Per-user statistics of the target, indexed by user_id.
# NOTE(review): these are computed over all rows of `train`, including rows that
# later end up in the hold-out split, so hold-out scores are likely optimistic.
user_characteristics = (
    train.groupby('user_id')['answered_correctly']
    .agg(['mean', 'median', 'std', 'skew', 'count'])
    .rename(columns={
        'mean': 'mean_user_acc',
        'median': 'median_user_acc',
        'std': 'std_user_acc',
        'skew': 'skew_user_acc',
        'count': 'number_of_answered_q',
    })
)

In [None]:
# Preview the per-user statistics.
user_characteristics.head(5)

In [None]:
# Per-task-container statistics of the target, indexed by task_container_id.
# NOTE(review): computed over all rows of `train`, including future hold-out rows.
task_container_characteristics = (
    train.groupby('task_container_id')['answered_correctly']
    .agg(['mean', 'median', 'std', 'skew', 'count'])
    .rename(columns={
        'mean': 'mean_task_acc',
        'median': 'median_task_acc',
        'std': 'std_task_acc',
        'skew': 'skew_task_acc',
        'count': 'number_of_asked_task_containers',
    })
)

In [None]:
# Preview the per-task-container statistics.
task_container_characteristics.head(5)

In [None]:
# Per-content (question) statistics of the target, indexed by content_id.
# NOTE(review): computed over all rows of `train`, including future hold-out rows.
content_characteristics = (
    train.groupby('content_id')['answered_correctly']
    .agg(['mean', 'median', 'std', 'skew', 'count'])
    .rename(columns={
        'mean': 'mean_acc',
        'median': 'median_acc',
        'std': 'std_acc',
        'skew': 'skew_acc',
        'count': 'number_of_asked_q',
    })
)

In [None]:
# Preview the per-content statistics.
content_characteristics.head(5)

In [None]:
# Continue with the sampled frame. Rebinding is enough: train_n is deleted right
# away, so copying the whole frame first would only raise peak memory.
train = train_n
del train_n

In [None]:
# Left-join the per-user, per-task-container and per-content statistics onto every row.
train = (
    train
    .merge(user_characteristics, how='left', on='user_id')
    .merge(task_container_characteristics, how='left', on='task_container_id')
    .merge(content_characteristics, how='left', on='content_id')
)

# The target: answered_correctly

`answered_correctly` is our target: for each question we have to predict the probability that the answer is correct.

In [None]:
# Model inputs, in the column order used for both training and prediction.
features = [
    # columns taken directly from the raw data
    'prior_question_elapsed_time', 
    'prior_question_had_explanation',
    # per-user statistics (from user_characteristics)
    'mean_user_acc',
    'median_user_acc',
    'std_user_acc',
    'skew_user_acc',
    'number_of_answered_q',
    # per-task-container statistics (from task_container_characteristics)
    'mean_task_acc',
    'median_task_acc',
    'std_task_acc',
    'skew_task_acc',
    'number_of_asked_task_containers',
    # per-content statistics (from content_characteristics)
    'mean_acc',
    'median_acc',
    'std_acc',
    'skew_acc',
    'number_of_asked_q'
]

# Name of the label column.
target = 'answered_correctly'


Drop features that we are not going to use in our model

In [None]:
# Remove, in place, every column that is neither a model feature nor the target.
kept_columns = set(features + [target])
col_to_drop = {column for column in train.columns if column not in kept_columns}
train.drop(columns=list(col_to_drop), inplace=True)

In [None]:
# Treat a missing explanation flag as False and store it as a plain bool column.
explanation_flag = train['prior_question_had_explanation'].fillna(False)
train['prior_question_had_explanation'] = explanation_flag.astype(bool)
# Every remaining missing value (e.g. std/skew of very small groups) becomes 0.5.
train = train.fillna(0.5)

In [None]:
# Map +/-infinity to NaN and give those cells the same 0.5 placeholder as other gaps.
train = train.replace([np.inf, -np.inf], np.nan).fillna(0.5)

In [None]:
# Preview the cleaned feature table that is fed to the models.
train.head(5)

# Model

In [None]:
# Random 70/30 split into a training part and a hold-out part (fixed seed).
# NOTE(review): the aggregate features were computed before this split, from rows
# that include the hold-out part, so hold-out metrics are likely optimistic.
train_df, test_df, y_train, y_test = train_test_split(
    train[features],
    train[target],
    test_size=0.3,
    random_state=777,
)

1. **LGBMClassifier**

In [None]:
# Disabled randomized hyper-parameter search, kept for reference only.
# The values hard-coded in the `params` cell below all come from this grid.
# clf = LGBMClassifier(random_state=777)

#params = {
#   'n_estimators': [50, 150, 300],'max_depth': [3, 5, 10],'num_leaves': [5, 15, 30],'min_data_in_leaf': [5, 50, 100],
#    'feature_fraction': [0.1, 0.5, 1.],'lambda': [0., 0.5, 1.],
#}
#cv = RandomizedSearchCV(clf, param_distributions=params, cv=5, n_iter=50, verbose=2)
#cv.fit(train_df, y_train)

#print(cv.best_params_)
#print(cv.best_score_)

In [None]:
# LightGBM settings for LGBMClassifier(**params).
# The keys are the scikit-learn wrapper's own argument names rather than the
# LightGBM aliases (min_data_in_leaf / lambda / feature_fraction): the aliases
# configure the same parameters but collide with the wrapper's defaults and make
# LightGBM emit "... will be ignored" warnings.
params = {
    'num_leaves': 30,
    'n_estimators': 300,
    'min_child_samples': 100,   # alias of min_data_in_leaf
    'max_depth': 5,
    'reg_lambda': 0.0,          # alias of lambda / lambda_l2
    'colsample_bytree': 1.0,    # alias of feature_fraction
}

In [None]:
# Build the LightGBM classifier from `params` and fit it on the training split.
model = LGBMClassifier(**params)
model.fit(train_df, y_train)

In [None]:
# Score the hold-out split once and reuse the positive-class probabilities.
lgb_test_proba = model.predict_proba(test_df)[:, 1]
print('LGB ROC-AUC score: ', roc_auc_score(y_test.values, lgb_test_proba))
# accuracy1 is plotted in the "Accuracy" leaderboard next to the accuracy of the
# other models, so it has to hold an accuracy rather than the ROC-AUC.
accuracy1 = model.score(test_df, y_test)

In [None]:
# Show the fitted model's feature weights via eli5, limited to the top 20 entries.
eli5.show_weights(model, top=20)

In [None]:
# LightGBM's built-in feature-importance bar chart for the fitted model.
lgb.plot_importance(model)

2. **LogisticRegression**

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression baseline with scikit-learn's default settings.
# NOTE(review): the features are not scaled before fitting; check the fit output
# for convergence warnings.
logreg = LogisticRegression()

In [None]:
# Fit the baseline on the same training split as the LightGBM model.
logreg.fit(train_df, y_train)

Predicting the test set results and calculating the accuracy

In [None]:
# Hard class predictions on the hold-out split (reused by the cells below).
y_pred = logreg.predict(test_df)
# Compute the hold-out accuracy once and reuse it for both the report and the leaderboard.
accuracy2 = logreg.score(test_df, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(accuracy2))

In [None]:
from sklearn.metrics import confusion_matrix
# Store the result under its own name: rebinding `confusion_matrix` to the array
# would shadow the sklearn function and break any later call or a re-run of this cell.
logreg_confusion = confusion_matrix(y_test, y_pred)
print(logreg_confusion)

**The confusion matrix shows 208239 + 1844304 = 2052543 correct predictions and 804476 + 108660 = 913136 incorrect predictions.**

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix of the logistic-regression predictions, with rows and columns
# ordered by the classifier's own class labels, drawn as a labelled grid.
cm = confusion_matrix(y_test, y_pred, labels=logreg.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=logreg.classes_)
disp.plot() 

In [None]:
from sklearn.metrics import classification_report
# Text report of per-class metrics for the logistic-regression predictions on the hold-out split.
print(classification_report(y_test, y_pred))

**The support is the number of occurrences of each class in y_test.**

ROC Curve

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# ROC-AUC is a ranking metric and must be computed from scores, not from hard
# 0/1 predictions. Use the same positive-class probabilities for the AUC and the
# curve so the area shown in the legend matches the plotted line.
logreg_test_proba = logreg.predict_proba(test_df)[:, 1]
logit_roc_auc = roc_auc_score(y_test, logreg_test_proba)
fpr, tpr, thresholds = roc_curve(y_test, logreg_test_proba)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line of a purely random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): the figure is written to 'Log_ROC' in the working directory.
plt.savefig('Log_ROC')
plt.show()

The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers. The dotted line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner).

3. **RandomForestClassifier**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Shallow random forest (depth 2, fixed seed) fitted on the training split.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(train_df, y_train)
# Hard class predictions on the hold-out split, reused by the cells below.
clf_pred = clf.predict(test_df)
print(f'Accuracy of Random Forest Classifier on test set: {clf.score(test_df, y_test):.2f}')

In [None]:
# Hold-out accuracy of the random forest, kept for the leaderboard chart.
accuracy3= clf.score(test_df, y_test)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
# Confusion matrix of the random-forest predictions, with rows and columns
# ordered by the classifier's own class labels, drawn as a labelled grid.
cm = confusion_matrix(y_test, clf_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=clf.classes_)
disp.plot() 

In [None]:
# Store the result under its own name: rebinding `confusion_matrix` to the array
# would shadow the sklearn function, so re-running this cell (or calling the
# function again later) would fail.
rf_confusion = confusion_matrix(y_test, clf_pred)
print(rf_confusion)

2106652 correct and 859027 incorrect predictions.

In [None]:
# ROC-AUC is a ranking metric and must be computed from scores, not from hard
# 0/1 predictions. Use the same positive-class probabilities for the AUC and the
# curve so the area shown in the legend matches the plotted line.
clf_test_proba = clf.predict_proba(test_df)[:, 1]
clf_roc_auc = roc_auc_score(y_test, clf_test_proba)
fpr, tpr, thresholds = roc_curve(y_test, clf_test_proba)
plt.figure()
plt.plot(fpr, tpr, label='Random Forest Classifier(area = %0.2f)' % clf_roc_auc)
# Diagonal reference line of a purely random classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show(): the figure is written to 'clf_ROC' in the working directory.
plt.savefig('clf_ROC')
plt.show()

In [None]:
# Every bar must show the same metric. LightGBM's hold-out accuracy is computed
# directly from the model here so that it is comparable with the accuracies of
# the other two models, independently of the value reported with its ROC-AUC.
lgbm_accuracy = model.score(test_df, y_test)
leaderboard = pd.DataFrame(
    {'Accuracy': [lgbm_accuracy, accuracy2, accuracy3]},
    index=['LGBMClassifier', ' Logistic Regression', 'Random Forest'],
)
fig_dims = (15, 8)

# Horizontal bar chart: one bar per model, bar length = hold-out accuracy.
fig, a = plt.subplots(figsize=fig_dims)
ax = sns.barplot(x="Accuracy", y=leaderboard.index, data=leaderboard, ax=a)

# SUBMISSION 

In [None]:
# Create the competition environment object used to fetch test batches and hand back predictions.
env = riiideducation.make_env()

In [None]:
# Iterator over the test data; each item is unpacked below as a (test frame, sample prediction frame) pair.
iter_test = env.iter_test()

In [None]:
# Predict every test batch with the LightGBM model and hand the result back to
# the competition environment. The batch gets its own name so that the hold-out
# split `test_df` used by the evaluation cells is not overwritten.
for (test_batch, sample_prediction_df) in iter_test:
    # Attach the pre-computed per-user / per-task-container / per-content statistics.
    # Keys unseen during training produce NaN, which is filled below.
    test_batch = (
        test_batch
        .merge(user_characteristics, on="user_id", how="left")
        .merge(task_container_characteristics, on="task_container_id", how="left")
        .merge(content_characteristics, on="content_id", how="left")
    )

    # Same cleaning as for the training data: missing explanation flag -> False,
    # infinite / missing feature values -> 0.5. Only the model inputs are touched,
    # so unrelated columns of the batch keep their original content.
    test_batch['prior_question_had_explanation'] = (
        test_batch['prior_question_had_explanation'].fillna(value=False).astype(bool)
    )
    test_batch[features] = (
        test_batch[features].replace([np.inf, -np.inf], np.nan).fillna(0.5)
    )

    # Probability of the positive class for every row of the batch.
    test_batch['answered_correctly'] = model.predict_proba(test_batch[features])[:, 1]
    # Only rows with content_type_id == 0 are passed on, as row_id / answered_correctly pairs.
    env.predict(test_batch.loc[test_batch['content_type_id'] == 0, ['row_id', 'answered_correctly']])