In [58]:
import pandas as pd
import numpy as np
import plotly.express as px
import nbformat

# Load data

TRAIN.CSV

- row_id: (int64) ID code for the row.
- timestamp: (int64) the time in milliseconds between this user interaction and the first event completion from that user.
user_id: (int32) ID code for the user.
- content_id: (int16) ID code for the user interaction
- content_type_id: (int8) 0 if the event was a question being posed to the user, 1 if the event was the user watching a lecture.
- task_container_id: (int16) Id code for the batch of questions or lectures. For example, a user might see three questions in a row before seeing the explanations for any of them. Those three would all share a task_container_id.
- user_answer: (int8) the user's answer to the question, if any. Read -1 as null, for lectures.
- answered_correctly: (int8) if the user responded correctly. Read -1 as null, for lectures.
- prior_question_elapsed_time: (float32) The average time in milliseconds it took a user to answer each question in the previous question bundle, ignoring any lectures in between. Is null for a user's first question bundle or lecture. Note that the time is the average time a user took to solve each question in the previous bundle.
- prior_question_had_explanation: (bool) Whether or not the user saw an explanation and the correct response(s) after answering the previous question bundle, ignoring any lectures in between. The value is shared across a single question bundle, and is null for a user's first question bundle or lecture. Typically the first several questions a user sees were part of an onboarding diagnostic test where they did not get any feedback.

In [22]:
# Define the data types for each column as int8
dtype_train_dict = {
    'row_id': 'int8',
    'timestamp': 'int8',
    'user_id': 'int8',
    'content_id': 'int8',
    'content_type_id': 'int8',
    'task_container_id': 'int8',
    'user_answer': 'int8',
    'answered_correctly': 'int8'
}

# Read the CSV file with specified dtypes
train = pd.read_csv('train.csv', dtype=dtype_train_dict)

In [23]:
train.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101230332 entries, 0 to 101230331
Data columns (total 10 columns):
 #   Column                          Dtype  
---  ------                          -----  
 0   row_id                          int8   
 1   timestamp                       int8   
 2   user_id                         int8   
 3   content_id                      int8   
 4   content_type_id                 int8   
 5   task_container_id               int8   
 6   user_answer                     int8   
 7   answered_correctly              int8   
 8   prior_question_elapsed_time     float64
 9   prior_question_had_explanation  object 
dtypes: float64(1), int8(8), object(1)
memory usage: 4.9 GB


In [24]:
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,60,0,1,3,1,,
1,1,111,115,84,0,2,2,1,37000.0,False
2,2,91,115,-128,0,0,0,1,55000.0,False
3,3,95,115,-76,0,3,0,1,19000.0,False
4,4,-19,115,-14,0,4,1,1,11000.0,False


In [25]:
train.describe()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time
count,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,101230300.0,98878790.0
mean,-0.4999999,-0.5159859,-0.8548383,0.1489695,0.01935222,6.315211,1.376123,0.6251644,25423.81
std,73.90027,73.76007,73.65767,72.95121,0.1377596,70.57228,1.192896,0.5225307,19948.15
min,-128.0,-128.0,-128.0,-128.0,0.0,-128.0,-1.0,-1.0,0.0
25%,-65.0,-64.0,-65.0,-62.0,0.0,-52.0,0.0,0.0,16000.0
50%,0.0,0.0,-1.0,3.0,0.0,13.0,1.0,1.0,21000.0
75%,64.0,63.0,63.0,62.0,0.0,63.0,3.0,1.0,29666.0
max,127.0,127.0,127.0,127.0,1.0,127.0,3.0,1.0,300000.0


QUESTIONS.CSV: METADATA FOR THE QUESTIONS POSED TO USERS.

- question_id: foreign key for the train/test content_id column, when the content type is question (0).
- bundle_id: code for which questions are served together.
- correct_answer: the answer to the question. Can be compared with the train user_answer column to check if the user was right.
- part: the relevant section of the TOEIC test.
- tags: one or more detailed tag codes for the question. The meaning of the tags will not be provided, but these codes are sufficient for clustering the questions together.

In [39]:
# Define the data types for each column as int8
dtype_questions_dict = {
    'question_id': 'int8',
    'bundle_id': 'int8',
    'correct_answer': 'int8',
    'part': 'int8',
}

# Read the CSV file with specified dtypes
questions = pd.read_csv('questions.csv', dtype=dtype_questions_dict)

In [40]:
questions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   question_id     13523 non-null  int8  
 1   bundle_id       13523 non-null  int8  
 2   correct_answer  13523 non-null  int8  
 3   part            13523 non-null  int8  
 4   tags            13522 non-null  object
dtypes: int8(4), object(1)
memory usage: 786.4 KB


In [41]:
questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


LECTURES.CSV: METADATA FOR THE LECTURES WATCHED BY USERS AS THEY PROGRESS IN THEIR EDUCATION.

- lecture_id: foreign key for the train/test content_id column, when the content type is lecture (1).
- part: top level category code for the lecture.
- tag: one tag codes for the lecture. The meaning of the tags will not be provided, but these codes are sufficient for clustering the lectures together.
- type_of: brief description of the core purpose of the lecture.

In [45]:
# Define the data types for each column as int8
dtype_questions_dict = {
    'lecture_id': 'int8',
    'tag': 'int8',
    'part': 'int8'
}

# Read the CSV file with specified dtypes
lectures = pd.read_csv('lectures.csv', dtype=dtype_questions_dict)

In [46]:
lectures.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   lecture_id  418 non-null    int8  
 1   tag         418 non-null    int8  
 2   part        418 non-null    int8  
 3   type_of     418 non-null    object
dtypes: int8(3), object(1)
memory usage: 25.9 KB


In [47]:
lectures.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,-97,5,concept
1,100,70,1,concept
2,-71,45,6,concept
3,-64,79,5,solving question
4,61,-100,5,solving question


In [48]:
# Join tables

In [50]:
# Handling missing values for `prior_question_elapsed_time` and `prior_question_had_explanation`
train['prior_question_elapsed_time'].fillna(train['prior_question_elapsed_time'].mean(), inplace=True)
train['prior_question_had_explanation'].fillna(False, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['prior_question_elapsed_time'].fillna(train['prior_question_elapsed_time'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['prior_question_had_explanation'].fillna(False, inplace=True)
  train['prior_question_had_explanation'].fillna(False, inpla

In [53]:
# Separate content into questions and lectures
questions_train_merged = train[train['content_type_id'] == 0]

Analysis 1: Impact of Prior Explanation on Correct Answer Rate

In [54]:
explanation_group = questions_train_merged.groupby('prior_question_had_explanation')['answered_correctly'].mean().reset_index()

In [59]:
fig1 = px.bar(
    explanation_group,
    x='prior_question_had_explanation',
    y='answered_correctly',
    title='Impact of Prior Explanation on Correct Answer Rate',
    labels={'prior_question_had_explanation': 'Prior Question Had Explanation', 'answered_correctly': 'Average Correct Answer Rate'},
    text='answered_correctly'
)
fig1.update_traces(texttemplate='%{text:.2f}', textposition='outside')
fig1.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [26]:
questions = pd.read_csv('questions.csv')

In [27]:
questions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13523 entries, 0 to 13522
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   question_id     13523 non-null  int64 
 1   bundle_id       13523 non-null  int64 
 2   correct_answer  13523 non-null  int64 
 3   part            13523 non-null  int64 
 4   tags            13522 non-null  object
dtypes: int64(4), object(1)
memory usage: 1.1 MB
