# Combining and cleaning clickstream data

This notebook:

* Reads student and teacher clickstream data
* Combines each user with their school
* Groups data by school

The output is a dataframe ready for analysis, which is also saved to a CSV file.

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root of the data folder. The default is the original author's location; set
# the CLICKSTREAM_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell.
DATA_DIR = os.environ.get("CLICKSTREAM_DATA_DIR", r"C:\Users\Andris\Documents\GitHub\data")

# Inputs (raw per-user summaries and the user -> school lookup)
teacher_dir = os.path.join(DATA_DIR, "raw", "teacher_group.csv")
student_dir = os.path.join(DATA_DIR, "raw", "user_group.csv")
user_det_dir = os.path.join(DATA_DIR, "raw", "user_details.csv")

# Outputs (school-level clickstream table written by the last cell)
out_dir = os.path.join(DATA_DIR, "clean", "school_clk_data.csv")


In [2]:
# Load the summary of what each teacher has done and preview the first five rows
df_teacher = pd.read_csv(teacher_dir)
df_teacher.head()

Unnamed: 0,user_id,add_assig,add_custom_assig,add_user,create_group,play_video,q_lvl_0,q_lvl_0_cor,q_lvl_1,q_lvl_1_cor,...,q_lvl_3_cor,q_lvl_4,q_lvl_4_cor,q_lvl_5,q_lvl_5_cor,q_lvl_6,q_lvl_6_cor,view_assig_prog,view_concept,view_hint
0,00026a46eefde681a60b50bd7c2368c3091d035b2f5658...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0
1,001c6813b7449d381899502fb8f1bbbb9239a2b3d8b726...,0.0,20.0,42.0,0.0,13.0,5.0,12.0,4.0,1.0,...,0.0,4.0,1.0,0.0,0.0,0.0,0.0,58.0,12.0,39.0
2,00d7b52c5e872c5f09ffe70b1c0d5b99b79e4021b5b0c5...,0.0,8.0,23.0,2.0,0.0,4.0,2.0,5.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.0,1.0,8.0
3,00ffec02f911cf694e8a2eff69c896bc04929211ce2b6b...,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,10.0
4,010b04243304cf1a7f680b589ebbd3e2f9848f2019b5fc...,0.0,7.0,24.0,3.0,1.0,66.0,49.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,4.0,0.0


In [3]:
# Load the summary of what each student user did and preview the first five rows
df_user = pd.read_csv(student_dir)
df_user.head()

Unnamed: 0,user_id,play_video,q_lvl_0,q_lvl_0_cor,q_lvl_1,q_lvl_1_cor,q_lvl_2,q_lvl_2_cor,q_lvl_3,q_lvl_3_cor,q_lvl_4,q_lvl_4_cor,q_lvl_5,q_lvl_5_cor,q_lvl_6,q_lvl_6_cor,view_concept,view_hint
0,00007cec7712f365f0dc35daee6be4ba26add9bf5438a0...,0.0,11.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0001fbafbadf796136e7b11d6d156c485a4439498d052b...,0.0,105.0,152.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00030f81e0af68844c4cc08e45555b4106340992a35e71...,0.0,4.0,2.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,10.0
3,0009e551b420629563e2df12374f8af9cf2d757069ea87...,24.0,24.0,0.0,56.0,36.0,54.0,64.0,32.0,30.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,152.0
4,000a5508d1d06c4b273761be57ffeca66a98daafc1f23c...,0.0,28.0,36.0,0.0,4.0,8.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0


In [4]:
# Load the per-user details (role and school_id) and preview the first five rows
df_details = pd.read_csv(user_det_dir)
df_details.head()

Unnamed: 0,user_id,role,school_id
0,3a5f900dfffd16b753fed6ac7a96a9ebecd3bfe4eff086...,STUDENT,100977
1,bdf13ca931784bce247d636e55dcc34554a890c59cf4e7...,STUDENT,117648
2,fc2143ab6bbe533ffd45739f9422342e15d457eaf4a10d...,STUDENT,124881
3,e36407e0140fe10beaae42eb0e77ee751e14df1be77fc4...,STUDENT,136347
4,a812b8fa223bd3d44dce8c1cde5ad03002d6ccd0d88bf4...,TEACHER,137123


In [5]:
# -----------------------------------------
# CLEANING AND GROUPING
# -----------------------------------------

def _group_clicks_by_school(df_clicks, details_by_user):
    """Aggregate a per-user click summary into one row per school.

    Parameters
    ----------
    df_clicks : pandas.DataFrame
        Per-user summary with a 'user_id' column; all other columns are summed.
    details_by_user : pandas.DataFrame
        User details indexed by 'user_id'; must contain a 'school_id' column.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'school_id'. The 'user_id' column holds the number of users
        in the school; every other column of ``df_clicks`` holds the school total.
    """
    # Sum every activity column, but count 'user_id' so it becomes a head-count.
    # Key order follows df_clicks, which fixes the order of the output columns.
    agg_spec = {
        col: ('count' if col == 'user_id' else 'sum')
        for col in df_clicks.columns
    }

    # Left join on user_id: users with no matching details row get a NaN
    # school_id and therefore do not appear in any school group below.
    with_school = df_clicks.set_index('user_id').join(details_by_user)

    return with_school.reset_index().groupby('school_id').agg(agg_spec)


# Index the details once; both summaries are matched against the same lookup
details_by_user = df_details.set_index('user_id')

# Group both teacher and student clickstream data by school
df_tch_clk = _group_clicks_by_school(df_teacher, details_by_user)
df_user_clk = _group_clicks_by_school(df_user, details_by_user)

# Join the 2 dataframes together: columns present in both get a _tch / _std
# suffix, and the outer join keeps schools that appear in only one of them
df_clk = df_tch_clk.join(df_user_clk, lsuffix='_tch', rsuffix='_std', how='outer')

# Schools missing from one side have NaN there; treat that as zero activity
df_clk = df_clk.fillna(0)

# Save clickstream data to csv
df_clk.to_csv(out_dir)

# Show the first rows of the combined clickstream data
df_clk.head(5)

Unnamed: 0_level_0,q_lvl_2_cor_tch,view_concept_tch,q_lvl_3_cor_tch,q_lvl_2_tch,q_lvl_0_cor_tch,q_lvl_6_cor_tch,view_assig_prog,user_id_tch,view_hint_tch,add_user,...,q_lvl_3_std,user_id_std,view_hint_std,q_lvl_1_std,q_lvl_5_cor_std,q_lvl_4_cor_std,q_lvl_1_cor_std,view_concept_std,q_lvl_5_std,play_video_std
school_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,0.0,13.0,0.0,0.0,7.0,0.0,46.0,1.0,1.0,82.0,...,572.0,18.0,2158.0,36.0,21.0,70.0,40.0,88.0,35.0,497.0
-3,7.0,6.0,6.0,2.0,55.0,0.0,403.0,1.0,57.0,87.0,...,11.0,25.0,332.0,20.0,6.0,11.0,9.0,29.0,1.0,53.0
100001,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,170.0,4.0,463.0,6.0,9.0,32.0,48.0,26.0,9.0,41.0
100003,0.0,8.0,0.0,0.0,29.0,1.0,33.0,3.0,22.0,61.0,...,216.0,17.0,628.0,62.0,0.0,0.0,92.0,31.0,0.0,192.0
100050,0.0,3.0,0.0,4.0,5.0,0.0,0.0,2.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
