# Combining and cleaning clickstream data

This notebook:

* Reads student and teacher clickstream data
* Combines each user with their school
* Groups data by school

The output is a dataframe ready for analysis, which is also saved to a CSV file.

In [13]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Root of the data tree. Defaults to the original author's checkout; set the
# CLICKSTREAM_DATA_DIR environment variable to run the notebook on another
# machine without editing this cell. An unset or empty variable falls back to
# the default.
data_root = (
    os.environ.get("CLICKSTREAM_DATA_DIR")
    or r"C:\Users\Andris\Documents\GitHub\data"
)

# Inputs
# NOTE: despite the `_dir` suffix, each of these names holds the path of a
# single CSV file (kept as plain `str`), not a directory.
teacher_dir = os.path.join(data_root, "raw", "teacher_group.csv")   # per-teacher click counts
student_dir = os.path.join(data_root, "raw", "user_group.csv")      # per-student click counts
user_det_dir = os.path.join(data_root, "raw", "user_details.csv")   # user_id -> role, school_id

# Outputs
out_dir = os.path.join(data_root, "clean", "school_clk_data.csv")   # per-school table written by this notebook


In [7]:
# Load the per-teacher summary of clickstream activity (one row per user_id)
# and preview the first five rows.
df_teacher = pd.read_csv(teacher_dir)
df_teacher.head()

Unnamed: 0,user_id,add_assig,add_custom_assig,add_user,create_group,play_video,q_lvl_0,q_lvl_0_cor,q_lvl_1,q_lvl_1_cor,...,q_lvl_3_cor,q_lvl_4,q_lvl_4_cor,q_lvl_5,q_lvl_5_cor,q_lvl_6,q_lvl_6_cor,view_assig_prog,view_concept,view_hint
0,00026a46eefde681a60b50bd7c2368c3091d035b2f5658...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0
1,0018d184ff3b1f2336ebeac5646d13ca84de10d034adc5...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0
2,001c6813b7449d381899502fb8f1bbbb9239a2b3d8b726...,0.0,62.0,158.0,2.0,0.0,0.0,2.0,0.0,0.0,...,1.0,4.0,1.0,0.0,0.0,0.0,0.0,65.0,2.0,27.0
3,001ee4c077e3972984ebddee5d8928ea3be07031025d61...,4.0,8.0,53.0,3.0,2.0,58.0,2.0,4.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,1.0,20.0
4,0037979dda749ad521e81ac59c594706d4ab73f5300379...,0.0,2.0,12.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0


In [9]:
# Load the per-student summary of clickstream activity and preview the first
# five rows.
df_user = pd.read_csv(student_dir)
df_user.head()

Unnamed: 0,user_id,play_video,q_lvl_0,q_lvl_0_cor,q_lvl_1,q_lvl_1_cor,q_lvl_2,q_lvl_2_cor,q_lvl_3,q_lvl_3_cor,q_lvl_4,q_lvl_4_cor,q_lvl_5,q_lvl_5_cor,q_lvl_6,q_lvl_6_cor,view_concept,view_hint
0,00007cec7712f365f0dc35daee6be4ba26add9bf5438a0...,0.0,11.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00011e0373cefdf1a82a6ceced473cecc3390c009adef9...,0.0,12.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0001fbafbadf796136e7b11d6d156c485a4439498d052b...,0.0,105.0,152.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00030f81e0af68844c4cc08e45555b4106340992a35e71...,0.0,4.0,2.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,10.0
4,000569c7208305f6f8105a065cfada9aa58a25877f10b2...,3.0,37.0,10.0,21.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0


In [11]:
# Load the details of each user (role and school_id per user_id) and preview
# the first five rows.
df_details = pd.read_csv(user_det_dir)
df_details.head()

Unnamed: 0,user_id,role,school_id
0,3a5f900dfffd16b753fed6ac7a96a9ebecd3bfe4eff086...,STUDENT,100977
1,bdf13ca931784bce247d636e55dcc34554a890c59cf4e7...,STUDENT,117648
2,fc2143ab6bbe533ffd45739f9422342e15d457eaf4a10d...,STUDENT,124881
3,e36407e0140fe10beaae42eb0e77ee751e14df1be77fc4...,STUDENT,136347
4,a812b8fa223bd3d44dce8c1cde5ad03002d6ccd0d88bf4...,TEACHER,137123


In [14]:
# -----------------------------------------
# CLEANING AND GROUPING
# -----------------------------------------

# Per-school aggregation rules: every column of the raw frame is summed,
# except user_id, which is counted instead.
a = dict.fromkeys(df_teacher.keys(), 'sum')   # rules for the teacher frame
a['user_id'] = 'count'
b = dict.fromkeys(df_user.keys(), 'sum')      # rules for the student frame
b['user_id'] = 'count'

# User details keyed by user_id, shared by both joins below
details_by_user = df_details.set_index('user_id')

# Attach each user's details (which carry school_id) to their clickstream row,
# then collapse to one row per school. reset_index() turns user_id back into a
# regular column so that it can be counted by the aggregation.
df_tch_clk = (
    df_teacher.set_index('user_id')
    .join(details_by_user)
    .reset_index()
    .groupby('school_id')
    .agg(a)
)
df_user_clk = (
    df_user.set_index('user_id')
    .join(details_by_user)
    .reset_index()
    .groupby('school_id')
    .agg(b)
)

# Combine teacher and student totals on school_id. The outer join keeps schools
# present on only one side; columns that exist in both frames are suffixed
# _tch / _std, and the gaps left by the outer join are filled with 0.
df_clk = (
    df_tch_clk
    .join(df_user_clk, how='outer', lsuffix='_tch', rsuffix='_std')
    .fillna(0)
)

# Persist the per-school clickstream table (school_id is written as the index)
df_clk.to_csv(out_dir)

# Preview the first five schools
df_clk.head()

Unnamed: 0_level_0,add_assig,q_lvl_0_tch,create_group,q_lvl_3_tch,view_concept_tch,play_video_tch,q_lvl_5_cor_tch,add_custom_assig,q_lvl_0_cor_tch,q_lvl_4_cor_tch,...,q_lvl_3_std,play_video_std,q_lvl_5_cor_std,q_lvl_1_std,q_lvl_2_std,q_lvl_4_cor_std,q_lvl_1_cor_std,user_id_std,q_lvl_2_cor_std,q_lvl_6_std
school_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-1,33.0,94.0,13.0,111.0,65.0,914.0,57.0,254.0,66.0,180.0,...,1530.0,1502.0,250.0,230.0,668.0,600.0,233.0,92.0,390.0,47.0
-2,0.0,14.0,2.0,0.0,11.0,4.0,0.0,36.0,49.0,1.0,...,0.0,6.0,0.0,0.0,15.0,4.0,0.0,11.0,0.0,0.0
-3,6.0,71.0,7.0,17.0,7.0,1.0,0.0,325.0,138.0,3.0,...,41.0,145.0,8.0,28.0,187.0,17.0,11.0,68.0,29.0,0.0
-4,3.0,19.0,4.0,0.0,5.0,1.0,0.0,6.0,14.0,0.0,...,4.0,0.0,0.0,2.0,27.0,4.0,2.0,5.0,14.0,0.0
100001,4.0,80.0,6.0,9.0,20.0,3.0,4.0,10.0,106.0,4.0,...,729.0,1545.0,28.0,124.0,514.0,144.0,170.0,39.0,508.0,0.0
