# Cleaning and organising external data

Using raw csv files of exam results and external indices for each school.

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import ttest_ind,pearsonr
from helper_files import *

In [10]:
# ------------------------------------------------------
# JOIN EXTERNAL DATA WITH CLICKSTREAM DATA
# ------------------------------------------------------

external_csv = 'data/clean/school_ext_data.csv'
internal_csv = 'data/clean/school_clk_data.csv'

# Read external data (indices and exam results)
df_ext = pd.read_csv(external_csv)

# Read internal data (clickstream data)
df_int = pd.read_csv(internal_csv)

# Combine the 2 dataframes on the school identifier.
# read_csv infers dtypes per file, so the two key columns are both
# normalised to strings: an index join between a str key and an int key
# raises no error, it just matches nothing.
df_ext['URN'] = df_ext['URN'].astype('str')
clk_by_school = df_int.set_index('school_id')
clk_by_school.index = clk_by_school.index.astype('str')

# Left join: every row of the external data is kept; rows with no matching
# clickstream record get NaN in the clickstream columns.
df = df_ext.set_index('URN').join(clk_by_school, how='left')

# ------------------------------------------------------
# ADDING NEW FEATURES AND INDICES
# ------------------------------------------------------

def _per_user(event_totals, user_counts):
    """Return event_totals / user_counts, with 0 wherever the ratio is undefined.

    A zero user count is masked to NaN before dividing, so both 0/0 and n/0
    end up as 0 instead of NaN / inf. Rows where either operand is missing
    also become 0.
    """
    return (event_totals / user_counts.replace(0, np.nan)).fillna(0)


# Computing improvement in exam scores:
# the 2015 -> 2016 change per subject, centred on the mean change taken over
# every row currently in df.

qu = ['Biology','Mathematics','Chemistry','Physics']

for item in qu:
    score_change = df['exam_score|2016_'+item] - df['exam_score|2015_'+item]
    df['exam_improv_' + item] = score_change - score_change.mean()

# Teacher activity: row-wise sum of the matching columns, per teacher user
df['teacher_activity'] = _per_user(
    df.filter(regex='q_lvl_[0-6]_tch|_cor_tch').sum(axis=1),
    df['user_id_tch'],
)
df['teacher_other'] = _per_user(
    df[['add_user','create_group','add_custom_assig','add_assig']].sum(axis=1),
    df['user_id_tch'],
)

# Student activity: row-wise sum of the matching columns, per student user
df['student_activity'] = _per_user(
    df.filter(regex='q_lvl_[0-6]_std|_cor_std').sum(axis=1),
    df['user_id_std'],
)
df['student_other'] = _per_user(
    df[['play_video_std','view_concept_std']].sum(axis=1),
    df['user_id_std'],
)

# ------------------------------------------------------
# CLEANING THE DATA
# ------------------------------------------------------

# Schools without an IDACI rating are currently kept (filter disabled):
#df = df[pd.notnull(df['IDACI'])]

# Replace every remaining NaN with 0. This applies to ALL columns, so besides
# undefined clickstream data, missing external indices (e.g. IDACI) and
# undefined exam improvements also become 0.
df = df.fillna(0)

# Keep only the schools that had entries in all four subjects in both
# 2016 and 2015
entry_cols = [
    'Entries|' + year + '_' + subject
    for year in ('2016', '2015')
    for subject in ('Biology', 'Physics', 'Chemistry', 'Mathematics')
]
has_all_entries = (df[entry_cols] > 0).all(axis=1)
df = df[has_all_entries]

# Remove any unnecessary features (raw entry counts and raw exam scores;
# the derived exam_improv_* columns do not match the pattern and are kept)
raw_cols = df.filter(regex='Entries|exam_score').columns
df = df.drop(columns=raw_cols)

# FURTHER CLEANING

# Report the size of the cleaned dataframe (rows = schools, columns = features)
n_schools, n_features = df.shape
print('Total number of schools:', n_schools)
print('Total number of features:', n_features)

# Show the first 5 rows of the dataframe
df.head()


Total number of schools: 1125
Total number of features: 54


Unnamed: 0_level_0,IDACI,Effectiveness,Teach_quality,L_M_index,PercentageFSM,user_id_tch,add_assig,add_custom_assig,add_user,create_group,...,view_concept_std,view_hint_std,exam_improv_Biology,exam_improv_Mathematics,exam_improv_Chemistry,exam_improv_Physics,teacher_activity,teacher_other,student_activity,student_other
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0.0,0.0,0.0,0.0,0.0,2.0,4.0,10.0,133.0,6.0,...,270.0,3824.0,0.002513,-0.039319,0.098176,-0.001666,176.0,76.5,117.820513,46.538462
100003,0.0,0.0,0.0,0.0,0.0,3.0,0.0,30.0,76.0,9.0,...,42.0,1177.0,0.026017,0.017784,-0.010819,0.036294,26.666667,38.333333,177.71875,13.0
100049,5.0,3.0,3.0,3.0,40.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.243966,-0.1172,-0.05857,-0.173454,0.0,0.0,0.0,0.0
100054,5.0,1.0,1.0,1.0,19.7,3.0,26.0,1.0,91.0,9.0,...,148.0,8422.0,-0.075617,-0.063231,0.004101,-0.087845,18.333333,42.333333,476.512195,30.658537
100065,0.0,0.0,0.0,0.0,0.0,7.0,16.0,27.0,174.0,16.0,...,70.0,3105.0,0.093966,-0.016816,0.146206,0.017131,153.428571,33.285714,200.126984,13.746032
