In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from scipy.stats import ttest_ind,pearsonr, kendalltau, spearmanr

# Inputs

external_csv = r"C:\Users\Andris\Documents\GitHub\data\clean\school_ext_data.csv"
internal_csv = r"C:\Users\Andris\Documents\GitHub\data\clean\school_clk_data.csv"

def bin_groups(df, feature, bins, group_names):
    
    categories = pd.cut(df[feature],bins, labels=group_names)
    return categories

def test_correlation(df, x_array, y_array, corrType):
    
    store_array = []
    
    for item_x in x_array:
        for item_y in y_array:
            score, pval = corrType(df[item_x],df[item_y])
            store_array.append([item_x, item_y, score,pval])
            
    df = pd.DataFrame(store_array,columns=['x','y','Correlation','P Value'])
            
    return df


In [11]:
# ------------------------------------------------------
# JOIN EXTERNAL DATA WITH CLICKSTREAM DATA
# ------------------------------------------------------

# Read external data (indices and exam results)
df_ext = pd.read_csv(external_csv)

# Read internal data (clickstream data)
df_int = pd.read_csv(internal_csv)

# Combine the 2 dataframes
df_ext['URN'] = df_ext['URN'].astype('str') 
df = df_ext.set_index('URN').join(df_int.set_index('school_id'))

# ------------------------------------------------------
# ADDING NEW FEATURES AND INDICES
# ------------------------------------------------------

# Computing improvement in exam scores

qu = ['Biology','Mathematics','Chemistry','Physics']

for item in qu:
    df['exam_improv_2013-2014_' + item] = (df['exam_score|2014_'+item] - df['exam_score|2013_'+item]) - (df['exam_score|2014_'+item] - df['exam_score|2013_'+item]).mean()

for item in qu:
    df['exam_improv_2013-2016_' + item] = (df['exam_score|2016_'+item] - df['exam_score|2013_'+item]) - (df['exam_score|2016_'+item] - df['exam_score|2013_'+item]).mean()

# Student activity
df['student_activity'] =  (df.filter(regex='_cor_std|q_lvl_[0-6]_std').sum(axis=1)/(df.filter(regex='Entries').sum(axis=1))).fillna(0)

# ------------------------------------------------------
# CLEANING THE DATA
# ------------------------------------------------------

#  All clickstream data that is not defined is set to 0
df = df.fillna(0)

# Set all external indices with no value as NULL
df['IDACI'] = df['IDACI'].replace(to_replace=0,value=np.nan)
df['L_M_index'] = df['L_M_index'].replace(to_replace=0,value=np.nan)
df['Effectiveness'] = df['Effectiveness'].replace(to_replace=0,value=np.nan)
df['Teach_quality'] = df['Teach_quality'].replace(to_replace=0,value=np.nan)

# Select the schools that have provide all science courses in all years
df = df[(df.filter(regex='Entries') > 0).all(axis=1)]

# Remove any unecessary features
df = df.drop(df.filter(regex='Entries|exam_score'),axis=1)

# Binning according to IP activity
bins = [-0.1, 0, 10,1000]
group_names = ['Inactive','Semi-active','Active']

df['categories'] = bin_groups(df,'student_activity',bins,group_names)

# Creating another dataframe that has removed outliers (who fall outside the 95% percentile)
q = df[['student_activity']].quantile(0.95)
df_out = df[(df[q.keys()] < q).all(axis=1)]

print(df_out.shape)
df_out.head()

(854, 56)


Unnamed: 0_level_0,IDACI,Effectiveness,Teach_quality,L_M_index,PercentageFSM,q_lvl_2_cor_tch,view_concept_tch,q_lvl_3_cor_tch,q_lvl_2_tch,q_lvl_0_cor_tch,...,exam_improv_2013-2014_Biology,exam_improv_2013-2014_Mathematics,exam_improv_2013-2014_Chemistry,exam_improv_2013-2014_Physics,exam_improv_2013-2016_Biology,exam_improv_2013-2016_Mathematics,exam_improv_2013-2016_Chemistry,exam_improv_2013-2016_Physics,student_activity,categories
URN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,,,,,0.0,0.0,3.0,0.0,0.0,0.0,...,0.036272,-0.003697,-0.061495,-0.099692,0.029736,-0.04333,-0.0031,-0.062738,1.531328,Semi-active
100003,,,,,0.0,0.0,8.0,0.0,0.0,29.0,...,-0.08035,-0.020661,-0.019674,0.052747,-0.020112,-0.013455,-0.009502,0.106763,6.104,Semi-active
100054,5.0,1.0,1.0,1.0,19.7,1.0,8.0,1.0,3.0,0.0,...,0.138762,-0.006687,0.016362,0.015041,-0.035131,-0.0173,0.003301,-0.038957,3.130952,Semi-active
100065,,,,,0.0,8.0,10.0,12.0,49.0,44.0,...,0.096233,0.001924,0.015242,0.059983,0.064762,-0.037641,0.037853,0.113732,10.724924,Active
100076,,,,,0.0,22.0,2.0,35.0,78.0,0.0,...,-0.107807,0.032783,-0.044749,-0.167057,-0.109535,-0.023395,-0.024589,-0.062783,0.162252,Semi-active


In [29]:
df[df['categories']=='Active'].corr()[['IDACI','Effectiveness','Teach_quality','L_M_index']]



Unnamed: 0,IDACI,Effectiveness,Teach_quality,L_M_index
IDACI,1.0,-0.105745,-0.028079,-0.076649
Effectiveness,-0.105745,1.0,0.856408,0.861189
Teach_quality,-0.028079,0.856408,1.0,0.776472
L_M_index,-0.076649,0.861189,0.776472,1.0
PercentageFSM,0.805795,0.18551,0.121014,0.13555
q_lvl_2_cor_tch,0.09298,-0.100151,-0.140328,-0.099727
view_concept_tch,0.018807,-0.144512,-0.134092,-0.163175
q_lvl_3_cor_tch,0.173044,-0.143376,-0.154567,-0.147739
q_lvl_2_tch,0.010909,-0.035245,-0.097488,-0.047541
q_lvl_0_cor_tch,0.034753,-0.208235,-0.242216,-0.214324


In [18]:
df[df['categories']=='Inactive'].corr()['exam_improv_2013-2016_Physics'].sort_values(ascending=False)

exam_improv_2013-2016_Physics        1.000000
exam_improv_2013-2014_Physics        0.463375
exam_improv_2013-2016_Mathematics    0.414300
exam_improv_2013-2016_Chemistry      0.335616
exam_improv_2013-2016_Biology        0.261938
exam_improv_2013-2014_Biology        0.144436
exam_improv_2013-2014_Chemistry      0.143089
exam_improv_2013-2014_Mathematics    0.128298
q_lvl_2_cor_tch                      0.069370
q_lvl_5_tch                          0.063668
q_lvl_0_cor_tch                      0.062113
q_lvl_5_cor_tch                      0.061382
view_assig_prog                      0.052887
q_lvl_2_tch                          0.044809
play_video_tch                       0.038195
view_hint_tch                        0.036796
view_concept_tch                     0.035715
user_id_tch                          0.026369
add_custom_assig                     0.025414
q_lvl_0_tch                          0.025224
add_user                             0.024376
q_lvl_6_cor_tch                   