In [1]:
import sys
import numpy as np
import pandas as pd
from scipy import stats

OUTPUT_TEMPLATE = (
    '"Did more/less users use the search feature?" p-value: {more_users_p:.3g}\n'
    '"Did users search more/less?" p-value: {more_searches_p:.3g}\n'
    '"Did more/less instructors use the search feature?" p-value: {more_instr_p:.3g}\n'
    '"Did instructors search more/less?" p-value: {more_instr_searches_p:.3g}'
)

In [2]:
data = pd.read_json("searches.json", orient='records', lines=True)

In [3]:
data_new = data[data['uid'] %2 ==1]
data_old = data[data['uid'] %2 == 0]
data_old_count = data_old.count(0)

In [4]:
data_new_searched = data_new[data_new['search_count'] != 0]
data_new_searched_count = data_new_searched.count(0)

data_new_nosearched = data_new[data_new['search_count'] == 0]
data_new_nosearched_count = data_new_nosearched.count(0)

data_old_searched = data_old[data_old['search_count'] != 0]
data_old_searched_count = data_old_searched.count(0)

data_old_nosearched = data_old[data_old['search_count'] == 0]
data_old_nosearched_count = data_old_nosearched.count(0)

In [5]:
data_new_instr = data_new[data_new['is_instructor'] != 0]
data_old_instr = data_old[data_old['is_instructor'] != 0]

In [6]:
data_new_instr_searched = data_new_instr[data_new_instr['search_count'] != 0]
data_old_instr_searched = data_old_instr[data_old_instr['search_count'] != 0]
data_new_instr_nosearched = data_new_instr[data_new_instr['search_count'] == 0]
data_old_instr_nosearched = data_old_instr[data_old_instr['search_count'] == 0]

data_new_instr_searched_count =data_new_instr_searched.count(0)
data_old_instr_searched_count = data_old_instr_searched.count(0)
data_new_instr_nosearched_count =data_new_instr_nosearched.count(0)
data_old_instr_nosearched_count = data_old_instr_nosearched.count(0)

In [7]:
contingency_all = [[data_old_searched_count['uid'], data_old_nosearched_count['uid']],
                       [data_new_searched_count['uid'], data_new_nosearched_count['uid']]]
chi2, p1, dof, expected = stats.chi2_contingency(contingency_all)
p2 = stats.mannwhitneyu(data_new['search_count'], data_old['search_count']).pvalue

In [8]:
contingency_instr = [[data_old_instr_searched_count['uid'], data_old_instr_nosearched_count['uid']],
                       [data_new_instr_searched_count['uid'], data_new_instr_nosearched_count['uid']]]
chi2, p3, dof, expected = stats.chi2_contingency(contingency_instr)
p4 = stats.mannwhitneyu(data_new_instr['search_count'], data_old_instr['search_count']).pvalue

In [9]:

print(OUTPUT_TEMPLATE.format(
        more_users_p=p1,
        more_searches_p=p2,
        more_instr_p=p3,
        more_instr_searches_p=p4,
    ))

"Did more/less users use the search feature?" p-value: 0.168
"Did users search more/less?" p-value: 0.0706
"Did more/less instructors use the search feature?" p-value: 0.052
"Did instructors search more/less?" p-value: 0.0225
