In [2]:
import pandas as pd

def get_data(path, start_year):
  data = pd.read_csv(path)
  y_col = 'y_' + str(start_year + 1)
  data = data[(data['T_' + str(start_year)] > 0) & (data['T_' + str(start_year + 1)] > 0) & (data[y_col] > 0)]
  data = data.applymap(lambda x: -1 if x <= 0 else x)
  return data

def get_data_for_trial(data, start_year):
  data = data[(data['T_' + str(start_year)] < 6)]
  data['T_' + str(start_year + 1)] = data['T_' + str(start_year + 1)].apply(lambda x: int(x == 6))
  columns_to_use = [c for c in data.columns if c.endswith(str(start_year + 1)) and not c.startswith('y')] + ['gender']
  data = data[columns_to_use]
  return data.reset_index()

In [3]:
from scipy.stats import chi2_contingency, chisquare
from scipy.stats import ttest_ind


def get_statistics_of_categorial_col(data, col_name, start_year):
  """
  computes counts and precentage of each unique value in the column specified among treated and control.
  compares the distributions between treated and control by performing chi square test.

  returns: dictionatry including counts and precentage for each unique value among control and treated.
            also, including pvalue and chisquare statistic from the test


  """
  treated = data.loc[data['T_' + str(start_year + 1)] == 1, col_name]
  control = data.loc[data['T_' + str(start_year + 1)] == 0, col_name]

  treated_counts = treated.value_counts().to_dict()
  control_counts = control.value_counts().to_dict()
  
  num_treated = len(treated)
  num_control = len(control)
  treated_precentage = dict()
  control_precentage = dict()

  for value, count in treated_counts.items():
    treated_precentage[value] = count / num_treated

  for value, count in control_counts.items():
    control_precentage[value] = count / num_control

  zero_flag = False
  for val, count in treated_counts.items():
    if val not in control_counts.keys():
      control_precentage[val] = 0
      zero_flag = True
  for val, count in control_counts.items():
    if val not in treated_counts.keys():
      treated_precentage[val] = 0
      zero_flag = True
  
  statistics = dict()
  statistics['counts'] = dict()
  statistics['counts']['treated'] = treated_counts
  statistics['counts']['control'] = control_counts

  statistics['precentage'] = dict()
  statistics['precentage']['treated'] = {k : round(v * 100) for k, v in treated_precentage.items()}
  statistics['precentage']['control'] = {k : round(v * 100) for k, v in control_precentage.items()}

  if not zero_flag:
    test_res = chisquare(list(control_precentage.values()), list(treated_precentage.values()))
    statistics['pval'] = test_res.pvalue
    statistics['statistic'] = test_res.statistic
  else:
    test_res = chi2_contingency([list(control_precentage.values()), list(treated_precentage.values())])
    statistics['pval'] = test_res[1]
    statistics['statistic'] = test_res[0]

  return statistics


def get_statistics_of_continous_col(data, col_name, start_year):
  data = data[data[col_name] > 0]
  treated = data.loc[data['T_' + str(start_year + 1)] == 1, col_name]
  control = data.loc[data['T_' + str(start_year + 1)] == 0, col_name]

  mean_treated = round(treated.mean())
  mean_control = round(control.mean())

  var_treated = round(treated.var())
  var_control = round(control.var())
  test_res = ttest_ind(control, treated, equal_var = False)

  statistics = dict()
  statistics['mean'] = dict()
  statistics['mean']['treated'] = mean_treated
  statistics['mean']['control'] = mean_control

  statistics['var'] = dict()
  statistics['var']['treated'] = var_treated
  statistics['var']['control'] = var_control

  statistics['pval'] = test_res.pvalue
  statistics['statistic'] = test_res.statistic

  return statistics


def get_statistics_of_all_data(data, categorial_columns, continous_columns, start_year):
  statistics = dict()
  statistics['continous'] = dict()
  statistics['categorial'] = dict()

  for c in categorial_columns:
    statistics['categorial'][c] = get_statistics_of_categorial_col(data, c, start_year)

  for c in continous_columns:
    statistics['continous'][c] = get_statistics_of_continous_col(data, c, start_year)

  return statistics

In [None]:
data = get_data('/content/2009_2010_data.csv', 2009)
data = get_data_for_trial(data, 2009)
continous_columns = ['income_2010', 'CVC_HOURS_WK_YR_ALL_2010']
categorial_columns = [c for c in data.columns if c not in continous_columns and not c.endswith('2009') and 
                      c not in {'T_2010', 'y_2010', 'index', 'PUBID'}]

stats_2009_2010 = get_statistics_of_all_data(data, categorial_columns, continous_columns, 2009)

In [5]:
stats_2009_2010

{'continous': {'income_2010': {'mean': {'treated': 29302, 'control': 25049},
   'var': {'treated': 391129549, 'control': 361818009},
   'pval': 2.5034008370051766e-06,
   'statistic': -4.732887781322325},
  'CVC_HOURS_WK_YR_ALL_2010': {'mean': {'treated': 1811, 'control': 1765},
   'var': {'treated': 740688, 'control': 913133},
   'pval': 0.21979137461686377,
   'statistic': -1.2276432823793033}},
 'categorial': {'ENROLLSTAT_2010': {'counts': {'treated': {4: 238,
     3: 142,
     6: 132,
     5: 71,
     1: 66,
     9: 65,
     10: 60,
     2: 59,
     11: 39,
     7: 28,
     8: 9,
     -1: 7},
    'control': {3: 785,
     4: 684,
     1: 494,
     2: 405,
     6: 214,
     5: 122,
     9: 104,
     10: 77,
     7: 39,
     11: 28,
     -1: 21,
     8: 20}},
   'precentage': {'treated': {4: 26,
     3: 16,
     6: 14,
     5: 8,
     1: 7,
     9: 7,
     10: 7,
     2: 6,
     11: 4,
     7: 3,
     8: 1,
     -1: 1},
    'control': {3: 26,
     4: 23,
     1: 17,
     2: 14,
     6

In [None]:
data = get_data('/content/2010_2011_data.csv', 2010)
data = get_data_for_trial(data, 2010)
continous_columns = ['income_2011', 'CVC_HOURS_WK_YR_ALL_2011']
categorial_columns = [c for c in data.columns if c not in continous_columns and not c.endswith('2010') and 
                      c not in {'T_2011', 'y_2011', 'index', 'PUBID'}]

stats_2010_2011 = get_statistics_of_all_data(data, categorial_columns, continous_columns, 2010)

In [7]:
stats_2010_2011

{'continous': {'income_2011': {'mean': {'treated': 31461, 'control': 25916},
   'var': {'treated': 502472688, 'control': 376526842},
   'pval': 5.44409517229435e-08,
   'statistic': -5.479291118793351},
  'CVC_HOURS_WK_YR_ALL_2011': {'mean': {'treated': 1868, 'control': 1783},
   'var': {'treated': 858812, 'control': 863515},
   'pval': 0.036013041363767206,
   'statistic': -2.0991598976193298}},
 'categorial': {'CV_HIGHEST_DEGREE_2011': {'counts': {'treated': {2: 437,
     4: 159,
     1: 110,
     3: 76,
     -1: 62,
     5: 24,
     7: 2,
     6: 1},
    'control': {2: 1453, -1: 557, 1: 489, 4: 233, 3: 133, 5: 32, 7: 4, 6: 1}},
   'precentage': {'treated': {2: 50,
     4: 18,
     1: 13,
     3: 9,
     -1: 7,
     5: 3,
     7: 0,
     6: 0},
    'control': {2: 50, -1: 19, 1: 17, 4: 8, 3: 5, 5: 1, 7: 0, 6: 0}},
   'pval': 0.9999999397347724,
   'statistic': 0.03502282315725358},
  'ENROLLSTAT_2011': {'counts': {'treated': {4: 235,
     6: 137,
     3: 136,
     2: 75,
     5: 63,
 

In [19]:
import json
with open("stats_2009_2010.json", "w") as outfile:
    json.dump(stats_2009_2010, outfile)
with open("stats_2010_2011.json", "w") as outfile:
    json.dump(stats_2010_2011, outfile)