In [6]:
import bisect 
import pandas as pd
import numpy as np
import json

In [2]:
def get_best_cuts(x, y, thres, num_cuts):
    """Greedily choose up to ``num_cuts`` thresholds on ``x`` that best split ``y``.

    The data range is treated as one bin ``(x.min() - 1e-9, x.max()]``. In each
    round every candidate threshold ``t`` is scored by how much splitting the
    bin that contains it into ``(before, t]`` and ``(t, after]`` increases
    ``sum(n_bin * mean(y_bin) ** 2)``; the best-scoring candidate becomes a new
    bin boundary. The search stops early when no candidate has a strictly
    positive score.

    Parameters
    ----------
    x : array-like supporting ``.min()``, ``.max()`` and element-wise comparison
        Feature values (the notebook passes a 1-D ``numpy`` array).
    y : array-like indexable by a boolean mask
        Target values, aligned element-wise with ``x``.
    thres : iterable of numbers
        Candidate thresholds. Candidates that already are a boundary, that lie
        outside the open data range, or that would leave one side of the
        split empty are ignored.
    num_cuts : int
        Maximum number of thresholds to select.

    Returns
    -------
    list
        The selected thresholds in ascending order (possibly empty).
    """
    if len(x) == 0:
        # Nothing to split; x.min() would raise on an empty array.
        return []

    # Sentinel boundaries. The lower one sits just below the minimum so that
    # the half-open bins (before, after] cover every sample.
    cuts = [x.min() - 0.000000001, x.max()]
    for _ in range(num_cuts):
        best_r2 = 0
        best_t = None
        for t in thres:
            # Skip existing boundaries and candidates outside the data range.
            # Without the range check, a candidate above x.max() indexes past
            # the end of ``cuts`` and one below the lower sentinel wraps around
            # to ``cuts[-1]``.
            if t in cuts or not (cuts[0] < t < cuts[-1]):
                continue
            before = cuts[bisect.bisect_left(cuts, t) - 1]
            after = cuts[bisect.bisect_right(cuts, t)]
            bin1 = y[(before < x) & (x <= t)]
            bin2 = y[(t < x) & (x <= after)]
            if len(bin1) == 0 or len(bin2) == 0:
                # A one-sided split changes nothing; also avoids np.mean([]),
                # which only produced NaN plus a RuntimeWarning.
                continue
            merge_bin = y[(before < x) & (x <= after)]
            r2_imp = (
                len(bin1) * (np.mean(bin1)) ** 2
                + len(bin2) * (np.mean(bin2)) ** 2
                - len(merge_bin) * (np.mean(merge_bin)) ** 2
            )
            if best_r2 < r2_imp:
                best_r2 = r2_imp
                best_t = t
        if best_r2 == 0:
            # No candidate improved the score this round.
            break
        cuts.append(best_t)
        cuts = sorted(cuts)
    # Drop the two sentinel boundaries.
    return cuts[1:-1]

In [55]:
# Feature-extraction CSVs: all papers, reproducible papers, non-reproducible papers.
inp_path = "../danny_submission/feature_extractions_TA2/feature_extractions_for_all_papers_TA2.csv"
rep_path = "../danny_submission/feature_extractions_TA2/feature_extractions_for_reproducible_papers_TA2.csv"
non_rep_path = "../danny_submission/feature_extractions_TA2/feature_extractions_for_non_reproducible_papers_TA2.csv"

# Load the three tables in the same order as the paths above.
df_all, df_rep, df_non_rep = (
    pd.read_csv(csv_path) for csv_path in (inp_path, rep_path, non_rep_path)
)

In [18]:
ta2_full_path = "TA2_regression_data_final_with_folds.json"
# Only the DOI and label columns are used below, so select them right after loading.
df_ta2 = pd.read_json(ta2_full_path).loc[:, ["DOI_CR", "label"]]

In [19]:
# DOI -> label lookup; if a DOI occurs more than once, the last row wins.
pid_label_map = dict(zip(df_ta2["DOI_CR"], df_ta2["label"]))

In [64]:
# Attach each paper's label to all three feature tables.
# A paper_id that is missing from pid_label_map raises KeyError.
for feature_frame in (df_all, df_rep, df_non_rep):
    feature_frame["label_score"] = [
        pid_label_map[paper_id] for paper_id in feature_frame["paper_id"]
    ]

In [67]:
def getCuts(df_rep, df_non_rep, field = "P_Values_1"):
    """Find the single best cut on ``field`` separating the two paper groups.

    Non-null ``field`` values from ``df_rep`` are given target 1.0 and those
    from ``df_non_rep`` target 0.0. Roughly 500 evenly spaced candidate
    thresholds strictly inside the observed value range are passed to
    ``get_best_cuts`` with ``num_cuts=1``.

    Parameters
    ----------
    df_rep, df_non_rep : pandas.DataFrame
        Tables that both contain the column ``field``.
    field : str
        Name of the feature column to cut on.

    Returns
    -------
    list
        ``[field, cut_threshold, sample_size]``. ``cut_threshold`` is ``None``
        when there is no non-null data, when all values are identical, or when
        no candidate threshold improves the split.
    """
    rep_values = df_rep[field].dropna().tolist()
    non_rep_values = df_non_rep[field].dropna().tolist()

    x = np.array(rep_values + non_rep_values)
    y = np.array([1.0] * len(rep_values) + [0.0] * len(non_rep_values))
    sample_size = x.shape[0]

    # No data, or a constant feature: there is nothing to split, and the
    # threshold grid below would need min()/a non-zero step.
    if sample_size == 0 or x.min() == x.max():
        return [field, None, sample_size]

    lo, hi = x.min(), x.max()
    thresh = np.arange(lo, hi, (hi - lo) / 500)[1:]
    # With a float step, arange may emit a last element at or above ``hi``;
    # such a candidate cannot split the data, so drop it.
    thresh = thresh[thresh < hi]
    cuts = get_best_cuts(x, y, thresh, 1)

    # get_best_cuts returns an empty list when no candidate helps.
    best_cut = cuts[0] if cuts else None
    return [field, best_cut, sample_size]

In [71]:
def getCutsWithScores(df_all, field = "P_Values_1"):
    """Find the single best cut on ``field`` against the ``label_score`` column.

    Rows of ``df_all`` with a non-null ``field`` supply the feature values and
    their ``label_score`` values supply the target. Roughly 500 evenly spaced
    candidate thresholds strictly inside the observed value range are passed
    to ``get_best_cuts`` with ``num_cuts=1``.

    Parameters
    ----------
    df_all : pandas.DataFrame
        Table containing the columns ``field`` and ``"label_score"``.
    field : str
        Name of the feature column to cut on.

    Returns
    -------
    list
        ``[field, cut_threshold, sample_size]``. ``cut_threshold`` is ``None``
        when there is no non-null data, when all values are identical, or when
        no candidate threshold improves the split.
    """
    rows_with_value = df_all[df_all[field].notnull()]
    x = np.array(list(rows_with_value[field]))
    y = np.array(list(rows_with_value["label_score"]))
    sample_size = x.shape[0]

    # No data, or a constant feature: there is nothing to split, and the
    # threshold grid below would need min()/a non-zero step.
    if sample_size == 0 or x.min() == x.max():
        return [field, None, sample_size]

    lo, hi = x.min(), x.max()
    thresh = np.arange(lo, hi, (hi - lo) / 500)[1:]
    # With a float step, arange may emit a last element at or above ``hi``;
    # such a candidate cannot split the data, so drop it.
    thresh = thresh[thresh < hi]
    cuts = get_best_cuts(x, y, thresh, 1)

    # get_best_cuts returns an empty list when no candidate helps.
    best_cut = cuts[0] if cuts else None
    return [field, best_cut, sample_size]

In [69]:
# Best single cut on P_Values_1 with binary targets: 1.0 for df_rep rows, 0.0 for df_non_rep rows.
getCuts(df_rep, df_non_rep, field= "P_Values_1")

['P_Values_1', 0.0057497, 330]

In [72]:
# Same feature, but using the label_score column of df_all as the target.
getCutsWithScores(df_all, field= "P_Values_1")

['P_Values_1', 0.00588994, 852]

In [75]:
# List the available columns to choose which features to cut on.
df_all.columns

Index(['paper_id', 'Number_of_Studies', 'Number_of_Models', 'Effect_Size_1',
       'Effect_Size_2', 'Effect_Size_3', 'Effect_Size_4', 'Effect_Size_5',
       'P_Values_1', 'P_Values_2', 'P_Values_3', 'P_Values_4', 'P_Values_5',
       'Model_Names_1', 'Model_Names_2', 'Model_Names_3', 'Model_Names_4',
       'Model_Names_5', 'Sample_Sizes_1', 'Sample_Sizes_2', 'Sample_Sizes_3',
       'Sample_Sizes_4', 'Sample_Sizes_5', 'sorting_order_based_on',
       'label_score'],
      dtype='object')

In [76]:
# Columns of df_all for which a cut threshold is computed.
feature_list = [
    "P_Values_1",
    "Effect_Size_1",
    "Sample_Sizes_1",
    "Number_of_Studies",
    "Number_of_Models",
]

In [77]:
# One [feature_name, cut_threshold, num_samples] row per feature in feature_list.
df_res_cols = ["feature_name", "cut_threshold", "num_samples"]
df_res_data = [
    getCutsWithScores(df_all, field=feature_name) for feature_name in feature_list
]

In [78]:
# Collect the per-feature results into a table.
df_res = pd.DataFrame(data=df_res_data, columns=df_res_cols)

In [79]:
# Display the per-feature cut thresholds and sample counts.
df_res

Unnamed: 0,feature_name,cut_threshold,num_samples
0,P_Values_1,0.00589,852
1,Effect_Size_1,0.25274,273
2,Sample_Sizes_1,283052.006,1245
3,Number_of_Studies,1.014,26
4,Number_of_Models,9.008,136


## Ad hoc analysis for Danny:

In [4]:
# JSON version of the all-papers feature extractions.
feat_path = "../danny_submission/feature_extractions_TA2/feature_extractions_for_all_papers_TA2.json"

In [7]:
# Read the extractions. The encoding is given explicitly because open()
# otherwise decodes with the platform's locale default, which is not
# necessarily UTF-8.
with open(feat_path, "r", encoding="utf-8") as f:
    feat_data = json.load(f)

In [51]:
# Number of top-level entries in the loaded JSON.
len(feat_data)

2380

In [48]:
def getClosestSampleSize(pv_idx, ss_list):
    """Return the sample size whose sentence index is nearest to ``pv_idx``.

    Parameters
    ----------
    pv_idx : number
        Sentence index of a p-value.
    ss_list : iterable of dict, or None
        Sample-size entries, each with ``"value"`` and ``"sent_idx"`` keys.

    Returns
    -------
    list
        ``[value, absolute_distance]`` of the nearest entry; the earliest
        entry wins a tie. ``[None, None]`` when ``ss_list`` is ``None`` or
        has no entries.
    """
    if ss_list is None:
        return [None, None]

    # (value, distance) for every entry, in input order.
    candidates = [
        (entry["value"], abs(pv_idx - entry["sent_idx"])) for entry in ss_list
    ]
    if not candidates:
        return [None, None]

    # min() keeps the first of several equally near entries.
    nearest_value, nearest_dist = min(candidates, key=lambda pair: pair[1])
    return [nearest_value, nearest_dist]
    

def getPvalues(feat_data):
    """Build one row per extracted p-value, paired with its nearest sample size.

    Parameters
    ----------
    feat_data : dict
        Maps a paper id to its extracted features. Each value is read at
        ``["Validity_of_Inference"]["P_Values"]`` and
        ``["Design_Quality"]["Sample_Sizes"]``; each p-value entry is read at
        ``"value"`` and ``"sent_idx"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``paper_doi``, ``p_value``, ``closest_sample_size``,
        ``absolute_dist_between_pv_and_ss`` and ``label``.

    Notes
    -----
    Papers with no p-values, or with no entry in the notebook-level
    ``pid_label_map``, are skipped. Any other problem (for example a malformed
    entry) raises instead of being silently swallowed.
    """
    result_cols = ["paper_doi", "p_value", "closest_sample_size", "absolute_dist_between_pv_and_ss", "label"]
    result_data = []

    for pid, paper_feats in feat_data.items():
        pv_list = paper_feats["Validity_of_Inference"]["P_Values"]
        if not pv_list:
            # None or empty: nothing to emit for this paper.
            continue
        if pid not in pid_label_map:
            # Unlabeled papers are left out of the table on purpose.
            continue
        label = pid_label_map[pid]
        ss_list = paper_feats["Design_Quality"]["Sample_Sizes"]

        for cur_pv in pv_list:
            closest_ss = getClosestSampleSize(cur_pv["sent_idx"], ss_list)
            result_data.append([pid, cur_pv["value"]] + closest_ss + [label])

    return pd.DataFrame(data=result_data, columns=result_cols)

In [49]:
# One row per extracted p-value across the papers in feat_data.
df_pv_list = getPvalues(feat_data)

In [50]:
# Write the p-value table to CSV without the index column.
pv_out_path = "TA2_pvalue_extractions.csv"
df_pv_list.to_csv(pv_out_path, index=False)

In [52]:
# Preview the first rows of the p-value table.
df_pv_list.head()

Unnamed: 0,paper_doi,p_value,closest_sample_size,absolute_dist_between_pv_and_ss,label
0,10.1177/0003122414545986,0.05,70562.0,135.0,0.314487
1,10.1177/0003122414545986,0.05,70562.0,167.0,0.314487
2,10.1177/0003122414545986,0.01,70562.0,167.0,0.314487
3,10.1177/0003122414545986,0.001,70562.0,167.0,0.314487
4,10.1037/a0029648,0.0001,20.0,7.0,0.821205
