In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
import json
from collections import defaultdict

## Get phrase-level extraction data for reproducible vs non-reproducible papers:

In [11]:
train_extraction_path = "../repr_claims_results/socrepr_claims_train.json"
dev_extraction_path = "../repr_claims_results/socrepr_claims_dev.json"

In [12]:
with open(train_extraction_path, "r") as f:
    train_res = json.load(f)
    
with open(dev_extraction_path, "r") as f:
    dev_res = json.load(f)

In [13]:
res_full = train_res.copy()
res_full = res_full + dev_res
df_res = pd.DataFrame(res_full)

In [14]:
df_res["important_segment_idx"] = df_res.apply(lambda x: pid_seg_idx_map[x["paper_id"]][0], axis=1)
df_res["important_segment"] = df_res.apply(lambda x: pid_seg_idx_map[x["paper_id"]][1], axis=1)

In [15]:
print(df_res.shape)
print(df_res.columns)

(845, 6)
Index(['paper_id', 'label', 'predicted_label', 'important_segment',
       'important_phrases', 'important_segment_idx'],
      dtype='object')


In [16]:
reverse_seg_map = {
    '0': "claim2",
    '1':"claim3a",
    '2':"claim3b",
    '3':"claim4"
}

In [17]:
df_res["important_segment_idx"] = df_res.apply(lambda x: reverse_seg_map[str(x["important_segment_idx"])], axis=1)

In [18]:
def cleanPhrases(phrase_list):
    out_list = []
    for cur_phrase in phrase_list:
        new_phrase = cur_phrase[1].replace("- lrb -", "(").replace("- rrb -", ")")
        out_list.append([cur_phrase[0], new_phrase])
    return out_list

In [19]:
# Replace the "- lrb - and - rrb -" tokens:
df_res["important_phrases"] = df_res.apply(lambda x: cleanPhrases(x["important_phrases"][0:5]), axis=1)

In [20]:
df_11 = df_res[(df_res["label"] == 1) & (df_res["predicted_label"] == 1)]
df_00 = df_res[(df_res["label"] == 0) & (df_res["predicted_label"] == 0)]

In [21]:
rep_path = "phrase_extractions_for_reproducible_papers_TA2.json"
df_11_json_out = df_11.to_json(orient="records")

# with open(rep_path, "w") as f:
#     json.dump(json.loads(df_11_json_out), f, indent=2)

In [22]:
non_rep_path = "phrase_extractions_for_non_reproducible_papers_TA2.json"
df_00_json_out = df_00.to_json(orient="records")

# with open(non_rep_path, "w") as f:
#     json.dump(json.loads(df_00_json_out), f, indent=2)

## Get feature extraction data for reproducible and non-reproducible papers:

In [23]:
feat_path = "../danny_submission/feature_extractions_TA2/feature_extractions_for_all_papers_TA2.json"

In [24]:
with open(feat_path, "r") as f:
    feat_data = json.load(f)

In [25]:
rep_feat_list = {}
non_rep_feat_list = {}

In [26]:
for key, val in feat_data.items():
    if pid_label_map.get(key) is None:
        continue
        
    if pid_label_map[key] == 1:
        rep_feat_list[key] = val
        rep_feat_list[key]["label"] = 1
    elif pid_label_map[key] == 0:
        non_rep_feat_list[key] = val
        non_rep_feat_list[key]["label"] = 0

In [13]:
def getStats(cur_feat_data):
    count_dict = defaultdict(lambda : 0)
    count_none_dict = defaultdict(lambda : 0)
    
    for k, v in cur_feat_data.items():
        ns = v["Validity_of_Inference"]["Number_of_Studies"]
        es = v["Validity_of_Inference"]["Effect_Size"]
        nm = v["Validity_of_Inference"]["Number_of_Models"]
        pv = v["Validity_of_Inference"]["P_Values"]
        mn = v["Validity_of_Inference"]["Model_Names"]
        ss = v["Design_Quality"]["Sample_Sizes"]
        claim_cnts = {"claim2_distance": 0, "claim3a_distance": 0, "claim3b_distance": 0, "claim4_distance": 0}
        
        if ns is None:
            count_none_dict["Number_of_Studies"] += 1
        else:
            count_dict["Number_of_Studies"] += 1
            
        if es is None:
            count_none_dict["Effect_Size"] += 1
        elif len(es) !=0:
            count_dict["Effect_Size"] += 1
            cur_val = es[0]
            for k, v in cur_val.items():
                if k != "sent_idx" and v is not None:
                    claim_cnts[k] = 1
            
        if nm is None:
            count_none_dict["Number_of_Models"] += 1
        else:
            count_dict["Number_of_Models"] += 1
            
        if pv is None:
            count_none_dict["P_Values"] += 1
        elif len(pv) !=0:
            count_dict["P_Values"] += 1
            cur_val = pv[0]
            for k, v in cur_val.items():
                if k != "sent_idx" and v is not None:
                    claim_cnts[k] = 1
        
        if mn is None:
            count_none_dict["Model_Names"] += 1
        elif len(mn) !=0:
            count_dict["Model_Names"] += 1
            cur_val = mn[0]
            for k, v in cur_val.items():
                if k != "sent_idx" and v is not None:
                    claim_cnts[k] = 1
        
        if ss is None:
            count_none_dict["Sample_Sizes"] += 1
        elif len(ss) !=0:
            count_dict["Sample_Sizes"] += 1
            cur_val = ss[0]
            for k, v in cur_val.items():
                if k != "sent_idx" and v is not None:
                    claim_cnts[k] = 1
                    
        for k, v in claim_cnts.items():
            if k=="value":
                continue
            if v == 1:
                count_dict[k] += 1
            else:
                count_none_dict[k] += 1
                
    count_dict = dict(sorted(dict(count_dict).items(), key=lambda x: x[0]))
    count_none_dict = dict(sorted(dict(count_none_dict).items(), key=lambda x: x[0]))
    
    percent_dict = {}
    percent_none_dict = {}
    total_num_papers = len(cur_feat_data.keys())
    for k, v in count_dict.items():
        percent_dict[k] = round(v * 100/total_num_papers, 2)
    
    for k, v in count_none_dict.items():
        percent_none_dict[k] = round(v * 100/total_num_papers, 2)
    
    print("Count of non-None entries in papers:")
    print(count_dict, "\n")
    
    print("Percent of non-None entries in papers:")
    print(percent_dict, "\n")
    
    print("Count of None entries in papers:")
    print(count_none_dict)
    
    print("Percent of None entries in papers:")
    print(percent_none_dict, "\n")

In [69]:
print("For all {} TA2 papers: ".format(len(feat_data)))
getStats(feat_data)

For all 2380 TA2 papers: 
Count of non-None entries in papers:
{'Effect_Size': 569, 'Model_Names': 2367, 'Number_of_Models': 273, 'Number_of_Studies': 87, 'P_Values': 1748, 'Sample_Sizes': 2329, 'claim2_distance': 1722, 'claim3a_distance': 656, 'claim3b_distance': 977, 'claim4_distance': 617} 

Percent of non-None entries in papers:
{'Effect_Size': 23.91, 'Model_Names': 99.45, 'Number_of_Models': 11.47, 'Number_of_Studies': 3.66, 'P_Values': 73.45, 'Sample_Sizes': 97.86, 'claim2_distance': 72.35, 'claim3a_distance': 27.56, 'claim3b_distance': 41.05, 'claim4_distance': 25.92} 

Count of None entries in papers:
{'Effect_Size': 1811, 'Model_Names': 13, 'Number_of_Models': 2107, 'Number_of_Studies': 2293, 'P_Values': 632, 'Sample_Sizes': 36, 'claim2_distance': 658, 'claim3a_distance': 1724, 'claim3b_distance': 1403, 'claim4_distance': 1763}
Percent of None entries in papers:
{'Effect_Size': 76.09, 'Model_Names': 0.55, 'Number_of_Models': 88.53, 'Number_of_Studies': 96.34, 'P_Values': 26.55

In [70]:
print("For {} reproducible papers: ".format(len(rep_feat_list)))
getStats(rep_feat_list)

For 392 reproducible papers: 
Count of non-None entries in papers:
{'Effect_Size': 82, 'Model_Names': 387, 'Number_of_Models': 51, 'Number_of_Studies': 12, 'P_Values': 237, 'Sample_Sizes': 376, 'claim2_distance': 276, 'claim3a_distance': 118, 'claim3b_distance': 165, 'claim4_distance': 121} 

Percent of non-None entries in papers:
{'Effect_Size': 20.92, 'Model_Names': 98.72, 'Number_of_Models': 13.01, 'Number_of_Studies': 3.06, 'P_Values': 60.46, 'Sample_Sizes': 95.92, 'claim2_distance': 70.41, 'claim3a_distance': 30.1, 'claim3b_distance': 42.09, 'claim4_distance': 30.87} 

Count of None entries in papers:
{'Effect_Size': 310, 'Model_Names': 5, 'Number_of_Models': 341, 'Number_of_Studies': 380, 'P_Values': 155, 'Sample_Sizes': 14, 'claim2_distance': 116, 'claim3a_distance': 274, 'claim3b_distance': 227, 'claim4_distance': 271}
Percent of None entries in papers:
{'Effect_Size': 79.08, 'Model_Names': 1.28, 'Number_of_Models': 86.99, 'Number_of_Studies': 96.94, 'P_Values': 39.54, 'Sample_

In [71]:
print("For {} non-reproducible papers: ".format(len(non_rep_feat_list)))
getStats(non_rep_feat_list)

For 492 non-reproducible papers: 
Count of non-None entries in papers:
{'Effect_Size': 145, 'Model_Names': 490, 'Number_of_Models': 53, 'Number_of_Studies': 36, 'P_Values': 426, 'Sample_Sizes': 488, 'claim2_distance': 339, 'claim3a_distance': 128, 'claim3b_distance': 181, 'claim4_distance': 126} 

Percent of non-None entries in papers:
{'Effect_Size': 29.47, 'Model_Names': 99.59, 'Number_of_Models': 10.77, 'Number_of_Studies': 7.32, 'P_Values': 86.59, 'Sample_Sizes': 99.19, 'claim2_distance': 68.9, 'claim3a_distance': 26.02, 'claim3b_distance': 36.79, 'claim4_distance': 25.61} 

Count of None entries in papers:
{'Effect_Size': 347, 'Model_Names': 2, 'Number_of_Models': 439, 'Number_of_Studies': 456, 'P_Values': 66, 'Sample_Sizes': 1, 'claim2_distance': 153, 'claim3a_distance': 364, 'claim3b_distance': 311, 'claim4_distance': 366}
Percent of None entries in papers:
{'Effect_Size': 70.53, 'Model_Names': 0.41, 'Number_of_Models': 89.23, 'Number_of_Studies': 92.68, 'P_Values': 13.41, 'Samp

In [49]:
rep_feat_path = "feature_extractions_for_reproducible_papers_TA2.json"
non_rep_feat_path = "feature_extractions_for_non_reproducible_papers_TA2.json"

# with open(rep_feat_path, "w") as f:
#     json.dump(rep_feat_list, f, indent=2)
    
# with open(non_rep_feat_path, "w") as f:
#     json.dump(non_rep_feat_list, f, indent=2)

## Order the feature extractions based on distance to claims:

In [14]:
def getList(input_list, topk=5):
    out_list = [None] * topk
    based_on = None
    
    if input_list is None or len(input_list) == 0:
        return out_list, based_on
    
    first_val = input_list[0]
    
    if first_val.get("claim4_distance") is not None:
        sorted_list = sorted(input_list, key=lambda x: abs(x["claim4_distance"]))
        sorted_list = [x["value"] for x in sorted_list]
        based_on = "claim4_distance"
        
    elif first_val.get("claim3b_distance") is not None:
        sorted_list = sorted(input_list, key=lambda x: abs(x["claim3b_distance"]))
        sorted_list = [x["value"] for x in sorted_list]
        based_on = "claim3b_distance"
        
    else:
        return out_list, based_on
    
    for i in range(len(sorted_list)):
        if i>=topk:
            break
        out_list[i] = sorted_list[i]
    return out_list, based_on


def convertToCSV(input_data):
    df_result = []
    
    df_cols = ["paper_id", "Number_of_Studies", "Number_of_Models"]
    for i in range(1, 6):
        df_cols.append("Effect_Size_" + str(i))
    for i in range(1, 6):
        df_cols.append("P_Values_" + str(i))
    for i in range(1, 6):
        df_cols.append("Model_Names_" + str(i))
    for i in range(1, 6):
        df_cols.append("Sample_Sizes_" + str(i))
    df_cols.append("sorting_order_based_on")
    
    for cur_pid, cur_val in input_data.items():
        num_studies = cur_val["Validity_of_Inference"]["Number_of_Studies"]
        num_models = cur_val["Validity_of_Inference"]["Number_of_Models"]
        
        effect_size_list, based_on = getList(cur_val["Validity_of_Inference"]["Effect_Size"])
        pv_list, based_on = getList(cur_val["Validity_of_Inference"]["P_Values"])
        model_names_list, based_on = getList(cur_val["Validity_of_Inference"]["Model_Names"])
        ss_list, based_on = getList(cur_val["Design_Quality"]["Sample_Sizes"])
        
        cur_row = [cur_pid, num_studies, num_models]
        cur_row += effect_size_list
        cur_row += pv_list
        cur_row += model_names_list
        cur_row += ss_list
        cur_row += [based_on]
        
        df_result.append(cur_row)
    
    df_res = pd.DataFrame(data=df_result, columns=df_cols)
    df_res = df_res[df_res["sorting_order_based_on"].notnull()]
    return df_res

In [122]:
feat_path = "feature_extractions_for_reproducible_papers_TA2.json"
with open(feat_path, "r") as f:
    feat_data = json.load(f)
    
feat_path_csv = feat_path.split(".")[0] + ".csv"
df_feat = convertToCSV(feat_data)
df_feat.to_csv(feat_path_csv, index=False)
print("For {} reproducible TA2 papers, count of non-null values in each columns".format(df_feat.shape[0]))
df_feat.count()

For 216 reproducible TA2 papers, count of non-null values in each columns


paper_id                  216
Number_of_Studies           6
Number_of_Models           32
Effect_Size_1              41
Effect_Size_2              30
Effect_Size_3              18
Effect_Size_4              13
Effect_Size_5              11
P_Values_1                125
P_Values_2                107
P_Values_3                 97
P_Values_4                 88
P_Values_5                 82
Model_Names_1             215
Model_Names_2             214
Model_Names_3             211
Model_Names_4             203
Model_Names_5             187
Sample_Sizes_1            216
Sample_Sizes_2            205
Sample_Sizes_3            192
Sample_Sizes_4            175
Sample_Sizes_5            159
sorting_order_based_on    216
dtype: int64

In [123]:
feat_path = "feature_extractions_for_non_reproducible_papers_TA2.json"
with open(feat_path, "r") as f:
    feat_data = json.load(f)
    
feat_path_csv = feat_path.split(".")[0] + ".csv"
df_feat = convertToCSV(feat_data)
df_feat.to_csv(feat_path_csv, index=False)
print("For {} non-reproducible TA2 papers, count of non-null values in each columns".format(df_feat.shape[0]))
df_feat.count()

For 248 non-reproducible TA2 papers, count of non-null values in each columns


paper_id                  248
Number_of_Studies           8
Number_of_Models           27
Effect_Size_1              74
Effect_Size_2              52
Effect_Size_3              36
Effect_Size_4              25
Effect_Size_5              19
P_Values_1                205
P_Values_2                198
P_Values_3                189
P_Values_4                181
P_Values_5                171
Model_Names_1             247
Model_Names_2             238
Model_Names_3             231
Model_Names_4             224
Model_Names_5             211
Sample_Sizes_1            248
Sample_Sizes_2            234
Sample_Sizes_3            212
Sample_Sizes_4            195
Sample_Sizes_5            172
sorting_order_based_on    248
dtype: int64

In [124]:
feat_path = "feature_extractions_for_all_papers_TA2.json"
with open(feat_path, "r") as f:
    feat_data = json.load(f)
    
feat_path_csv = feat_path.split(".")[0] + ".csv"
df_feat = convertToCSV(feat_data)
df_feat.to_csv(feat_path_csv, index=False)
print("For {} all TA2 papers, count of non-null values in each columns".format(df_feat.shape[0]))
df_feat.count()

For 1245 all TA2 papers, count of non-null values in each columns


paper_id                  1245
Number_of_Studies           26
Number_of_Models           136
Effect_Size_1              273
Effect_Size_2              190
Effect_Size_3              128
Effect_Size_4               92
Effect_Size_5               66
P_Values_1                 852
P_Values_2                 788
P_Values_3                 733
P_Values_4                 674
P_Values_5                 631
Model_Names_1             1241
Model_Names_2             1223
Model_Names_3             1196
Model_Names_4             1129
Model_Names_5             1046
Sample_Sizes_1            1245
Sample_Sizes_2            1187
Sample_Sizes_3            1106
Sample_Sizes_4            1009
Sample_Sizes_5             892
sorting_order_based_on    1245
dtype: int64

### For RPP data:

In [1]:
inp_path = "../danny_submission/feature_extractions_RPP/RPP_feature_extraction_results.json"
rpp_with_labels_path = ""

In [3]:
with open(inp_path, "r") as f:
    rpp_feat_data = json.load(f)

In [19]:
rpp_csv = convertToCSV(rpp_feat_data)

In [20]:
print(rpp_csv.shape)

(0, 24)


### For TA1 data:

In [39]:
feat_path = "TA1_feature_extraction_results.json"
with open(feat_path, "r") as f:
    feat_data = json.load(f)
    
feat_path_csv = feat_path.split(".")[0] + ".csv"
df_feat = convertToCSV(feat_data)
df_feat.to_csv(feat_path_csv, index=False)
print("For {} all TA1 papers, count of non-null values in each columns".format(df_feat.shape[0]))
df_feat.count()

For 582 all TA1 papers, count of non-null values in each columns


paper_id                  582
Number_of_Studies          13
Number_of_Models           66
Effect_Size_1             122
Effect_Size_2              87
Effect_Size_3              57
Effect_Size_4              39
Effect_Size_5              27
P_Values_1                417
P_Values_2                383
P_Values_3                359
P_Values_4                327
P_Values_5                302
Model_Names_1             581
Model_Names_2             573
Model_Names_3             563
Model_Names_4             536
Model_Names_5             497
Sample_Sizes_1            582
Sample_Sizes_2            552
Sample_Sizes_3            518
Sample_Sizes_4            474
Sample_Sizes_5            413
sorting_order_based_on    582
dtype: int64

## Get full paper data:

In [284]:
paper_data = "../segment_data/TA2_classify_data_final_with_folds.json"

In [285]:
df_data_full = pd.read_json(paper_data)

In [289]:
pid_claims_map = {}
for idx, cur_row in df_data_full.iterrows():
    try:
        cur_pid = cur_row["DOI_CR"]
        cur_claims = [cur_row["coded_claim2"], cur_row["coded_claim3a"], cur_row["coded_claim3b"], cur_row["coded_claim4"]]
        pid_claims_map[cur_pid] = cur_claims
    except e:
        print(e)

## Important Segment data analysis:

In [28]:
inp_seg_path = "../segment_data/TA2_classify_data_final_with_imp_claims_only.json"

In [29]:
df = pd.read_json(inp_seg_path)

In [30]:
df["label"].value_counts()

0    493
1    393
Name: label, dtype: int64

In [31]:
print(df.shape)
print(df.columns)

pid_seg_idx_map = {}
pid_label_map = {}
for idx, cr in df.iterrows():
    pid_seg_idx_map[cr["paper_id"]] = [cr["important_segment_idx"], cr["important_segment"]]
    pid_label_map[cr["paper_id"]] = cr["label"]

(886, 5)
Index(['paper_id', 'important_segment', 'important_segment_idx', 'label',
       'Fold_Id'],
      dtype='object')
