In [240]:
import pandas as pd
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
import json

## Get full paper data:

In [284]:
# Location of the full paper dataset (JSON), relative to the notebook's working directory.
paper_data = "../segment_data/TA2_classify_data_final_with_folds.json"

In [285]:
# Load the full paper dataset into a DataFrame.
df_data_full = pd.read_json(paper_data)

In [289]:
# Map each paper's DOI ("DOI_CR") to its four coded claim texts, in the order
# claim2, claim3a, claim3b, claim4.
claim_columns = ["coded_claim2", "coded_claim3a", "coded_claim3b", "coded_claim4"]

pid_claims_map = {}
for idx, cur_row in df_data_full.iterrows():
    try:
        cur_pid = cur_row["DOI_CR"]
        pid_claims_map[cur_pid] = [cur_row[col] for col in claim_columns]
    except KeyError as e:
        # A missing column surfaces as KeyError on the row lookup. The handler
        # must bind the exception with `as`; a bare `except e:` would try to
        # evaluate the undefined name `e` and raise NameError instead.
        print(e)

## Important Segment data analysis:

In [255]:
# Location of the important-segment dataset (JSON), relative to the notebook's working directory.
inp_seg_path = "../segment_data/TA2_classify_data_final_with_imp_claims_only.json"

In [256]:
# Load the important-segment dataset into a DataFrame.
df = pd.read_json(inp_seg_path)

In [257]:
# Number of rows per value of the "label" column.
df["label"].value_counts()

0    493
1    393
Name: label, dtype: int64

In [268]:
# Quick look at the frame's dimensions and column names.
print(df.shape)
print(df.columns)

# paper_id -> [important_segment_idx, important_segment].
# If a paper_id occurs on several rows, the last row wins.
pid_seg_idx_map = {
    seg_row["paper_id"]: [seg_row["important_segment_idx"], seg_row["important_segment"]]
    for _, seg_row in df.iterrows()
}

(886, 5)
Index(['paper_id', 'important_segment', 'important_segment_idx', 'label',
       'Fold_Id'],
      dtype='object')


In [259]:
# Count how many rows have each segment index as their important segment.
# (Build the counts from df_11 or df_00 instead of df to analyse those subsets.)
result = dict(df["important_segment_idx"].value_counts())

# Segment indices 0-3 that never occur get an explicit zero count.
for seg_idx in range(4):
    if result.get(seg_idx) is None:
        result[seg_idx] = 0

# Re-key the counts by claim name: index 0, 1, 2, 3 -> claim2, claim3a, claim3b, claim4.
claim_names = ("claim2", "claim3a", "claim3b", "claim4")
out_result = {name: result[pos] for pos, name in enumerate(claim_names)}

In [260]:
# Show the per-claim counts, then their total.
print(out_result)
print(sum(out_result.values()))

{'claim2': 0, 'claim3a': 69, 'claim3b': 45, 'claim4': 772}
886


In [None]:
# Bar chart of how often each claim type was selected as the important segment.
# The dict views are materialised as lists so the bar positions and heights are
# plain sequences.
claim_names = list(out_result.keys())
claim_counts = list(out_result.values())

# Derive the paper count from the data so the title stays correct when
# `result` is rebuilt from a subset (e.g. df_11 or df_00).
n_papers = sum(claim_counts)

plt.figure(figsize=(15, 9))
plt.bar(claim_names, claim_counts)
plt.title("Important claim distribution among {} TA2 papers".format(n_papers))
# plt.title("Important claim distribution among 393 TA2 papers where both label & predicted label = 0")
plt.xlabel("Claim type")
plt.ylabel("Number of papers")
plt.show()

## Phrase-level extraction data analysis:

In [271]:
# Locations of the phrase-level extraction results for the train and dev splits.
train_extraction_path = "../repr_claims_results/socrepr_claims_train.json"
dev_extraction_path = "../repr_claims_results/socrepr_claims_dev.json"

In [272]:
# Parse the train-split extraction results.
with open(train_extraction_path, "r") as train_file:
    train_res = json.load(train_file)

# Parse the dev-split extraction results.
with open(dev_extraction_path, "r") as dev_file:
    dev_res = json.load(dev_file)

In [273]:
# Pool the train and dev extraction records (train first) into one table.
# `+` builds a new list, so neither input list is modified.
res_full = train_res + dev_res
df_res = pd.DataFrame(res_full)

In [274]:
# Attach each paper's important-segment index and text by looking its paper_id
# up in pid_seg_idx_map, whose values are [segment_idx, segment_text].
# Series.map works on the single paper_id column instead of materialising every
# full row as apply(axis=1) does; a paper_id missing from the map still raises
# KeyError.
df_res["important_segment_idx"] = df_res["paper_id"].map(lambda pid: pid_seg_idx_map[pid][0])
df_res["important_segment"] = df_res["paper_id"].map(lambda pid: pid_seg_idx_map[pid][1])

In [275]:
# Dimensions and column names of the combined extraction table.
print(df_res.shape)
print(df_res.columns)

(845, 6)
Index(['paper_id', 'label', 'predicted_label', 'important_segment',
       'important_phrases', 'important_segment_idx'],
      dtype='object')


In [276]:
# Boolean masks for the true and predicted labels.
label_is_1 = df_res["label"] == 1
label_is_0 = df_res["label"] == 0
pred_is_1 = df_res["predicted_label"] == 1
pred_is_0 = df_res["predicted_label"] == 0

# Rows where the prediction agrees with the label, split by label value.
df_11 = df_res[label_is_1 & pred_is_1]
df_00 = df_res[label_is_0 & pred_is_0]

In [298]:
# def getMostImportantPhraseListTemp(cur_df, claim_idx):
#     cur_df_filt = cur_df[cur_df["important_segment_idx"] == claim_idx]
#     phrase_list = []
#     phrase_list2 = []
#     l_t = 100
#     for idx, cur_row in cur_df_filt.iterrows():
#         cur_pid = cur_row["paper_id"]
#         cur_phrase = cur_row["important_phrases"][0][1]
#         cur_imp_seg = cur_row["important_segment"]
#         if len(cur_imp_seg) <= l_t:
#             phrase_list2.append([cur_imp_seg, cur_phrase, cur_pid])
            
#         if cur_pid == "10.1177/2167702612472884":
#             for i in cur_row["important_phrases"]:
#                 print(i, "\n")
        
#         phrase_list.append(cur_phrase)

#     for i in phrase_list:
#         print(i, "\n")
        
#     for i in phrase_list2:
#         print(i, "\n")


def getMostImportantPhraseList(cur_df, claim_idx, debug_pid="10.1007/s13524-016-0487-5"):
    """Collect the first-listed important phrase of every paper whose important segment is `claim_idx`.

    Args:
        cur_df: DataFrame with "paper_id", "important_segment_idx" and
            "important_phrases" columns. Each "important_phrases" value is a
            sequence of [score, phrase_text] pairs (presumably sorted by
            descending score -- TODO confirm against the extraction step).
        claim_idx: value of "important_segment_idx" selecting the rows to keep.
        debug_pid: paper_id whose complete phrase list is printed, one pair per
            line, for manual inspection. Pass None to print nothing. The
            default is the paper this notebook inspects, so existing calls
            behave as before.

    Returns:
        A list with the text of the first phrase of each selected paper, in
        row order. Papers whose phrase list is empty or missing are skipped
        rather than raising IndexError.
    """
    cur_df_filt = cur_df[cur_df["important_segment_idx"] == claim_idx]
    phrase_list = []

    for cur_pid, cur_phrases in zip(cur_df_filt["paper_id"], cur_df_filt["important_phrases"]):
        # Nothing to extract when no phrases were produced for this paper
        # (empty list, or a non-sequence such as NaN for a missing value).
        if not isinstance(cur_phrases, (list, tuple)) or len(cur_phrases) == 0:
            continue

        # Each entry is [score, phrase_text]; keep the text of the first one.
        phrase_list.append(cur_phrases[0][1])

        if debug_pid is not None and cur_pid == debug_pid:
            for scored_phrase in cur_phrases:
                print(scored_phrase)

    return phrase_list

In [299]:
# First-listed important phrase of every df_11 paper whose important segment index is 3.
phrase_list = getMostImportantPhraseList(df_11, 3)

[8.981502, 'a statistically significant improvement']
[6.234601, 'statistically significant']
[5.839654, 'statistically']
[4.1362, 'the result was a statistically significant improvement in model fit - lrb - chi - square = 1 , 217 , df = 11 , p = . 001 .']
[3.509991, 'a statistically significant improvement in model fit - lrb - chi - square = 1 , 217 , df = 11 , p = . 001']
[3.083767, 'was a statistically significant improvement in model fit - lrb - chi - square = 1 , 217 , df = 11 , p = . 001']
[3.033817, 'significant']
[2.040396, 'improvement']
[1.42845, 'the result']
[1.403977, '. 001']
[1.298188, 'chi - square = 1 , 217']
[1.286927, '= . 001']
[1.279396, '. 001']
[0.844449, 'result']
[0.496762, '= 1 , 217']
[0.459804, 'fit']
[0.297108, '1 , 217']
[0.247735, 'model']
[0.239056, '1 , 217']
[0.206529, 'df']
[0.197803, 'chi - square']
[0.182726, '= 11']
[0.172515, '.']
[0.150446, 'df = 11 , p = . 001']
[0.129126, '=']
[0.128099, 'square']
[0.106376, 'df']
[0.041876, '=']
[0.039031, '11

In [295]:
# Print the stored claim texts of one paper; the extra "\n" argument leaves a
# blank line after each claim.
for claim_text in pid_claims_map["10.1007/s13524-016-0487-5"]:
    print(claim_text, "\n")

We find that ethnicity is the main basis of local residential sorting, while occupational standing and, to a lesser degree, family life cycle and nativity also are significant. 

We selected the claim that in Newark, NJ in 1880, people lived near similar people at a very local (street segment) scale.  This was particularly the case when similarity is defined along race/ethnic lines.  

The authors test the claim using a multivariate analysis based on discrete choice models for residential location. The discrete choice models were based on a random sample of 2,894 persons living on1,442 street segments.  Street segment and person level variables were grouped into four categories (Occupation, Nativity, Family, and Race/ethnicity) and entered in a stepwise fashion.  The Race/ethnicity category was entered last.  The focal test result concerns the marginal improvement to model fit associated with entering the Race/Ethnicity variables.  

The result was a statistically significant improveme

## Feature value distribution in important phrases:

In [249]:
def getNum(input_str):
    """Parse the first number found in a space-tokenised string.

    Leading non-numeric characters are skipped and spaces are ignored, so a
    tokenised fragment such as "= . 001" yields 0.001. Scanning stops at the
    first other character after the number has started, or at a second decimal
    point, so a sentence-final period ("= . 001 .") no longer corrupts the
    number.

    Args:
        input_str: text to scan.

    Returns:
        The parsed value as a float, or -1 when no valid number is found.
    """
    num_chars = []
    seen_point = False

    for c_char in input_str:
        if c_char.isdigit():
            num_chars.append(c_char)
        elif c_char == ".":
            if seen_point:
                # A second "." cannot belong to the same number.
                break
            seen_point = True
            num_chars.append(c_char)
        elif c_char == " ":
            # Tokenised text puts spaces inside numbers (". 001"); ignore them.
            continue
        elif num_chars:
            # Any other character ends the number once it has started.
            break

    try:
        return float("".join(num_chars))
    except ValueError:
        # No digits collected, or only a lone "." -- not a number.
        return -1
            

def getFeatureList(input_phrase_list):
    """Extract the values that follow a standalone "p" token in each phrase.

    For a phrase containing " p ", the text after its first occurrence is
    parsed; otherwise, for a phrase starting with "p ", the text after that
    prefix is parsed. Phrases with neither are ignored.

    Args:
        input_phrase_list: iterable of phrase strings.

    Returns:
        List of parsed values that are valid (getNum did not return -1) and
        strictly below 1, in phrase order.
    """
    pv_list = []

    for cur_phrase in input_phrase_list:
        marker_at = cur_phrase.find(" p ")
        if marker_at >= 0:
            # Skip past the three characters of " p ".
            remainder = cur_phrase[marker_at + 3:]
        elif cur_phrase.startswith("p "):
            # Skip past the leading "p ".
            remainder = cur_phrase[2:]
        else:
            continue

        cur_num = getNum(remainder)
        if cur_num == -1 or cur_num >= 1:
            continue
        pv_list.append(cur_num)

    return pv_list

In [250]:
# Values following a "p" token in the first-listed phrase of df_11 papers with important segment index 3.
pv_list1 = getFeatureList(getMostImportantPhraseList(df_11, 3))

In [251]:
# Values following a "p" token in the first-listed phrase of df_00 papers with important segment index 3.
pv_list2 = getFeatureList(getMostImportantPhraseList(df_00, 3))

In [252]:
def plotPvalues(cur_pv_list, cur_title):
    """Show a histogram of p-values on a logarithmic x axis.

    Args:
        cur_pv_list: sequence of p-values to plot.
        cur_title: title displayed above the figure.
    """
    # Ten logarithmically spaced bin edges between 1e-4 and 1.
    bin_edges = np.logspace(np.log10(0.0001), np.log10(1.0), 10)

    plt.figure(figsize=(15, 9))
    plt.hist(cur_pv_list, bins=bin_edges)
    plt.gca().set_xscale("log")
    plt.title(cur_title)
    plt.xlabel("P-value")
    plt.ylabel("Count")
    plt.show()

In [None]:
# Histogram of the values collected in pv_list1.
plotPvalues(pv_list1, "P-value distribution for p-values extracted from the most important phrase of reproducible papers")

In [None]:
# Histogram of the values collected in pv_list2.
plotPvalues(pv_list2, "P-value distribution for p-values extracted from the most important phrase of non-reproducible papers")

## Visualize the parsed trees:

In [96]:
from nltk.tree import Tree
from IPython.display import display

In [117]:
# Location of the CSV holding the parsed trees, relative to the notebook's working directory.
inp_tree_path = "../repr_claims/trees/repr_claims_trees_data.csv"

In [118]:
# Load the parsed-tree CSV into a DataFrame.
df_trees = pd.read_csv(inp_tree_path)

In [119]:
# paper_id -> parsed tree string of that paper's important segment.
# If a paper_id occurs on several rows, the last row wins.
pid_tree_map = dict(
    zip(df_trees["paper_id"], df_trees["important_segment_parsed_tree"])
)

In [120]:
def plotTree(cur_pid):
    """Display the parsed tree stored for the given paper.

    Args:
        cur_pid: key into pid_tree_map; raises KeyError if absent.
    """
    # Parse the stored bracketed string into an nltk Tree and hand it to
    # IPython's display.
    display(Tree.fromstring(pid_tree_map[cur_pid]))

In [None]:
# Display the parsed tree stored for this paper_id.
plotTree("10.1177/2167702612472884")

In [203]:
# Locations of the train and dev splits.
train_path = "../repr_claims/train.csv"
dev_path = "../repr_claims/dev.csv"

df_train = pd.read_csv(train_path)
df_dev = pd.read_csv(dev_path)

# Report the label counts and frame shape of each split, train first.
for split_df in (df_train, df_dev):
    print(split_df["label"].value_counts(), split_df.shape)

0    388
1    320
Name: label, dtype: int64 (708, 4)
0    105
1     73
Name: label, dtype: int64 (178, 4)
