In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os

%matplotlib inline

In [21]:
# Locations of the two JSON inputs; both live under the local data directory.
DATA_DIR = "data"
seg_path = DATA_DIR + "/TA2_classify_data_final_with_imp_sentences_bilstm_only.json"
full_data_path = DATA_DIR + "/ta2_classify_folds/fold_full/data.json"

In [22]:
# Read the data

# Load the JSON file at seg_path. The cells below index the result by
# position and read a "paper_id" key from each entry.
with open(seg_path, "r") as f:
    seg_data = json.load(f)

# Report how many entries were loaded. This must reference seg_data: no
# variable called `data` is defined by the preceding cells, so printing
# len(data) raises NameError on a fresh kernel.
print(len(seg_data))


# Load the JSON file at full_data_path and report its size for comparison.
with open(full_data_path, "r") as f:
    full_data = json.load(f)

print(len(full_data))

886
886


## Split the important phrase extractions into reproducible vs. non-reproducible:

In [14]:
# The two phrase-extraction JSON files (v1 and v2) share one directory.
PHRASE_DIR = "data/ta2_important_sentences_phrases"
phrase_path1 = PHRASE_DIR + "/socrepr_sentences_full_v1.json"
phrase_path2 = PHRASE_DIR + "/socrepr_sentences_full_v2.json"

In [15]:
# Read both phrase files in order and concatenate their records into a
# single list (entries of phrase_path1 first, then phrase_path2).
phrase_data = []
for cur_path in (phrase_path1, phrase_path2):
    with open(cur_path, "r") as f:
        phrase_data += json.load(f)

In [29]:
# Create the paper_id and segment heading map:

# Index the two segment fields of every seg_data entry by its paper_id.
# If a paper_id occurs more than once, the last entry wins.
seg_data_map = {
    record["paper_id"]: {
        "important_segment_idx": record["important_segment_idx"],
        "important_section_heading_or_idx": record["important_section_heading_or_idx"],
    }
    for record in seg_data
}

In [32]:
# Merge the seg data with phrase data:

# Copy the segment fields of the matching paper onto every phrase record.
# Records are updated in place; papers without a match are only reported.
for record in phrase_data:
    paper_id = record["paper_id"]
    segment_fields = seg_data_map.get(paper_id)
    if segment_fields is None:
        print("Error for ", paper_id)
        continue
    record.update(segment_fields)

In [38]:
# Separate into reprod vs non-reprod:

def seperateData(cur_phrase_data):
    """Split records into the two groups whose prediction matches the label.

    A record goes to the first returned list when both ``label`` and
    ``predicted_label`` equal 1, and to the second when both equal 0.
    Every other record (including all misclassified ones) is dropped.
    The sizes of both groups are printed before returning.

    Args:
        cur_phrase_data: sequence of dicts with ``label`` and
            ``predicted_label`` keys.

    Returns:
        Tuple ``(reprod, non_reprod)`` of lists holding the original
        record objects in input order.
    """
    reprod, non_reprod = [], []

    for record in cur_phrase_data:
        label = record["label"]
        # predicted_label is only looked up once the label itself qualifies.
        if label == 0:
            if record["predicted_label"] == 0:
                non_reprod.append(record)
        elif label == 1:
            if record["predicted_label"] == 1:
                reprod.append(record)

    print(len(reprod), len(non_reprod))
    return reprod, non_reprod

In [39]:
# Split the merged phrase records; only records whose predicted_label equals label are kept.
reprod_phrase_data, non_reprod_phrase_data = seperateData(phrase_data)

314 377


In [42]:
# Save the data splits:

# Output file names: a split-specific prefix followed by the shared base name.
base_path = "TA2_phrase_level_extractions_from_overall_paper_content.json"
reprod_path = "reproducibility_papers_" + base_path
non_reprod_path = "non_reproducibility_papers_" + base_path
full_path = "all_papers_" + base_path

# Write each split as indented JSON into the current working directory.
for out_path, payload in (
    (reprod_path, reprod_phrase_data),
    (non_reprod_path, non_reprod_phrase_data),
    (full_path, phrase_data),
):
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=2)

In [125]:
# Filter the phrase data:

def filterPhrases(cur_phrase_list, topk=3, length_ratio=0.80):
    """Return up to ``topk`` short, unique phrases, preserving input order.

    A phrase is dropped when its whitespace-separated word count is greater
    than or equal to ``length_ratio`` times the word count of the longest
    phrase in the list, or when its text has already been kept.

    NOTE: because the comparison is ``>=``, lists whose phrases all have a
    similar length (for example a single-phrase list) come back empty with
    the default ratio.

    Args:
        cur_phrase_list: sequence of indexable entries whose element ``[1]``
            is the phrase text. Element ``[0]`` is not inspected here
            (presumably a score; confirm against the producer).
        topk: maximum number of phrases to return. Values ``<= 0`` yield an
            empty list (previously this tripped the trailing assertion).
        length_ratio: fraction of the longest phrase's word count at or
            above which a phrase counts as too long.

    Returns:
        A new list with at most ``topk`` of the original entries.
    """
    if topk <= 0:
        return []

    # Count the words once per phrase instead of re-splitting in every pass.
    word_counts = [len(cur_phrase[1].split()) for cur_phrase in cur_phrase_list]
    # default=-1 mirrors the original sentinel used for an empty list.
    phrase_length_limit = length_ratio * max(word_counts, default=-1)

    new_phrase_list = []
    seen_texts = set()
    for cur_phrase, n_words in zip(cur_phrase_list, word_counts):
        # Remove long phrases
        if n_words >= phrase_length_limit:
            continue

        # Remove duplicates
        text = cur_phrase[1]
        if text in seen_texts:
            continue
        seen_texts.add(text)

        new_phrase_list.append(cur_phrase)
        if len(new_phrase_list) >= topk:
            break

    return new_phrase_list


def filterData(cur_phrase_data, topk=3):
    """Return shallow copies of the records with filtered phrase lists.

    Every record is copied key by key (the input dicts are not modified) and
    its ``important_phrases`` entry is replaced by the result of
    ``filterPhrases`` called with the same ``topk``.

    Args:
        cur_phrase_data: sequence of dicts, each with an
            ``important_phrases`` key.
        topk: forwarded to ``filterPhrases``.

    Returns:
        A new list of new dicts, in input order.
    """
    return [
        {
            **record,
            "important_phrases": filterPhrases(record["important_phrases"], topk=topk),
        }
        for record in cur_phrase_data
    ]

In [126]:
# Build filtered copies of all phrase records (default topk=3); phrase_data itself is left untouched.
phrase_data_filtered = filterData(phrase_data)

In [127]:
# Split the filtered records the same way as before.
# NOTE: this rebinds reprod_phrase_data / non_reprod_phrase_data, replacing the unfiltered split.
reprod_phrase_data, non_reprod_phrase_data = seperateData(phrase_data_filtered)

314 377


In [128]:
# Save the data splits:

# Output file names for the filtered data: split prefix + shared base name.
base_path = "filtered_TA2_phrase_level_extractions_from_overall_paper_content.json"
reprod_path = "reproducibility_papers_" + base_path
non_reprod_path = "non_reproducibility_papers_" + base_path
full_path = "all_papers_" + base_path

# Write each filtered split as indented JSON into the current working directory.
for out_path, payload in (
    (reprod_path, reprod_phrase_data),
    (non_reprod_path, non_reprod_phrase_data),
    (full_path, phrase_data_filtered),
):
    with open(out_path, "w") as f:
        json.dump(payload, f, indent=2)

## Get important segment distributions:

In [None]:
def encode_paper_sentence_wise(self, content, min_sentence_tokens=7):
    """Embed a paper sentence by sentence and stack the resulting vectors.

    For every section with non-empty ``text`` the text is split into
    sentences with ``nltk.sent_tokenize``. Each sentence is split on
    whitespace, truncated to ``self.window_size`` tokens, and skipped when
    fewer than ``min_sentence_tokens`` tokens remain. The remaining
    sentences are encoded with ``self.lm_tokenizer`` and passed through
    ``self.lm_embeddings_model`` one at a time. The vector at position 0 of
    the second axis of ``outputs[0]`` is kept (presumably the [CLS] token,
    judging by the variable name; confirm against the model in use).

    NOTE(review): this function relies on ``nltk`` and ``torch`` being
    available in the enclosing namespace; the visible import cell does not
    import either of them.

    Args:
        self: object providing ``device``, ``window_size``, ``lm_tokenizer``
            and ``lm_embeddings_model``.
        content: iterable of sections, each a mapping with a ``'text'`` entry.
        min_sentence_tokens: minimum number of whitespace tokens a sentence
            needs in order to be embedded (default 7, the former constant).

    Returns:
        Tuple ``(final_embs, n)``: the stacked per-sentence embeddings
        (detached, on CPU) and the number of embedded sentences.

    Raises:
        RuntimeError: if no sentence produced an embedding.
    """
    device = self.device
    embeddings = []
    for cur_section in content:
        if len(cur_section['text']) == 0:
            continue
        for sentence in nltk.sent_tokenize(cur_section['text']):
            # Cut over-long sentences down to the window size; slicing leaves
            # shorter sentences unchanged.
            cur_sentence_tokens = sentence.split()[:self.window_size]

            # Filter out sentences with only urls/citations etc.
            if len(cur_sentence_tokens) < min_sentence_tokens:
                continue

            cur_sentence = " ".join(cur_sentence_tokens)
            input_ids = torch.tensor(self.lm_tokenizer.encode(cur_sentence)).unsqueeze(0)  # Batch size 1
            try:
                # Pure inference: the result is detached below, so skip
                # building the autograd graph in the first place.
                with torch.no_grad():
                    input_ids = input_ids.to(device)
                    outputs = self.lm_embeddings_model(input_ids)
            except Exception as e:
                # Best effort: report the failure and move on to the next sentence.
                print(e)
                continue
            cls_embedding = outputs[0][:, 0, :].squeeze().detach().cpu()
            # Drop the references right away so the tensors can be released.
            del outputs
            del input_ids
            embeddings.append(cls_embedding)

    if not embeddings:
        # torch.stack would raise the same exception type with a less helpful message.
        raise RuntimeError(
            "encode_paper_sentence_wise: no sentence could be embedded "
            "(all sections empty, all sentences too short, or every forward pass failed)"
        )

    final_embs = torch.stack(embeddings)
    return final_embs, len(embeddings)