# Prepare the information sheet for the final manually read article list

In [1]:
# import packages
import pandas as pd
import numpy as np
import os
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [2]:
# import nltk
# nltk.download('punkt')

In [3]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import parameters as params
import dataframe_columns as df_col



## Predefined functions:

In [4]:
def complete_sent(sent):
    """Return `sent` guaranteed to end with sentence-final punctuation.

    A period is appended unless the sentence already ends with '.', '?'
    or '!'. An empty string therefore becomes '.'.
    """
    terminators = ('.', '?', '!')
    return sent if sent.endswith(terminators) else sent + '.'

In [5]:
def extract_sent_from_text(text, kw_group):
    """Collect the sentences of `text` that mention any keyword in `kw_group`.

    The text is passed through ``plib.process_text(text, lower=True)`` and
    split into sentences with ``sent_tokenize``. Each sentence is terminated
    with ``complete_sent`` and kept (once, in order of first appearance) if
    at least one lowercased keyword matches it:

    * keywords listed in ``params.exact_match_kw_list`` must equal a whole
      token produced by ``word_tokenize``;
    * every other keyword matches as a plain substring of the sentence.

    Args:
        text: Raw text, or a missing value (NaN) / empty string.
        kw_group: Iterable of keyword strings.

    Returns:
        'NA' when `text` is NaN or empty; otherwise the matched sentences
        joined by single spaces (an empty string when nothing matched).
    """
    # NaN is the only value that differs from itself, so this catches
    # missing values coming from a pandas column
    if text != text or text == "":
        return 'NA'

    # preprocess text
    text = plib.process_text(text, lower=True)

    # lowercase the keywords once instead of once per sentence
    keywords = [keyword.lower() for keyword in kw_group]

    sents = []      # matched sentences, in order of first appearance
    seen = set()    # same sentences, for O(1) duplicate checks

    for sent in sent_tokenize(text):
        compl_sent = complete_sent(sent)

        # if the sentence has already been added, skip it
        if compl_sent in seen:
            continue

        words = None  # tokenized lazily, at most once per sentence
        for keyword in keywords:
            if keyword in params.exact_match_kw_list:
                # exact match: the keyword must be a whole token
                if words is None:
                    words = word_tokenize(compl_sent)
                matched = keyword in words
            else:
                # fuzzy match: the keyword may appear anywhere in the sentence
                matched = keyword in compl_sent

            if matched:
                sents.append(compl_sent)
                seen.add(compl_sent)
                break

    # join the matched sentences into one string
    return ' '.join(sents).strip()
# --------------------Start of test code--------------------
# text = "Effect of Attentive Fixation in macaque thalamus and Cortex. D. B. BENDER AND M. YOUAKIM Department of Physiology and Biophysics, School of Medicine and Biomedical Sciences, University at Buffalo, State University of New York, Buffalo, New York 14214 Received 29 December 1999; accepted in final form 21 September 2000 Bender, D. B. and M. Youakim. Effect of attentive fixation in macaque thalamus and cortex. J Neurophysiol 85: 219234, 2001. Attentional modulation of neuronal responsiveness is common in many areas of visual cortex. We examined whether attentional modulation in the visual thalamus was quantitatively similar to that in cortex. Identical procedures and apparatus were used to compare attentional modulation of single neurons in seven different areas of the visual system: the lateral geniculate, three visual subdivisions of the pulvinar [inferior, lateral, dorsomedial part of lateral pulvinar (Pdm)], and three areas of extrastriate cortex representing early, intermediate, and late stages of cortical processing (V2, V4/PM, area 7a). A simple fixation task controlled transitions among three attentive states. The animal waited for a fixation point to appear (ready state), fixated the point until it dimmed (fixation state), and then waited idly to begin the next trial (idle state). Attentional modulation was estimated by flashing an identical, irrelevant stimulus in a neurons receptive field during each of the three states; the three responses defined a response vector whose deviation from the line of equal response in all three states (the main diagonal) indicated the character and magnitude of attentional modulation. Attentional modulation was present in all visual areas except the lateral geniculate, indicating that modulation was of central origin. Prevalence of modulation was modest (26%) in pulvinar, and increased from 21% in V2 to 43% in 7a. 
Modulation had a push-pull character (as many cells facilitated as suppressed) with respect to the fixation state in all areas except Pdm where all cells were suppressed during fixation. The absolute magnitude of attentional modulation, measured by the angle between response vector and main diagonal expressed as a percent of the maximum possible angle, differed among brain areas. Magnitude of modulation was modest in the pulvinar (1926%), and increased from 22% in V2 to 41% in 7a. However, average trial-to-trial variability of response, measured by the coefficient of variation, also increased across brain areas so that its difference among areas accounted for more than 90% of the difference in modulation magnitude among areas. We also measured attentional modulation by the ratio of cell discharge due to attention divided by discharge variability. The resulting signal-tonoise ratio of attention was small and constant, 1.3 6 10%, across all areas of pulvinar and cortex. We conclude that the pulvinar, but not the lateral geniculate, is as strongly affected by attentional state as any area of visual cortex we studied and that attentional modulation amplitude is closely tied to intrinsic variability of response. INTRODUCTION It is now clear that attention can affect the responsiveness of neurons throughout visual cortex. Visually responsive cortex includes a number of distinct areas beyond striate cortex, or V1. Beginning with V2, these extrastriate areas are organized into two partially segregated, roughly hierarchical systems (reviews in Felleman and Van Essen 1991; Maunsell and Newsome 1987; Ungerleider and Mishkin 1982; Van Essen 1985). One includes dorsally located areas such as V3A, MT, and MST and leads into area 7a in the inferior parietal lobule. The other includes more ventrally located areas such as V4 and TEO and leads into area TE in the temporal lobe. 
Recordings from single neurons in many of these areas show that neuronal excitability depends on the animals attentive state (reviews in Colby 1991; Desimone and Duncan 1995; Lock and Bender 1999; Maunsell 1995; Motter 1998). Typically the effect of attention is modest: a small increase or decrease in magnitude of response to a visual stimulus relative to a control condition. Such modulation can be found at virtually every level of the cortical hierarchy, including V1. A variety of behavioral paradigms have been used to manipulate attention, and these show that the prevalence and magnitude of attentional modulation can depend substantially on both the behavioral paradigm and the cortical area in which its effects are measured. Furthermore factors such as task difficulty, the extent to which a task engages the functions of an area, and whether multiple stimuli compete for attention all can affect the modulation (Luck et al. 1997; Motter 1993; Richmond and Sato 1987). To what extent does the thalamus contribute to, or participate in, the attentional modulation that is so widespread throughout visual cortex? Three thalamic nuclei are closely interrelated with visual cortex: the lateral geniculate nucleus, the pulvinar, and the reticular nucleus of the thalamus. All have been thought to be involved in one form of attention or another (e.g., Guillery et al. 1998; Koch and Ullman 1985; Olshausen et al. 1993). The lateral geniculate projects almost exclusively to V1 with little or no output to extrastriate cortex. Layer 6 of both extrastriate and striate cortex project back to the geniculate, potentially modulating transmission through it. The pulvinar has at least three distinct visual subdivisions. The inferior (PI) and lateral pulvinar (PL) contain two separate visuotopic maps (Bender 1981). PI is driven by input from V1 (Bender 1983) but also receives input from extrastriate cortex and the superior colliculus. It projects mainly to V2, V3, V3A, and MT. 
PL likewise receives input from V1 and extrastriate cortex, but may have a particular affinity"
# kw_group = ['Cortex', 'Thalamus', 'Macaque']
# sents = extract_sent_from_text(text, kw_group)
# print(sents)
# ---------------------End of test code---------------------

In [6]:
def _read_text_if_exists(path):
    """Return the content of the text file at `path`, or "" if it does not exist."""
    if not os.path.exists(path):
        return ""
    # NOTE(review): the file is decoded as ASCII, so a non-ASCII byte raises
    # UnicodeDecodeError -- confirm the text files are always pure ASCII
    with open(path, "r", encoding='ascii') as f:
        return f.read()


def _fit_to_length(text, length):
    """Process `text`, then repeat/truncate it to `length` words.

    Returns "" when `text` holds no words, either before or after processing.
    """
    if not text.split():
        return ""

    words = plib.process_text(text, lower=True).split()
    if not words:
        # processing removed every word; repeating an empty text can never
        # reach the target length, so stop here instead of looping forever
        return ""

    # repeat the word sequence often enough to cover `length` words, then cut
    repeats = max(1, -(-length // len(words)))
    fitted = ' '.join((words * repeats)[:length])
    return plib.process_text(fitted, lower=True)


def get_texts(index, title, abstract, keywords):
    """Build the three text variants available for one article.

    Args:
        index: Article index; ``<index>.txt`` is looked up in
            ``fpath.text_folder`` and
            ``fpath.processed_texts_of_length_500_folder``.
        title, abstract, keywords: Strings, or NaN when missing.

    Returns:
        A tuple ``(text_tak, text_500, text_full)``:

        * text_tak: title + abstract + keywords, processed and then repeated
          or truncated to ``params.text_length_to_extract`` words;
        * text_500: the limited-length full text file, fitted the same way;
        * text_full: the processed full text file, at its original length.

        Each element is "" when its source is missing or holds no words.
    """
    txt_file_name = str(index) + ".txt"
    txt_path = os.path.join(fpath.text_folder, txt_file_name)
    txt_500_path = os.path.join(fpath.processed_texts_of_length_500_folder, txt_file_name)

    target_length = params.text_length_to_extract

    # from title, abstract, and keywords; NaN != NaN filters out missing fields
    text_tak = "".join(
        part + " " for part in (title, abstract, keywords) if part == part
    )
    text_tak = plib.process_text(text_tak, lower=True)
    text_tak = _fit_to_length(text_tak, target_length)

    # from limited length full text
    text_500 = _fit_to_length(_read_text_if_exists(txt_500_path), target_length)

    # from full text (processed, but neither padded nor truncated)
    text_full = _read_text_if_exists(txt_path)
    if text_full.split():
        text_full = plib.process_text(text_full, lower=True)
    else:
        text_full = ""

    return text_tak, text_500, text_full
# --------------------Start of test code--------------------
# index = 0
# title = "hello"
# abstract = "world"
# keywords ="!"
# text_tak, text_500, text_full = get_texts(index, title, abstract, keywords)
# ---------------------End of test code---------------------

In [7]:
def save_large_df(
    df,
    rows_per_split=1000,
    path_template='./datasets/literature_search_datasets/final_manually_read_csv_{}.csv',
):
    """Write `df` to disk as a series of tab-separated CSV files.

    The DataFrame is cut into consecutive chunks of at most `rows_per_split`
    rows; chunk number k (starting at 1) is written to
    ``path_template.format(k)`` with a header row and without the index.
    An empty DataFrame produces no files.

    Args:
        df: DataFrame to save.
        rows_per_split: Maximum number of rows per output file.
        path_template: Output path containing one ``{}`` placeholder for the
            1-based chunk number.

    Raises:
        ValueError: If `rows_per_split` is not positive.
    """
    if rows_per_split <= 0:
        raise ValueError("rows_per_split must be a positive integer")

    # iterate over the start row of each chunk; the last chunk may be shorter
    for part_number, start_index in enumerate(range(0, len(df), rows_per_split), start=1):
        split_df = df.iloc[start_index:start_index + rows_per_split]
        split_df.to_csv(path_template.format(part_number), sep='\t', index=False, header=True)

In [8]:
def extract_sents_and_record(input_path, output_path):
    """Add keyword-matched sentences to every article row and save the result.

    Reads the tab-separated article list at `input_path`. For each row it
    takes the full text when one is available (otherwise title + abstract +
    keywords), extracts the matching sentences for every keyword group in
    ``params.ranking_kw_groups`` and stores them in the columns named by
    ``df_col.text_columns_to_add``. The columns in ``df_col.columns_to_fill``
    are set to NaN. The result is written through ``save_large_df``.

    NOTE(review): `output_path` is accepted but never used -- the output
    location is decided inside ``save_large_df``.
    """
    df_input = pd.read_csv(input_path, header=0, sep='\t')
    df_output = df_input.copy()

    # column layout and keyword groups are the same for every row
    text_column = df_col.text_columns_to_add
    columns_to_fill = df_col.columns_to_fill
    kw_groups = [params.ranking_kw_groups[key] for key in params.ranking_kw_groups]

    for ind in df_input.index:
        # get texts
        index = int(df_input.at[ind, "INDEX"])
        text_tak, _, text_full = get_texts(
            index,
            df_input.at[ind, "TITLE"],
            df_input.at[ind, "ABSTRACT"],
            df_input.at[ind, "KEYWORDS"],
        )

        # prefer the full text; fall back to tak (title + abstract + keywords)
        has_full_text = text_full == text_full and text_full != ""
        text = text_full if has_full_text else text_tak

        # one extracted string per keyword group, in keyword-group order
        text_list = [extract_sent_from_text(text, group) for group in kw_groups]

        # the i-th text column receives the sentences of the i-th keyword group
        for position, column in enumerate(text_column):
            df_output.at[ind, column] = text_list[position]

        # columns that are filled in later start out empty
        for column in columns_to_fill:
            df_output.at[ind, column] = np.nan

        # display the progress
        print("\n")
        print("ind:", ind, "index:", index)

    # keep only the final columns, renumber the rows, and save in chunks
    df_output = df_output[df_col.final_manually_read_df_columns]
    df_output.reset_index(drop=True, inplace=True)
    save_large_df(df_output)

## Main program:

### 1. Select columns from the db

In [9]:
# the 120% of the articles above the threshold
df_db = pd.read_csv(fpath.poten_litera_db, header=0, sep='\t')

output_path = fpath.final_confirm_article_list
plib.clear_file(output_path)

output_df = pd.DataFrame(columns = df_col.db_columns)

# get the list of articles to manually read (comma-separated article indices)
with open(fpath.article_list_to_manually_read, 'r') as file:
    list_as_string = file.read()
l = list_as_string.strip().split(',')
article_list_to_manually_read = [int(i) for i in l]
print(len(article_list_to_manually_read))
print(article_list_to_manually_read)

# Find the matching rows in list order. The INDEX column is cast once and the
# rows are concatenated in a single call, instead of growing the DataFrame
# with one pd.concat per article (which copies all previous rows every time).
db_index = df_db['INDEX'].astype(int)
matching_rows = [df_db[db_index == index] for index in article_list_to_manually_read]
output_df = pd.concat([output_df] + matching_rows, ignore_index=True)

# Columns to keep: index, identifiers, urls, and title/abstract/keywords
selected_columns = df_col.index + df_col.identifier + df_col.url + df_col.tak

# Select the columns and renumber the rows without mutating a slice in place
selected_df = output_df[selected_columns].reset_index(drop=True)
selected_df.to_csv(output_path, header=True, index=False, sep='\t')

1936
[120, 321, 329, 42, 142, 1075, 277, 1001, 2459, 11, 53, 126, 242, 89, 917, 904, 76, 191, 923, 1134, 531, 975, 416, 151, 194, 147, 945, 903, 182, 96, 952, 2567, 232, 910, 143, 20, 636, 2418, 901, 1189, 374, 438, 867, 380, 2535, 509, 185, 1193, 517, 758, 1072, 228, 218, 349, 586, 196, 1125, 82, 637, 127, 1147, 39, 949, 92, 23, 868, 7, 41, 1088, 1602, 878, 29, 17, 81, 57, 264, 1135, 1042, 115, 1033, 927, 97, 2945, 2521, 1385, 5114, 911, 405, 957, 4378, 100, 1059, 914, 4242, 3977, 721, 785, 1108, 95, 2428, 4545, 1131, 103, 162, 50, 4038, 940, 128, 4457, 28, 686, 631, 198, 1314, 650, 1334, 425, 335, 2735, 960, 101, 618, 245, 992, 27, 931, 248, 360, 1302, 916, 1629, 1061, 1307, 1424, 67, 935, 1341, 2414, 4240, 33, 899, 1039, 44, 122, 946, 1702, 1011, 47, 161, 819, 433, 90, 2586, 1109, 1173, 648, 275, 1002, 439, 171, 2812, 1557, 1166, 365, 841, 236, 2438, 752, 1199, 473, 501, 124, 4003, 48, 105, 13, 91, 3599, 4524, 8, 4267, 1165, 1527, 30, 83, 912, 687, 166, 4011, 2464, 156, 56, 953, 211

In [10]:
# the relevant reviews and articles whose tak and full text are not available
df_db = pd.read_csv(fpath.poten_litera_db, header=0, sep='\t')

output_path = fpath.relevant_reviews_and_tak_not_avaialble
plib.clear_file(output_path)

output_df = pd.DataFrame(columns = df_col.db_columns)

# get the list of relevant review articles (comma-separated article indices)
with open(fpath.relevant_article_and_is_review, 'r') as file:
    list_as_string = file.read()
l = list_as_string.strip().split(',')
review_and_tak_not_avail = [int(i) for i in l]
print(len(review_and_tak_not_avail))
print(review_and_tak_not_avail)

# add the articles whose tak and full text are not available
# (this file has no header row, so the db column names are assigned here)
df = pd.read_csv(fpath.poten_litera_tak_not_available, header=None, sep='\t')
df.columns = df_col.db_columns
# NOTE(review): an index present in both sources is kept twice, so its row is
# written twice to the output -- confirm whether duplicates are intended
review_and_tak_not_avail.extend(int(value) for value in df["INDEX"])
print(len(review_and_tak_not_avail))
print(review_and_tak_not_avail)

# Find the matching rows in list order. The INDEX column is cast once and the
# rows are concatenated in a single call, instead of growing the DataFrame
# with one pd.concat per article (which copies all previous rows every time).
db_index = df_db['INDEX'].astype(int)
matching_rows = [df_db[db_index == index] for index in review_and_tak_not_avail]
output_df = pd.concat([output_df] + matching_rows, ignore_index=True)

# Columns to keep: index, identifiers, urls, and title/abstract/keywords
selected_columns = df_col.index + df_col.identifier + df_col.url + df_col.tak

# Select the columns and renumber the rows without mutating a slice in place
selected_df = output_df[selected_columns].reset_index(drop=True)
selected_df.to_csv(output_path, header=True, index=False, sep='\t')

16
[377, 310, 416, 220, 276, 1273, 49, 251, 354, 465, 1964, 1815, 1827, 4508, 2475, 1939]
900
[377, 310, 416, 220, 276, 1273, 49, 251, 354, 465, 1964, 1815, 1827, 4508, 2475, 1939, 6, 36, 51, 84, 104, 119, 170, 174, 177, 188, 200, 208, 254, 256, 260, 263, 293, 297, 310, 314, 327, 350, 354, 363, 377, 379, 391, 395, 402, 411, 453, 459, 460, 462, 478, 479, 498, 506, 507, 512, 514, 530, 556, 564, 567, 575, 583, 591, 592, 600, 602, 605, 612, 617, 619, 623, 630, 633, 651, 671, 683, 695, 702, 723, 730, 764, 778, 780, 792, 801, 802, 803, 812, 834, 836, 851, 855, 875, 876, 881, 882, 885, 969, 1021, 1035, 1132, 1162, 1178, 1196, 1244, 1252, 1303, 1404, 1729, 1859, 1861, 2165, 2180, 2289, 2295, 2302, 2309, 2333, 2420, 2453, 2470, 2479, 2485, 2510, 2517, 2525, 2531, 2533, 2539, 2543, 2545, 2555, 2558, 2559, 2561, 2569, 2577, 2579, 2582, 2588, 2594, 2595, 2597, 2599, 2601, 2602, 2608, 2621, 2625, 2626, 2639, 2640, 2645, 2650, 2660, 2670, 2671, 2677, 2686, 2689, 2692, 2699, 2714, 2728, 2731, 2740, 2

In [11]:
# # articles with abstract and full text not available
# db_path = fpath.poten_litera_db
# df_db = pd.read_csv(db_path, header=0, sep='\t')

# output_path = fpath.final_manually_read_csv_abstract_full_text_not_available
# plib.clear_file(output_path)

# output_df = pd.DataFrame(columns = df_col.db_columns)

# # get the list of articles to manually read
# df = pd.read_csv(fpath.poten_litera_abstract_or_text_not_available, header=0, sep='\t')
# for ind in df.index:
#     index = int(df.at[ind, "INDEX"])
#     output_df = pd.concat([output_df, df_db[df_db['INDEX'].astype(int) == index]], ignore_index=True)
    
# # List of columns to select
# selected_columns = df_col.index + df_col.identifier + df_col.url + df_col.tak # Replace with your column names

# # Selecting specified columns
# selected_df = output_df[selected_columns]
# selected_df.reset_index(inplace=True, drop=True)
# selected_df.to_csv(output_path, header=True, index=False, sep='\t')

In [12]:
# # extract sentences
# input_path = fpath.final_confirm_article_list
# output_path = fpath.final_confirm_article_list

# extract_sents_and_record(input_path, output_path)

In [13]:
# # extract sentences
# input_path = fpath.relevant_reviews_and_tak_not_avaialble
# output_path = fpath.relevant_reviews_and_tak_not_avaialble

# extract_sents_and_record(input_path, output_path)

In [14]:
# df = pd.read_csv(fpath.poten_litera_testing_set_1000_labeled_complete, header=0, sep='\t')
# df['REVIEW(Y/N)'] = np.nan

# df.to_csv(fpath.poten_litera_testing_set_1000_labeled_complete, header=True, index=False, sep='\t')

In [15]:
def fill_in_info(output_path, info_path):
    """Copy already-labeled values from one article table into another.

    Both files are tab-separated tables with a header row and an ``INDEX``
    column. For every row of the table at `info_path`, the values of the
    columns in ``df_col.columns_to_fill`` are written into all rows of the
    table at `output_path` that have the same INDEX. The updated table,
    reduced to ``df_col.final_confirm_df_columns``, overwrites `output_path`.

    Args:
        output_path: Path of the table to update (rewritten in place).
        info_path: Path of the table that provides the values.
    """
    df_info = pd.read_csv(info_path, header=0, sep='\t')
    df_output = pd.read_csv(output_path, header=0, sep='\t')

    # cast the lookup column once instead of once per (row, column) pair
    output_index = df_output['INDEX'].astype(int)

    # iterate over the rows of the info table
    for ind in df_info.index:
        index = int(df_info.at[ind, "INDEX"])

        # rows of the output table that describe the same article
        row_mask = output_index == index

        # copy every column listed in df_col.columns_to_fill
        for col in df_col.columns_to_fill:
            df_output.loc[row_mask, col] = df_info.at[ind, col]

    df_output = df_output[df_col.final_confirm_df_columns]
    df_output.to_csv(output_path, header=True, index=False, sep='\t')

In [16]:
# # fill in information in the test_1000 set to final manually read set
# test_1000_path = fpath.poten_litera_testing_set_1000_labeled_complete

# for i in range(1, 5):
#     input_path = os.path.join(fpath.literature_datasets_folder, "final_manually_read_csv" + "_" + str(i) + ".csv")
#     # print(input_path)
#     fill_in_info(input_path, test_1000_path)

In [17]:
# fill in information from each labeled set into both final article lists;
# the labeled sets are applied in this order, so later sets overwrite earlier
# ones for articles that appear in more than one of them
for info_path in (
    fpath.poten_litera_testing_set_1000_labeled_complete,
    fpath.final_manually_read_csv_1_labeled,
    fpath.final_manually_read_csv_2_labeled,
    fpath.final_manually_read_csv_3_labeled,
    fpath.final_manually_read_csv_4_labeled,
):
    for input_path in (
        fpath.final_confirm_article_list,
        fpath.relevant_reviews_and_tak_not_avaialble,
    ):
        fill_in_info(input_path, info_path)

In [18]:
# count_input_Y = 0
# count_input_N = 0
# count_input_MB = 0
# count_input_NA = 0

# count_verify_Y = 0

# with open(fpath.article_list_to_manually_read, 'r') as file:
#     l = file.read().strip().split(',')
#     l = [int(i) for i in l]

# for i in range(1, 5):
#     input_path = os.path.join(fpath.literature_datasets_folder, "final_manually_read_csv" + "_" + str(i) + ".csv")
#     df_input = pd.read_csv(input_path, header=0, sep='\t')
    
#     verify_path = fpath.poten_litera_testing_set_1000_labeled
#     df_verify = pd.read_csv(verify_path, header=0, sep=',')
    
#     for ind in df_input.index:
#         index = int(df_input.at[ind, "INDEX"])
        
#         if index not in l:
#             raise Exception("index not in article_list_to_manually_read")
        
#         if df_input.at[ind, 'RELEVANT?(Y/N/MB/NA)'] == 'Y':
#             count_input_Y += 1  
#         elif df_input.at[ind, 'RELEVANT?(Y/N/MB/NA)'] == 'N':
#             count_input_N += 1   
#         elif df_input.at[ind, 'RELEVANT?(Y/N/MB/NA)'] == 'MB':
#             count_input_MB += 1
#         elif df_input.at[ind, 'RELEVANT?(Y/N/MB/NA)'] != df_input.at[ind, 'RELEVANT?(Y/N/MB/NA)']:
#             count_input_NA += 1
#         else:
#             raise Exception("RELEVANT?(Y/N/MB/NA) is not Y, N, MB, or NA")
            
# for ind in df_verify.index:
#     if df_verify.at[ind, 'RELEVANT?(Y/N/MB/NA)'] == 'Y':
#         count_verify_Y += 1

# print(count_input_Y)
# print(count_input_N)
# print(count_input_MB)
# print(count_input_NA)
# print(count_input_Y + count_input_N + count_input_MB + count_input_NA)

# print(count_verify_Y)

In [19]:
# count how many articles in the final confirmed list are labeled relevant
# ('Y') and not relevant ('N'); 'MB', missing, and any other labels are
# deliberately not counted
input_path = fpath.final_confirm_article_list
df_input = pd.read_csv(input_path, header=0, sep='\t')

relevance = df_input['RELEVANT?(Y/N/MB/NA)']

# comparing the whole column at once yields booleans; their sum is the count
count_input_Y = int((relevance == 'Y').sum())
count_input_N = int((relevance == 'N').sum())

print(count_input_Y)
print(count_input_N)

101
679
