## Split tables into many markdown files

In [None]:
# Create the many markdown files
import os

# Input markdown file and the folder that receives one file per section
input_file_path = 'tables.md'
output_dir = 'split_markdown_files'
os.makedirs(output_dir, exist_ok=True)


def _save_section(header, content):
    """Write the collected lines of one section to '<output_dir>/<header>.md'."""
    output_file_path = os.path.join(output_dir, f"{header}.md")
    with open(output_file_path, 'w') as output_file:
        output_file.writelines(content)


# State of the section currently being collected
current_header = None
current_content = []

# Load every line of the input up front
with open(input_file_path, 'r') as file:
    lines = list(file)

for line in lines:
    starts_section = line.startswith('## ')
    ends_section = line.strip() == '--- END ---'

    # Ordinary line: keep collecting it for the current section
    if not starts_section and not ends_section:
        current_content.append(line)
        continue

    if starts_section:
        # A new '## ' header closes the section that was still open, if any
        if current_header and current_content:
            _save_section(current_header.strip(), current_content)

        # The header text (without the leading '## ') becomes the file name,
        # and the header line itself is the first line of the new section
        current_header = line[3:].strip()
        current_content = [line]
        continue

    # End marker: it is kept as the last line of the section
    current_content.append(line)
    if current_header:
        _save_section(current_header, current_content)

    # Lines that follow, up to the next header, belong to no section
    current_header = None
    current_content = []

# A trailing section that was never closed by '--- END ---' is still written
if current_header and current_content:
    _save_section(current_header, current_content)

print(f"Markdown files have been split and saved to {output_dir}")


## Replace info from the tables

In [None]:
# Replace the table sections in the database with the content of the split markdown files
import json
import os
import re

def extract_section(md_content, section_name):
    """
    Extracts the body of a '### <section_name>' section from markdown content.

    Args:
        md_content: Markdown text to search.
        section_name: Heading text to look for. It is tried both as a regular
            expression (so names such as "Other .* Adverse Events" keep
            working) and as a literal string (so names containing regex
            metacharacters, such as "Primary Outcome Result(s)", match their
            own heading).

    Returns:
        The stripped text between the heading and the next '## ' / '### '
        heading, or the end of the content when no heading follows.
        None when the section heading is not found.
    """
    heading = rf"(?:{section_name}|{re.escape(section_name)})"
    # The body is a *named* group: groups inside section_name would otherwise
    # shift the group numbering and make group(1) return the wrong capture.
    # '#?## ' consumes a whole '### ' heading so no stray '#' is left at the
    # end of the body, and '\Z' lets the last section of a document match.
    pattern = re.compile(rf"### {heading}(?P<body>.*?)(?:#?## |\Z)", re.S)
    match = pattern.search(md_content)
    if match:
        return match.group("body").strip()
    return None

def remove_first_and_last_line(file_path):
    """
    Returns the text of a file without its first two lines and its last line.

    Despite the function name, two leading lines are dropped, not one.
    Files with three lines or fewer yield an empty string.
    """
    with open(file_path, 'r') as handle:
        all_lines = handle.readlines()

    # For three lines or fewer this slice is already empty, so no separate
    # length check is needed
    kept_lines = all_lines[2:-1]
    return ''.join(kept_lines)

# Names of the table sections to drop from each trial. Entries are regular
# expressions (see "Other .* Adverse Events"); they are also compared literally
# so that names containing metacharacters, e.g. "Primary Outcome Result(s)",
# still match a key spelled exactly that way.
table_sections = [
    "Participant Flow Table",
    "Baseline Characteristics",
    "Primary Outcome Result(s)",
    "Secondary Outcome Result(s)",
    "All-Cause Mortality",
    "Serious Adverse Events",
    "Other .* Adverse Events",
    "Safety Results"
]


def _is_table_section(key):
    """Return True when `key` equals, or is fully matched by, an entry of table_sections."""
    return any(
        key == section or re.fullmatch(section, key)
        for section in table_sections
    )


# Load the JSON file
with open('database.json', 'r') as file:
    data = json.load(file)

# Iterate through each trial
for trial in data:
    trial_name = trial['trial_name']
    md_file_path = f"./split_markdown_files/{trial_name}.md"

    # Trials without a split markdown file are left untouched
    if not os.path.exists(md_file_path):
        continue

    md_content = remove_first_and_last_line(md_file_path)

    # Collect the keys first: a dict must not change size while it is iterated
    stale_keys = [key for key in trial['trial'] if _is_table_section(key)]
    for key in stale_keys:
        del trial['trial'][key]

    # The markdown tables replace the removed sections
    trial['trial']['Trial Analysis'] = md_content

# Save the updated JSON file (overwrites the input in place)
with open('database.json', 'w') as file:
    json.dump(data, file, indent=4)

## Clean up Database

In [None]:
from spellchecker import SpellChecker

# One shared spell checker instance serves as the word dictionary
spell = SpellChecker()


def is_valid_word(word):
    """Return the result of the membership test `word in spell`."""
    known = word in spell
    return known

def fix_united_words(text):
    """
    Split `text` into pieces by greedy longest-match lookup.

    At every position the longest substring accepted by `is_valid_word` is
    taken as one piece. When no substring starting at that position is
    accepted, the single character at that position becomes a piece.

    Returns:
        The list of pieces in order; concatenating them gives back `text`.
    """
    pieces = []
    total = len(text)
    pos = 0
    while pos < total:
        # Longest candidate first: scan end positions from the far end back
        stop = next(
            (end for end in range(total, pos, -1) if is_valid_word(text[pos:end])),
            None,
        )
        if stop is None:
            # Nothing starting here is a known word: emit one character
            pieces.append(text[pos])
            pos += 1
        else:
            pieces.append(text[pos:stop])
            pos = stop
    return pieces

def combine_single_characters(words):
    """
    Merge every run of consecutive one-character items into a single string.

    Items longer than one character are kept as they are and end the current
    run. The relative order of all characters is preserved.
    """
    merged = []
    pending = []  # one-character items waiting to be glued together

    for item in words:
        if len(item) == 1:
            pending.append(item)
            continue
        # A longer item closes the pending run before being added itself
        if pending:
            merged.append(''.join(pending))
            pending = []
        merged.append(item)

    # Flush a run that reaches the end of the input
    if pending:
        merged.append(''.join(pending))

    return merged

def preprocess_text(text):
    """
    Separate words that were glued together and normalize whitespace.

    The text is segmented with `fix_united_words`, runs of single characters
    are re-joined with `combine_single_characters`, and the pieces are joined
    with single spaces.
    """
    segments = fix_united_words(text)
    merged = combine_single_characters(segments)

    # Joining and re-splitting collapses every run of whitespace to one space
    # and trims both ends
    spaced = ' '.join(merged)
    return ' '.join(spaced.split())

# Example usage: a summary paragraph in which several words are glued together
text = (
    "This trial helped learn about the safety of different doses of LHC165 "
    "given alone or with PDR001 inparticipants with advanced cancers. "
    "The researchers concluded that 600 g LHC165 was the highestdose that "
    "was safe for participants to receive alone or with 400 mg PDR001."
    "Because enrollment ended early and there were too few participants, "
    "the researchers could not makeany conclusions about the effects of "
    "LHC165 given alone or with PDR001 on shrinking cancer. "
    "Thesponsor has no plans for other trials of LHC165 in people with "
    "advanced cancers."
)
fixed_text = preprocess_text(text)
print(fixed_text)

In [None]:
import os
import json

# JSON database to clean (the commented path is an alternative input file)
file_path = 'database.json'
# file_path = 'small_database.json'
with open(file_path, 'r') as file:
    data = json.loads(file.read())

# Contact details, URLs and boilerplate that are replaced with a space
_boilerplate_terms = [
    "www.novartisclinicaltrials.com", "1-888-669-6682 (US);", "1-888-669-6682 (US)", "1-888-669-6682  (US)",
    "+41-61-324-1111 (EU);", "www.novctrd.com", "www.clinicaltrials.gov", "+41-61-324-1111 (EU)",
    "https://www.clinicaltrialsregister.eu/ctr-search/search",
    "Clinical Trial Results Website", "+41-61-324 1111 (EU);", "+41-61-324 1111 (EU)",
    "www.clinicaltrialsregister.eu", "http://www.novartis.com/clinicaltrials",
]

# Page-footer variants; each one is followed by a page number
_footer_prefixes = [
    "| Adults and Adolescent version |Trial Results Summary | ",
    "| Trial Results Summary | ",
    "| Trial Results Summary | Parent Version | ",
    "| Trial Results Summary | Adult Version | ",
]

# Define the terms to be replaced with space.
# Page numbers run from 18 down to 1 so that the two-digit footers come before
# the one-digit footers that are their prefixes (e.g. "... | 18" before "... | 1").
terms_to_replace = _boilerplate_terms + [
    f"{prefix}{page}"
    for prefix in _footer_prefixes
    for page in range(18, 0, -1)
]

# Function to replace the terms and trial names with space
def replace_terms(text, trial_name, debug=False, terms=None):
    """
    Replace every boilerplate term and the trial name in `text` with a space.

    Args:
        text: The string to clean.
        trial_name: Trial identifier to blank out; skipped when empty or None.
        debug: Accepted for backward compatibility; currently unused.
        terms: Optional iterable of terms to remove. Defaults to the
            module-level `terms_to_replace` list.

    Returns:
        The text with each occurrence replaced by a single space. Terms are
        applied in order, so longer terms must precede their own prefixes.
    """
    if terms is None:
        terms = terms_to_replace
    for term in terms:
        # str.replace('', ' ') would insert a space between every character
        if term:
            text = text.replace(term, ' ')
    if trial_name:
        text = text.replace(trial_name, ' ')
    return text

# Keys whose text only gets the term replacement and is NOT passed through
# preprocess_text
_raw_trial_keys = ("Trial Analysis", "Generic Drug Name", "Protocol Number", "Date of Clinical Trial Report")
_raw_summary_keys = ("Thank you",)


def _collapse_whitespace(section):
    """Collapse every whitespace run to one space in each string value of `section`, in place."""
    for key in section:
        value = section[key]
        if isinstance(value, str):
            section[key] = ' '.join(value.split())


def _clean_section(section, trial_name, raw_keys, debug):
    """Apply replace_terms, and preprocess_text unless the key is in `raw_keys`, to each string value in place."""
    for key in section:
        value = section[key]
        if not isinstance(value, str):
            continue
        value = replace_terms(value, trial_name, debug)
        if key not in raw_keys:
            value = preprocess_text(value)
        section[key] = value


# Process each trial in the JSON data
for trial in data:
    trial_name = trial['trial_name']
    print(trial_name)

    # First pass: clean up extra spaces
    _collapse_whitespace(trial['trial'])
    _collapse_whitespace(trial['summary'])

    # Replace boilerplate terms and the trial name, then fix glued words
    _clean_section(trial['trial'], trial_name, _raw_trial_keys, False)
    _clean_section(trial['summary'], trial_name, _raw_summary_keys, True)

    # Second pass: the replacements above leave extra spaces behind
    _collapse_whitespace(trial['trial'])
    _collapse_whitespace(trial['summary'])

# Save the modified data to a new JSON file
output_file_path = 'database_clean2.json'
with open(output_file_path, 'w') as file:
    json.dump(data, file, indent=2)

In [None]:
# import os
# import json

# # Load the JSON data
# file_path = 'updated_database.json'
# with open(file_path, 'r') as file:
#     data = json.load(file)

# # Create a directory to store the output files
# output_directory = 'TrialsOutput'
# if not os.path.exists(output_directory):
#     os.makedirs(output_directory)

# # Iterate over each entry in the JSON data
# for entry in data:
#     trial_name = entry['trial_name']
    
#     # Create file names
#     trial_file_name = f"{trial_name}_trial.txt"
#     summary_file_name = f"{trial_name}_summary.txt"
    
#     # File paths
#     trial_file_path = os.path.join(output_directory, trial_file_name)
#     summary_file_path = os.path.join(output_directory, summary_file_name)
    
#     # Write the trial data to the trial file
#     with open(trial_file_path, 'w') as trial_file:
#         trial_content = json.dumps(entry['trial'], indent=4)
#         trial_file.write(trial_content)
    
#     # Write the summary data to the summary file
#     with open(summary_file_path, 'w') as summary_file:
#         summary_content = json.dumps(entry['summary'], indent=4)
#         summary_file.write(summary_content)

# print(f"Files have been created in the {output_directory} directory.")


## Database Mapping

In [None]:
import json
import os
from unicodedata import normalize

# Load the JSON data
with open('database_clean2.json') as file:
    data = json.load(file)

# Summary question -> trial sections whose text forms the context for it
mapping = {
   "Why was the research needed?": ["Objectives"],
   "How long was the trial?": ["Study Start/End Dates", "Reason for Termination"],
   "Who was in this clinical trial?": ["Study Population: Key Inclusion/Exclusion Criteria"],
   "What treatments did the participants take?": ["Statistical Methods", "Trial Analysis"],
   "What happened during the trial?": ["Study Design/Methodology"],
   "What were the results of the trial?": ["Trial Analysis", "Conclusion"],
   "What adverse events did participants report?": ["Trial Analysis"],
   "How has this trial helped?": ["Conclusion"],
}

# The output folder must exist before the first file is opened for writing
dataset_dir = 'FinalDataset/FullDataset2'
os.makedirs(dataset_dir, exist_ok=True)


def _to_ascii(text):
    """Strip `text`, decompose it with NFKD and drop every non-ASCII character."""
    return normalize('NFKD', text.strip()).encode('ascii', 'ignore').decode('ascii')


# Build one dataset file per question
for question, sections in mapping.items():
    print(question)
    output = []

    for trial in data:
        trial_name = trial['trial_name']
        print(trial_name)

        # Context: the mapped trial sections, one per line. Non-string values
        # are skipped, as in the clean-up step, instead of raising TypeError.
        context = ""
        for section in sections:
            value = trial['trial'].get(section)
            if isinstance(value, str):
                context += value + "\n"

        # Answer: the summary text stored under the question itself
        answer = trial['summary'].get(question, "")
        if not isinstance(answer, str):
            answer = ""

        context = _to_ascii(context)
        answer = _to_ascii(answer)

        # Only trials that have both a context and an answer are kept
        if answer and context:
            output.append({
                "trial_name": trial_name,
                "question": question,
                "context": ' '.join(context.split()),
                "answer": ' '.join(answer.split())
            })

    # Write the output to a JSON file named after the question
    file_name = f'{question.replace(" ", "_").replace("?", "")}.json'
    with open(os.path.join(dataset_dir, file_name), 'w') as outfile:
        json.dump(output, outfile, indent=4)

## Split train and test set

In [None]:
import json
import random
import os

# Folder holding one JSON file per question, and the folder for the split files
folder_path = 'FinalDataset/FullDataset2/'
folder_path2 = 'FinalDataset/'

# Share of each file that goes to the training set
train_fraction = 0.85

# A dedicated, seeded generator makes the split reproducible between runs
# without touching the global random state
rng = random.Random(42)

# Sorted, because os.listdir returns names in arbitrary order and the order
# determines which random numbers each file receives
json_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.json'))

# NOTE(review): every file is shuffled independently, so the same trial can be
# in the training set for one question and in the test set for another —
# confirm this is acceptable for the evaluation.
for json_file in json_files:
    file_path = os.path.join(folder_path, json_file)

    # Load the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)

    # Shuffle, then cut at the training share
    rng.shuffle(data)
    split_index = int(train_fraction * len(data))
    train_data = data[:split_index]
    test_data = data[split_index:]

    # Create filenames for the split datasets
    train_file_path = os.path.join(folder_path2, f'train_{json_file}')
    test_file_path = os.path.join(folder_path2, f'test_{json_file}')

    # Save the split datasets into separate JSON files
    with open(train_file_path, 'w') as train_file:
        json.dump(train_data, train_file, indent=4)

    with open(test_file_path, 'w') as test_file:
        json.dump(test_data, test_file, indent=4)

    print(f"Processed and split {json_file} into training and testing sets.")

print("All files have been successfully processed.")
