In [8]:
import pandas as pd

# Read the patients data from a CSV file into a DataFrame
# The file 'patients_AgeCalculated.csv' contains patient details, including calculated ages
patients_df = pd.read_csv('patients_AgeCalculated.csv')

# Read the clinical trials data from an Excel file into a DataFrame
# The file 'ctgStudies_ageRange.xlsx' contains study details, including age range information
trials_df = pd.read_excel('ctgStudies_ageRange.xlsx')

# Read the conditions data from a CSV file into a DataFrame
# The file 'conditions.csv' contains medical conditions associated with patients or trials
conditions_df = pd.read_csv('conditions.csv')

In [9]:
# Ensure consistent formatting for gender by converting all values in the 'GENDER' column to lowercase
patients_df['GENDER'] = patients_df['GENDER'].str.lower()

# Read the 'conditions.csv' file into a DataFrame
conditions_df = pd.read_csv('conditions.csv')

In [10]:
def is_eligible(patient, trial):
    """
    Check if a patient is eligible for a clinical trial based on age and gender criteria.
    
    Args:
        patient (dict): A dictionary containing patient details, including 'AGE' and 'GENDER'.
        trial (dict): A dictionary containing trial information, including 'Age Range'.

    Returns:
        bool: True if the patient is eligible, False otherwise.
    """
    # Parse the minimum and maximum age from the trial's age range string
    min_age, max_age = parse_age_range(trial['Age Range'])
    
    # Check if the patient's age falls within the trial's age range
    if not (min_age <= patient['AGE'] <= max_age):
        return False
    
    # Check if the trial specifies male-only participation and the patient is not male
    if 'male' in trial['Age Range'].lower() and patient['GENDER'].lower() != 'male':
        return False
    
    # Check if the trial specifies female-only participation and the patient is not female
    if 'female' in trial['Age Range'].lower() and patient['GENDER'].lower() != 'female':
        return False
    
    # Return True if the patient meets all eligibility criteria
    return True

def parse_age_range(age_range):
    """
    Parse the age range from a string in various formats and return as a tuple.
    
    Args:
        age_range (str): A string representing the age range, e.g., "18 Years to 65 Years".

    Returns:
        tuple: A tuple (min_age, max_age) representing the minimum and maximum ages.
    """
    # Implement logic to parse age range string
    # For example: "18 Years to 65 Years" -> (18, 65)
    # Handle various formats that might be present in the input string
    pass

In [11]:
# Merge the two DataFrames based on the 'PATIENT' ID
merged_df = pd.merge(patients_df, conditions_df, how='left', left_on='Id', right_on='PATIENT')

# Select and rearrange relevant columns
final_df = merged_df[['Id', 'FIRST', 'LAST', 'DESCRIPTION', 'AGE', 'GENDER', 'RACE', 'ETHNICITY']]

# Save the merged DataFrame to a new Excel file
final_df.to_excel('patient_conditions.xlsx', index=False)

# Display the first few rows of the final DataFrame
print(final_df.head())


                                     Id     FIRST        LAST  \
0  b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85  Damon455  Langosh790   
1  c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8     Thi53   Wunsch504   
2  c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8     Thi53   Wunsch504   
3  c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8     Thi53   Wunsch504   
4  c1f1fcaa-82fd-d5b7-3544-c8f9708b06a8     Thi53   Wunsch504   

                   DESCRIPTION  AGE GENDER   RACE    ETHNICITY  
0                          NaN    5      m  white  nonhispanic  
1  Acute bronchitis (disorder)   19      f  white  nonhispanic  
2        Laceration of forearm   19      f  white  nonhispanic  
3   Viral sinusitis (disorder)   19      f  white  nonhispanic  
4   Viral sinusitis (disorder)   19      f  white  nonhispanic  


In [12]:
# Assuming trials_df is a pandas DataFrame
# trials_df.info() provides a concise summary of the DataFrame

# It includes details such as:
# - Number of non-null entries in each column
# - Data types of each column
# - Memory usage of the DataFrame
trials_df.info()  # Display summary info for the trials_df DataFrame

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67219 entries, 0 to 67218
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   NCT Number     67219 non-null  object
 1   Study Title    67219 non-null  object
 2   Study URL      67219 non-null  object
 3   Study Status   67219 non-null  object
 4   Conditions     67219 non-null  object
 5   Interventions  60529 non-null  object
 6   Age            67219 non-null  object
 7   Study Type     67219 non-null  object
 8   Min Age        67219 non-null  int64 
 9   Max Age        67219 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 5.1+ MB


In [13]:
import pandas as pd
import json
from tqdm import tqdm
import re

def exact_match_condition(patient_condition, trial_conditions):
    """
    Check if the patient's condition matches any of the conditions listed in the trial.

    Args:
        patient_condition (str): The medical condition of the patient.
        trial_conditions (str): A comma-separated string of medical conditions listed for the trial.

    Returns:
        bool: True if the patient's condition matches any of the trial conditions, False otherwise.
    """
    trial_conditions_list = [condition.strip().lower() for condition in trial_conditions.split(',')]  # Normalize and split conditions
    return patient_condition.lower() in trial_conditions_list  # Check if patient condition matches any condition

def find_eligible_patients(final_df, trials_df):
    """
    Find eligible clinical trials for each patient based on their age and medical condition.

    Args:
        final_df (DataFrame): DataFrame containing patient information.
        trials_df (DataFrame): DataFrame containing clinical trial information.

    Returns:
        list: A list of dictionaries containing patient details and eligible trials.
    """
    eligible_results = []

    # Lowercase trial conditions for easy comparison
    trials_df['Conditions_lower'] = trials_df['Conditions'].str.lower()

    # Iterate over each patient in the DataFrame
    for _, patient in tqdm(final_df.iterrows(), total=len(final_df), desc="Processing patients"):
        patient_age = patient['AGE']  # Extract patient age
        patient_condition = str(patient['DESCRIPTION']).lower()  # Extract and normalize patient condition

        # Filter trials where patient's age fits within trial's age range
        age_eligible_trials = trials_df[(trials_df['Min Age'] <= patient_age) & (trials_df['Max Age'] >= patient_age)]

        # Further filter trials where patient's condition matches trial conditions
        eligible_trials = age_eligible_trials[age_eligible_trials['Conditions_lower'].apply(lambda x: exact_match_condition(patient_condition, x))]

        if not eligible_trials.empty:
            patient_trials = []
            # Collect eligible trial information
            for _, trial in eligible_trials.iterrows():
                patient_trials.append({
                    'NCT Number': trial['NCT Number'],
                    'Study Title': trial['Study Title'],
                    'Study URL': trial['Study URL'],
                    'Conditions': trial['Conditions']
                })
            # Add patient's eligible trials to results
            eligible_results.append({
                'Patient Id': patient['Id'],
                'DESCRIPTION': patient['DESCRIPTION'],
                'Eligible Trials': patient_trials
            })

    return eligible_results  # Return list of eligible patients with trials

def generate_output(final_df, trials_df):
    """
    Generate eligible patient-trial matches and export results to JSON and CSV.

    Args:
        final_df (DataFrame): DataFrame containing patient information.
        trials_df (DataFrame): DataFrame containing clinical trial information.

    Returns:
        DataFrame: DataFrame containing eligible patient-trial matches.
    """
    # Get list of eligible patients and trials
    eligible_patients = find_eligible_patients(final_df, trials_df)

    # Write results to a JSON file
    with open('eligible_trials.json', 'w') as f:
        json.dump(eligible_patients, f, indent=2)

    rows = []
    # Prepare data for CSV export
    for patient in eligible_patients:
        for trial in patient['Eligible Trials']:
            rows.append({
                'Patient Id': patient['Patient Id'],
                'Patient Description': patient['DESCRIPTION'],
                'NCT Number': trial['NCT Number'],
                'Study Title': trial['Study Title'],
                'Study URL': trial['Study URL'],
                'Trial Conditions': trial['Conditions']
            })

    # Convert to DataFrame and export as CSV
    output_df = pd.DataFrame(rows)
    output_df.to_csv('eligible_trials.csv', index=False)

    return output_df  # Return DataFrame of eligible patients and trials

# Cell to create subsets and run on subset data
patient_subset = final_df.head(10000)  # Take first 10,000 patients
trials_subset = trials_df.head(10000)   # Take first 10,000 trials

print("Running on subset of data...")
print(f"Number of patients in subset: {len(patient_subset)}")
print(f"Number of trials in subset: {len(trials_subset)}")

# Run the function with subsets
output_df_subset = generate_output(patient_subset, trials_subset)

print("\nSubset processing complete.")
print("First few rows of the output:")
print(output_df_subset.head())

print(f"\nNumber of eligible patient-trial matches: {len(output_df_subset)}")
print(f"Number of unique eligible patients: {output_df_subset['Patient Id'].nunique()}")

# Cell to run on full dataset (optional)
run_full_dataset = input("\nDo you want to run on the full dataset? (yes/no): ").lower().strip()
if run_full_dataset == 'yes':
    print("\nRunning on full dataset...")
    output_df_full = generate_output(final_df, trials_df)
    print("Full dataset processing complete.")
    print("First few rows of the full output:")
    print(output_df_full.head())
    print(f"\nNumber of eligible patient-trial matches: {len(output_df_full)}")
    print(f"Number of unique eligible patients: {output_df_full['Patient Id'].nunique()}")
else:
    print("Skipping full dataset processing.")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  trials_df['Conditions_lower'] = trials_df['Conditions'].str.lower()


Running on subset of data...
Number of patients in subset: 10000
Number of trials in subset: 10000


Processing patients:   0%|          | 0/10000 [00:00<?, ?it/s]

Processing patients: 100%|██████████| 10000/10000 [01:55<00:00, 86.93it/s]



Subset processing complete.
First few rows of the output:
                             Patient Id Patient Description   NCT Number  \
0  339144f8-50e1-633e-a013-f361391c4cff        Hypertension  NCT04423627   
1  339144f8-50e1-633e-a013-f361391c4cff        Hypertension  NCT05546931   
2  339144f8-50e1-633e-a013-f361391c4cff        Hypertension  NCT04869826   
3  339144f8-50e1-633e-a013-f361391c4cff        Hypertension  NCT03810482   
4  339144f8-50e1-633e-a013-f361391c4cff        Hypertension  NCT02558582   

                                         Study Title  \
0  Sympathetic Regulation of Large Artery Stiffne...   
1       Mobile Health Program for Rural Hypertension   
2  Calibration of a Wrist Cuff Blood Pressure Dev...   
3  Pedometers and Walking Tests for Pulmonary Hyp...   
4  Effect of Exercise Training in Patients With P...   

                                      Study URL  \
0  https://clinicaltrials.gov/study/NCT04423627   
1  https://clinicaltrials.gov/study/NCT055469

In [14]:
# Assuming output_df_subset is a DataFrame, this code retrieves unique values 
# from the 'Patient Description' column.

output_df_subset['Patient Description'].unique()

# Explanation:
# - output_df_subset: DataFrame containing patient information.
# - ['Patient Description']: Accesses the 'Patient Description' column.
# - .unique(): Returns an array of unique values from the 'Patient Description' column.

array(['Hypertension', 'COVID-19', 'Diabetes', 'Prediabetes',
       'Recurrent urinary tract infection', 'Hyperlipidemia',
       'Osteoarthritis of knee', 'Atrial Fibrillation', 'Cholelithiasis',
       'Epilepsy', 'Atopic dermatitis', 'Stroke', 'Otitis media',
       'Preeclampsia', 'Cardiac Arrest', 'Chronic pain', 'Appendicitis',
       'Coronary Heart Disease', 'Rheumatoid arthritis',
       'Seasonal allergic rhinitis', 'Childhood asthma', 'Asthma',
       'Osteoarthritis of hip', 'Gout'], dtype=object)

In [15]:
output_df_subset

Unnamed: 0,Patient Id,Patient Description,NCT Number,Study Title,Study URL,Trial Conditions
0,339144f8-50e1-633e-a013-f361391c4cff,Hypertension,NCT04423627,Sympathetic Regulation of Large Artery Stiffne...,https://clinicaltrials.gov/study/NCT04423627,"Hypertension, Systolic|Stiffness, Aortic"
1,339144f8-50e1-633e-a013-f361391c4cff,Hypertension,NCT05546931,Mobile Health Program for Rural Hypertension,https://clinicaltrials.gov/study/NCT05546931,"Hypertension,Essential|Adherence, Medication|Q..."
2,339144f8-50e1-633e-a013-f361391c4cff,Hypertension,NCT04869826,Calibration of a Wrist Cuff Blood Pressure Dev...,https://clinicaltrials.gov/study/NCT04869826,Hypertension
3,339144f8-50e1-633e-a013-f361391c4cff,Hypertension,NCT03810482,Pedometers and Walking Tests for Pulmonary Hyp...,https://clinicaltrials.gov/study/NCT03810482,"Hypertension, Pulmonary"
4,339144f8-50e1-633e-a013-f361391c4cff,Hypertension,NCT02558582,Effect of Exercise Training in Patients With P...,https://clinicaltrials.gov/study/NCT02558582,"Hypertension, Pulmonary"
...,...,...,...,...,...,...
4254,177971b9-2284-1459-06de-165414d9de11,Hypertension,NCT03598894,Hypertension Chronobiome,https://clinicaltrials.gov/study/NCT03598894,Hypertension
4255,177971b9-2284-1459-06de-165414d9de11,Hypertension,NCT05179876,A Study Providing Treatment Access in Particip...,https://clinicaltrials.gov/study/NCT05179876,"Hypertension, Pulmonary"
4256,177971b9-2284-1459-06de-165414d9de11,Hypertension,NCT05292469,Comprehensive Approach to Hypertension Managem...,https://clinicaltrials.gov/study/NCT05292469,Hypertension
4257,177971b9-2284-1459-06de-165414d9de11,Hypertension,NCT05561543,Effects of Peppermint Oil in Mild-moderate Hyp...,https://clinicaltrials.gov/study/NCT05561543,Hypertension


In [16]:
def is_eligible(patient, trial):
    """
    Determines if a patient is eligible for a clinical trial based on age and medical conditions.

    Args:
        patient (dict): A dictionary containing patient information with keys 'AGE' and 'DESCRIPTION'.
        trial (dict): A dictionary containing trial criteria with keys 'Min Age', 'Max Age', and 'Conditions'.

    Returns:
        bool: True if the patient meets the age and condition criteria for the trial, False otherwise.
    """
    # Extract the minimum and maximum age criteria from the trial
    min_age = trial['Min Age']
    max_age = trial['Max Age']
    
    # Split the trial conditions into a list
    conditions = trial['Conditions'].split('|')

    # Get the patient's age and condition
    patient_age = patient['AGE']
    patient_condition = patient['DESCRIPTION']

    # Check if the patient's age is within the trial's age range
    # and if the patient's condition is one of the trial's conditions
    if min_age <= patient_age <= max_age and patient_condition in conditions:
        return True  # Patient is eligible
    return False  # Patient is not eligible

def test_is_eligible():
    """
    Tests the is_eligible function to ensure it works as expected.

    This function sets up a sample trial and patient, then asserts that 
    the eligibility function returns True for the given input.
    """
    # Define a clinical trial with specific age and condition criteria
    trial = {'Min Age': 18, 'Max Age': 65, 'Conditions': 'Hypertension|Diabetes'}
    
    # Define a patient who meets the trial's eligibility criteria
    patient = {'AGE': 18, 'DESCRIPTION': 'Hypertension'}
    
    # Assert that the patient is eligible for the trial
    assert is_eligible(patient, trial) == True

# Call the test function to verify the implementation
test_is_eligible()
print("Test passed successfully!")

Test passed successfully!


In [17]:
import unittest

def is_eligible(patient, trial):
    """
    Check if a patient is eligible for a clinical trial based on 
    age, gender, and medical conditions.
    
    Parameters:
    patient (dict): A dictionary containing patient's details like 
                    age, gender, and condition.
    trial (dict): A dictionary containing trial's eligibility criteria 
                  including minimum and maximum age, gender, and 
                  applicable medical conditions.
    
    Returns:
    bool: True if the patient is eligible for the trial, False otherwise.
    """
    # Check age eligibility
    if not (trial['Min Age'] <= patient['AGE'] <= trial['Max Age']):
        return False

    # Check gender eligibility (assuming trial['Gender'] contains the requirement: 'male', 'female', or 'all')
    if trial.get('Gender', 'all') != 'all' and patient['GENDER'].lower() != trial['Gender'].lower():
        return False

    # Check condition eligibility
    trial_conditions = trial['Conditions'].split('|')  # Split trial conditions into a list
    patient_conditions = [patient['DESCRIPTION']]  # Assuming DESCRIPTION contains the condition

    # Check if any of the patient's conditions match the trial's conditions
    if not any(condition in trial_conditions for condition in patient_conditions):
        return False

    return True

class TestEligibilityFunctions(unittest.TestCase):

    def test_gender_eligibility_male(self):
        """Test if a male patient is eligible for a male trial."""
        trial = {'Min Age': 18, 'Max Age': 65, 'Gender': 'male', 'Conditions': 'Hypertension|Diabetes'}
        patient = {'AGE': 25, 'GENDER': 'male', 'DESCRIPTION': 'Hypertension'}
        self.assertTrue(is_eligible(patient, trial))

    def test_gender_eligibility_female(self):
        """Test if a female patient is eligible for a female trial."""
        trial = {'Min Age': 18, 'Max Age': 65, 'Gender': 'female', 'Conditions': 'Hypertension|Diabetes'}
        patient = {'AGE': 25, 'GENDER': 'female', 'DESCRIPTION': 'Hypertension'}
        self.assertTrue(is_eligible(patient, trial))

    def test_gender_ineligibility(self):
        """Test if a female patient is not eligible for a male trial."""
        trial = {'Min Age': 18, 'Max Age': 65, 'Gender': 'male', 'Conditions': 'Hypertension|Diabetes'}
        patient = {'AGE': 25, 'GENDER': 'female', 'DESCRIPTION': 'Hypertension'}
        self.assertFalse(is_eligible(patient, trial))

    def test_condition_eligibility(self):
        """Test if a patient is eligible for a trial with 'all' gender and valid condition."""
        trial = {'Min Age': 18, 'Max Age': 65, 'Gender': 'all', 'Conditions': 'Hypertension|Diabetes'}
        patient = {'AGE': 25, 'GENDER': 'male', 'DESCRIPTION': 'Hypertension'}
        self.assertTrue(is_eligible(patient, trial))

    def test_condition_ineligibility(self):
        """Test if a patient is not eligible due to an invalid condition."""
        trial = {'Min Age': 18, 'Max Age': 65, 'Gender': 'all', 'Conditions': 'Hypertension|Diabetes'}
        patient = {'AGE': 25, 'GENDER': 'male', 'DESCRIPTION': 'Asthma'}
        self.assertFalse(is_eligible(patient, trial))

# Test Integration for trial and patient matching
class TestIntegration(unittest.TestCase):
    
    def test_integration(self):
        """Test integration of patient eligibility for various trials."""
        patients = [
            {'Patient Id': 'P001', 'AGE': 30, 'GENDER': 'male', 'DESCRIPTION': 'Hypertension'},
            {'Patient Id': 'P002', 'AGE': 40, 'GENDER': 'female', 'DESCRIPTION': 'Diabetes'}
        ]
        
        trials = [
            {'NCT Number': 'NCT001', 'Min Age': 18, 'Max Age': 65, 'Gender': 'male', 'Conditions': 'Hypertension'},
            {'NCT Number': 'NCT002', 'Min Age': 35, 'Max Age': 55, 'Gender': 'female', 'Conditions': 'Diabetes'}
        ]
        
        eligible_trials_per_patient = []  # List to hold eligible trials for each patient
        
        for patient in patients:
            eligible_trials = []  # List to hold eligible trials for the current patient
            for trial in trials:
                if is_eligible(patient, trial):  # Check eligibility
                    eligible_trials.append({'trialId': trial['NCT Number']})  # Add eligible trial ID
            eligible_trials_per_patient.append({'Patient Id': patient['Patient Id'], 'eligibleTrials': eligible_trials})

        print(eligible_trials_per_patient)  # Debugging output

        # Check that patient P001 is eligible for NCT001
        self.assertEqual(eligible_trials_per_patient[0]['eligibleTrials'][0]['trialId'], 'NCT001')  # Patient P001 should match NCT001
        # Check that patient P002 is eligible for NCT002
        self.assertEqual(eligible_trials_per_patient[1]['eligibleTrials'][0]['trialId'], 'NCT002')  # Patient P002 should match NCT002

if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)

......
----------------------------------------------------------------------
Ran 6 tests in 0.009s

OK


[{'Patient Id': 'P001', 'eligibleTrials': [{'trialId': 'NCT001'}]}, {'Patient Id': 'P002', 'eligibleTrials': [{'trialId': 'NCT002'}]}]
