# In [None]:
import pandas as pd

# Read the isolation-source table from the CSV that sits next to this script.
# The path is kept in a variable so it can be referred to by name afterwards.
file_path = "Figure_4_data.csv"
df_isolation_sources = pd.read_csv(filepath_or_buffer=file_path)

# Lookup table: category name -> lowercase terms that identify that category.
# Key order matters: categorize_source walks this dict in insertion order and
# the first category that produces a match wins.
# Unusual spellings (e.g. 'sptum', 'urethal swab', 'pelvic abcsess') are kept
# exactly as they were; presumably they mirror spellings found in the data.
groups = {
    "Gastrointestinal/Stool": [
        'human intestinal microflora', 'rectal swab', 'feces',
        'stool sample from a patient with diarrhea',
        'stool', 'fecal sample', 'infant with diarrhea', 'stool sample',
        'stool sample from patient with gastroenteritis',
        'diarrheal stool', 'infant diarrheic stool', 'human stool',
        'feces of patient with hemolytic uremic syndrome',
        'fecal matter', 'stool from an infant', 'stool specimen', 'feces specimen',
        'feces from healthy individual',
        'human feces', 'rectal swabs', 'feces of patient with diarrhea', 'perirectal',
        'enteritis', 'faecal sample',
        'faecal swab', 'rectum', 'anal', 'colon', 'peri-rectal',
        'fecal isolate from poultry farm worker', 'human fecal',
        'gastrointestinal tract', 'rectal', 'human gut', 'diarrheal patient',
        'gut', 'intestine', 'digestive system', 'spleen', 'faecal', 'human faeces',
        'ileum', 'digestive', 'caecal',
    ],
    "Urinary/Genitourinary": [
        'urine', 'urinary tract infection', 'urine from child < 5 years old',
        'clinical urine culture', 'urine/genitourinary',
        'urine from hospitalized patient', 'urine sample', 'clean catch urine', 'uti',
        'mid-stream urine', 'urine, catheter urine',
        'infected human urine', 'bladder epithelial biopsy', 'bacteriuria', 'perineum',
        'urethra', 'urinary', 'urogenital', 'urinary sample', 'urethal swab',
    ],
    "Blood/Circulatory": [
        'blood', 'blood-culture', 'from patient with blood stream infection',
        'human blood', 'patient blood', 'blood sample',
        'blood culture', 'bloodstream', 'venous blood', 'blood specimen',
        'blood of icu admitted patient',
    ],
    "Respiratory": [
        'sputum', 'pharyngeal secretions', 'nasal swab', 'endotracheal secretion',
        'respiratory', 'tracheal aspirate', 'bronchial fluid',
        'bronchoalveolar lavage fluid (bal/balf)', 'nasal cavity',
        'sputum from cystic fibrosis patient', 'oropharynx', 'tracheal secretions',
        'pharyngeal swab', 'pulmonary', 'maxillary sinus', 'mucus',
        'tracheal secretion', 'bronchoalveolar lavage fluid',
        'bronchial washing fluid', 'nares', 'pharynx',
        'bronchial sample from a male patient with chronic obstructive pneumonia',
        'throat', 'cf throat', 'cystic fibrosis patient', 'tracheal',
        'bronchoalveolar lavage from right lung',
        'sptum', 'oral', 'saliva', 'bronchial washing', 'bronchoalveolar lavage',
        'cystic fibrosis human lung', 'lung', 'patient sinus swab', 'tonsil',
        'bronchial', 'trachea', 'tracheobronchial',
    ],
    "Wound/Abscess/Skin": [
        'pus', 'abscess on lower extremity', 'wound', 'groin', 'wound left foot',
        'pus left axilla', 'wound groin', 'pus axilla',
        'wound swab', 'abscess', 'burn wound', 'leg pus', 'skin abscess',
        'pressure ulcer', 'decubitus ulcer', 'cutaneous anthrax patient',
        'necrosectomy', 'tibia/osteomyelitis of diabetes patient', 'burn', 'skin swab',
        'osteitis', 'foci', 'prosthesis', 'ear swabs', 'skin', 'swab', 'human skin',
        'pelvic abcsess', 'sacral swab', 'sacral ulcer', 'hematoma',
    ],
    "Other Body Fluids": [
        'cerebrospinal fluid', 'pleural effusion', 'ascites', 'peritoneal fluid',
        'bile', 'synovial fluid', 'hydrothorax', 'drainage fluid',
        'biliary fluid', 'pericardial effusion', 'wound fluid', 'abdominal drainage',
        'csf', 'abdominal drain', 'pleural fluid', 'abdominal', 'secreta',
        'secretion', 'secretions', 'fluid', 'peritoneal cavity',
        'excreted bodily substance',
        'aspirate',
    ],
    "Tissue/Organ": [
        'kidney', 'tissue', 'liver', 'bone', 'tricuspid valve', 'lung tissue',
        'prosthetic joint/hip/knee implants', 'necrotic tissue',
        # NOTE: 'bladder epithelial biopsy' is also listed under
        # "Urinary/Genitourinary", which comes earlier in this dict, so for
        # that exact text the earlier category is the one returned.
        'lymph node', 'bladder epithelial biopsy', 'left-sided endocarditis',
        'right leg muscle', 'hand', 'nose', 'foot', 'eye', 'leg',
        'skin and soft parts', 'blister', 'placenta', 'organ', 'organ (brain)',
    ],
    "Genital": [
        'vaginal swab', 'urethral swab', 'cervical swab', 'penile discharge', 'semen',
        'vaginal secretion', 'urethral discharge', 'sperm', 'uterus', 'cervical',
        'salpingitis',
    ],
    "Miscellaneous/Unspecified Clinical": [
        'clinical isolate', 'clinical specimen', 'hospitalized patients',
        'human clinical sample', 'icu patient', 'nosocomial infection',
        'screening swab', 'human', 'patient',
    ],
}

# Function to categorize each isolation source
def categorize_source(source, categories=None):
    """Return the category name for a free-text isolation source.

    Matching is case-insensitive and ignores leading/trailing whitespace.
    It runs in two passes over ``categories`` (in dict order):

    1. exact match - the cleaned source equals one of a category's terms;
    2. substring match - one of a category's terms occurs anywhere inside
       the cleaned source.

    In each pass the first category (in dict order) that matches wins.

    NOTE: the second pass is a plain substring test, not a whole-word test,
    so short terms can match inside unrelated words (for example the term
    'uti' occurs inside 'routine').

    Args:
        source: Value from the isolation-source column. Anything that is not
            a ``str`` (NaN, None, numbers, lists, ...) is treated as missing.
        categories: Optional mapping of category name -> iterable of terms.
            Defaults to the module-level ``groups`` table.

    Returns:
        The matching category name, or
        ``"Miscellaneous/Unspecified Clinical"`` when the value is missing,
        blank, or matches nothing.
    """
    fallback = "Miscellaneous/Unspecified Clinical"
    if categories is None:
        categories = groups

    # A missing CSV cell arrives as float NaN, so the isinstance test alone
    # covers it. Calling pd.isna() first would raise on list-like values,
    # because it returns an array whose truth value is ambiguous.
    if not isinstance(source, str):
        return fallback

    # Strip as well as lowercase: otherwise a padded value such as
    # "wound fluid " misses the exact pass and can be claimed by an earlier
    # category through the substring pass.
    cleaned = source.strip().lower()
    if not cleaned:
        return fallback

    # Pass 1: exact match against any term of any category.
    for category, terms in categories.items():
        if any(term.lower() == cleaned for term in terms):
            return category

    # Pass 2: substring match; stop at the first category that hits, since
    # later categories could never override it.
    for category, terms in categories.items():
        if any(term.lower() in cleaned for term in terms):
            return category

    return fallback

# Label every row: run categorize_source over the 'Isolation source' column
# and store the returned category name in a 'Group' column.
df_isolation_sources['Group'] = (
    df_isolation_sources['Isolation source'].apply(categorize_source)
)

# Show each raw source next to the group it was assigned, for a quick check
print(df_isolation_sources.loc[:, ['Isolation source', 'Group']])

# Write the table, now including 'Group', back to file_path. This is the same
# path the data was read from, so the input CSV is overwritten in place.
df_isolation_sources.to_csv(path_or_buf=file_path, index=False)