In [None]:
import os
import pandas as pd
import re
# Show every column when a DataFrame is displayed (None removes the column limit)
pd.set_option('display.max_columns', None)  

# Directory that is scanned for files ending in 'cell_seg_data_summary.txt'
directory = '/home/ubuntu/phenotype/'

In [None]:
# Build one row per '*cell_seg_data_summary.txt' file holding that file's
# "Total Cells" column, stack the rows into `summary_df`, and save it as CSV.
dataframes = []

# Define tissue categories and phenotypes
tissue_categories = ['EPCAM Positive', 'EPCAM Negative', 'Blank', 'GFAP Positive', 'All']
phenotypes = ['CD163', 'CD8', 'EPCAM', 'Non-interest', 'EPCAM/GFAP', 'All']

# One column per (tissue category, phenotype) pair, tissue category varying slowest.
# NOTE(review): this assumes every file lists its rows in exactly this order —
# confirm against the summary file format.
columns = [f'{tc}-{ph}' for tc in tissue_categories for ph in phenotypes]

# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the saved summary reproducible from run to run.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('cell_seg_data_summary.txt'):
        continue

    file_path = os.path.join(directory, filename)
    # Read the tab-separated file into a dataframe
    df = pd.read_csv(file_path, sep='\t')
    # Extract the "Total Cells" values
    total_cells = df['Total Cells']

    # Fail with the offending file's name instead of a generic shape error.
    if len(total_cells) != len(columns):
        raise ValueError(
            f"{file_path}: expected {len(columns)} 'Total Cells' rows, "
            f"found {len(total_cells)}"
        )

    # Single-row dataframe indexed by the file name
    data = pd.DataFrame([total_cells.values], columns=columns, index=[filename])
    dataframes.append(data)

# pd.concat([]) raises an unhelpful "No objects to concatenate"; say what is missing.
if not dataframes:
    raise ValueError(
        f"No files ending in 'cell_seg_data_summary.txt' found in {directory}"
    )

summary_df = pd.concat(dataframes)

# Save the summary dataframe to a new CSV file
summary_df.to_csv('/home/ubuntu/summary.csv')


In [None]:
# Reload the per-file summary so this cell starts from a clean table on every run.
# Read the same absolute path the summary-building cell writes to; a relative
# 'summary.csv' would depend on the kernel's working directory.
summary_df = pd.read_csv('/home/ubuntu/summary.csv', index_col=0)

def extract_info(filename):
    """Split a summary file name into patient number, location text and MSI coordinates.

    Args:
        filename: File name to parse (a string).

    Returns:
        A tuple ``(patient_number, location, msi_coords)`` where
        ``patient_number`` is the first run of digits in the name (as a string),
        ``location`` is the text between the first "digits + space" and the
        next ``.``, and ``msi_coords`` is a 2-tuple of digit strings taken from
        the first ``[x,y]`` group.

    Raises:
        ValueError: If any of the three parts cannot be found in ``filename``.
    """
    patient_match = re.search(r'(\d+)', filename)

    # Extracting organ
    location_match = re.search(r'\d+ ([^.]+)\.', filename)

    # Extracting MSI coordinates
    coords_match = re.search(r'\[(\d+),(\d+)\]', filename)

    # Report every missing part together with the file name, rather than letting
    # ``None.group`` fail with an AttributeError that does not identify the file.
    missing = [
        label
        for label, match in (
            ('patient number', patient_match),
            ('location', location_match),
            ('MSI coordinates', coords_match),
        )
        if match is None
    ]
    if missing:
        raise ValueError(
            f"Cannot parse {', '.join(missing)} from file name {filename!r}"
        )

    return patient_match.group(1), location_match.group(1), coords_match.groups()

# Parse every file name in the index once (extract_info runs three regex
# searches per call), then split the resulting tuples into separate columns.
parsed_names = summary_df.index.to_series().apply(extract_info)
summary_df['Patient Number'] = parsed_names.apply(lambda parts: parts[0])
summary_df['Location'] = parsed_names.apply(lambda parts: parts[1])
summary_df['MSI'] = parsed_names.apply(lambda parts: parts[2])

# Organ keywords, tried in this order; the first keyword found in a location wins.
organs = [
    'Lung', 'Brain', 'Adrenal', 'Spine', 'Pan',
    'Femoral', 'LN', 'Kidney', 'Pleura', 'Sternum',
    'Diaphragm', 'Carinal', 'Med', 'Vertebra', 'Chest wall',
    'Flank', 'Pancreas', 'Liver', 'abd', 'Ome Peri',
]


def extract_organ(location):
    """Return the first entry of ``organs`` that occurs in ``location``.

    An entry counts as present when it appears, ignoring case, delimited on
    each side by a word boundary or by an underscore, full stop or whitespace.
    Returns ``'Unknown'`` when no entry matches.
    """
    # The same delimiter group is required on both sides of the keyword.
    edge = r'(\b|[_\.\s])'
    hits = (
        candidate
        for candidate in organs
        if re.search(edge + re.escape(candidate) + edge, location, re.IGNORECASE)
    )
    return next(hits, 'Unknown')

# Tag each row with the first organ keyword found in its location text
summary_df['Organ'] = summary_df['Location'].apply(extract_organ)

# Boolean flags: does the location text mention lung / brain?
# Plain pattern strings replace the original f-strings, which nested the same
# quote character and therefore only parse on Python 3.12+. na=False makes a
# missing location count as "no match", so both columns are always plain
# booleans (a later astype(bool) would have turned NaN into True).
summary_df['Primary'] = summary_df['Location'].str.contains('lung|Lung', regex=True, na=False)

# Create Brain column
summary_df['Brain'] = summary_df['Location'].str.contains('brain|Brain', regex=True, na=False)

In [None]:
# Count columns to add up, written one tissue category per row
columns_to_sum = [
    'EPCAM Positive-CD163', 'EPCAM Positive-CD8', 'EPCAM Positive-EPCAM',
    'EPCAM Positive-Non-interest', 'EPCAM Positive-EPCAM/GFAP', 'EPCAM Positive-All',
    'EPCAM Negative-CD163', 'EPCAM Negative-CD8', 'EPCAM Negative-EPCAM',
    'EPCAM Negative-Non-interest', 'EPCAM Negative-EPCAM/GFAP', 'EPCAM Negative-All',
    'Blank-CD163', 'Blank-CD8', 'Blank-EPCAM',
    'Blank-Non-interest', 'Blank-EPCAM/GFAP', 'Blank-All',
    'GFAP Positive-CD163', 'GFAP Positive-CD8', 'GFAP Positive-EPCAM',
    'GFAP Positive-Non-interest', 'GFAP Positive-EPCAM/GFAP', 'GFAP Positive-All',
    'All-CD163', 'All-CD8', 'All-EPCAM',
    'All-Non-interest', 'All-EPCAM/GFAP', 'All-All',
]

# Sum the per-file counts within each (patient, organ, primary, brain) group
# and turn the group keys back into ordinary columns.
group_keys = ['Patient Number', 'Organ', 'Primary', 'Brain']
summary_df_grouped = (
    summary_df
    .groupby(group_keys)[columns_to_sum]
    .sum()
    .reset_index()
)
summary_df_grouped

Unnamed: 0,Patient Number,Organ,Primary,Brain,EPCAM Positive-CD163,EPCAM Positive-CD8,EPCAM Positive-EPCAM,EPCAM Positive-Non-interest,EPCAM Positive-EPCAM/GFAP,EPCAM Positive-All,EPCAM Negative-CD163,EPCAM Negative-CD8,EPCAM Negative-EPCAM,EPCAM Negative-Non-interest,EPCAM Negative-EPCAM/GFAP,EPCAM Negative-All,Blank-CD163,Blank-CD8,Blank-EPCAM,Blank-Non-interest,Blank-EPCAM/GFAP,Blank-All,GFAP Positive-CD163,GFAP Positive-CD8,GFAP Positive-EPCAM,GFAP Positive-Non-interest,GFAP Positive-EPCAM/GFAP,GFAP Positive-All,All-CD163,All-CD8,All-EPCAM,All-Non-interest,All-EPCAM/GFAP,All-All
0,10,Brain,False,True,3144,2037,143275,14478,465,166559,2558,1195,1657,15959,2,21742,156,102,885,1123,4,2376,3045,434,7106,9407,844,21165,8905,3768,152935,40978,1315,211874
1,10,Lung,True,False,20860,6020,387907,248225,1,675563,50324,22179,39981,758526,0,886226,4521,8210,6067,109040,0,130951,23,0,152,568,0,777,75734,36410,434114,1116652,1,1693913
2,11,Brain,False,True,23339,3166,330394,23645,638,389465,5084,1398,2210,17125,2,26263,231,1351,1692,3382,8,6942,7650,3720,20339,131874,3909,171250,36304,9635,354646,176081,4557,593997
3,11,Lung,True,False,8276,8640,156468,89632,0,267834,19459,34788,83733,631919,0,783527,1024,12066,4664,92439,0,114524,25,4,16,1691,0,1763,28786,55514,244893,816023,0,1168170
4,12,Brain,False,True,8677,8038,847210,320510,668,1206065,3448,8592,1913,54701,5,70163,9589,27104,5493,264569,27,318798,5471,2643,17189,74258,1397,102738,27188,46385,871828,714253,2097,1698042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,85,Lung,True,False,61157,46984,1498051,219178,4,1857733,14634,19092,33362,191793,0,263075,1269,786,10907,42358,0,56633,22,2,282,816,0,1177,77095,66865,1542633,454599,4,2179232
119,87,Lung,True,False,8975,8537,168958,96622,16,288082,10136,13421,6977,153194,1,186741,9646,6012,14568,258887,3,294315,113,14,92,4586,0,4917,28882,27986,190598,513585,20,774437
120,87,Pleura,False,False,111,173,5300,3154,0,8868,9599,4034,2983,118503,1,137586,751,351,257,17707,0,19475,113,0,5,356,0,482,10578,4560,8545,139985,1,166716
121,9,Brain,False,True,9524,561,86957,46703,1,146287,2436,962,2784,41679,0,48732,216,26,304,1640,0,2307,3254,425,213,39807,60,44469,15430,1974,90258,129829,61,241795


In [None]:
# Write the grouped table to 'phenotype_grouped.csv' in the current working directory.
# NOTE(review): with the default index=True the 0..n-1 row index left by
# reset_index() is written as an unnamed first column — pass index=False if
# readers of this file do not expect that column.
summary_df_grouped.to_csv('phenotype_grouped.csv')