In [1]:
import re, sys, os 
import argparse
from itertools import combinations
import subprocess
import pandas as pd

In [2]:
# Part 0: Read file and extract basic information: (1)pdb_id, (2)chain numbers, (3)chain IDs, (4)data for each chain

# Populate paths_to_process_list
def populate_paths_to_list(path):
    """Collect the .pdb file paths designated by ``path``.

    Args:
        path: Path of a single ``.pdb`` file, or of a directory whose direct
            entries are scanned for ``.pdb`` files (no recursion). The
            extension check is case-insensitive.

    Returns:
        list: The matching file paths. Directory entries are returned in
        sorted order so repeated runs process files in the same sequence.
        The list is empty (never ``None``) when nothing matches, so callers
        can always iterate over the result.
    """
    paths_to_process_list = []

    if os.path.isfile(path):
        if path.lower().endswith('.pdb'):
            paths_to_process_list.append(path)
    elif os.path.isdir(path):
        for filename in sorted(os.listdir(path)):
            filepath = os.path.join(path, filename)
            # Skip sub-directories that merely happen to be named "*.pdb".
            if filename.lower().endswith('.pdb') and os.path.isfile(filepath):
                paths_to_process_list.append(filepath)

    # Reported for every "nothing found" case: missing path, wrong
    # extension, or a directory without PDB files.
    if not paths_to_process_list:
        print(f'No valid PDB files found in path: {path}. The program is quit, please try again.')
    return paths_to_process_list

# Extract the model id from the .pdb file
def extract_pdb_id(filepath):
    """Return the file name of ``filepath`` stripped of its directory and its last extension."""
    filename = os.path.basename(filepath)
    stem = os.path.splitext(filename)[0]
    return stem

# Extract the chains id and information lines of chains from the .pdb file
def read_file_and_extract_chain_id(pdb_file):
    """Read a PDB file and collect the chain identifiers of its ATOM records.

    Args:
        pdb_file: Path of the file to read.

    Returns:
        tuple: ``(chain_set, lines)`` where ``chain_set`` is the set of
        characters found at index 21 of every line starting with ``ATOM``
        and ``lines`` is the list of all lines of the file, unmodified.
    """
    with open(pdb_file, 'r') as file:
        lines = file.readlines()

    chain_set = set()
    for line in lines:
        if not line.startswith('ATOM'):
            continue
        # Slice instead of indexing so a truncated ATOM record is skipped
        # rather than raising IndexError; the line terminator is removed
        # first so it can never be mistaken for a chain id.
        chain_id = line.rstrip('\r\n')[21:22]
        if chain_id:
            chain_set.add(chain_id)
    return chain_set, lines

# Driver for Part 0: load every PDB file designated by `path`.
# NOTE(review): `path` is an absolute path on one specific machine; it has to
# be edited before this cell can run anywhere else.
path = '/Users/luna/Documents/RP1/2uv8.pdb'
paths_to_process_list = populate_paths_to_list(path)
# pdb_id, chain_set and lines are reassigned on every iteration, so once the
# loop ends they describe the LAST file only; the cells below read them.
for pathway in paths_to_process_list:
    pdb_id = extract_pdb_id(pathway)
    chain_set, lines = read_file_and_extract_chain_id(pathway)
    print(f'File of PDB model {pdb_id} is loaded for processing, contain {len(chain_set)} chains: {sorted(chain_set)}')
    # print(f'File type: pdb_id: {type(pdb_id)} pdb_chains: {type(chain_set)} pdb_lines: {type(lines)}')

File of PDB model 2uv8 is loaded for processing, contain 6 chains: ['A', 'B', 'C', 'G', 'H', 'I']


In [3]:
# Part 1: Generate all individual and combined files for each PDB model

# Write info of each chain into individual files
def write_individual_files(pdb_id, chain_set, lines):
    """Write one ``<pdb_id>_<chain>.pdb`` file per chain into the working directory.

    Args:
        pdb_id: Prefix used for the output file names.
        chain_set: Iterable of one-character chain identifiers.
        lines: Lines of the source PDB file.

    Returns:
        list: Names of the files written, in the iteration order of
        ``chain_set``. Each file holds the ``ATOM`` lines whose character at
        index 21 equals the chain identifier.
    """
    pdb_filenames = []
    for chain in chain_set:
        pdb_filename = f'{pdb_id}_{chain}.pdb'
        with open(pdb_filename, 'w') as file:
            # line[21:22] (a slice) yields '' for truncated records instead
            # of raising IndexError like line[21] would.
            file.writelines(
                line for line in lines
                if line.startswith('ATOM') and line[21:22] == chain
            )
        print(f'Written individual-chain-file: {pdb_filename}')
        pdb_filenames.append(pdb_filename)
    return pdb_filenames

# Replaces the chain_set produced by the loading cell with a hard-coded list,
# so only chains A and G are written. pdb_id and lines still come from the
# loading cell.
chain_set = ['A', 'G']
individual_pdb_filename_list = write_individual_files(pdb_id, chain_set, lines)
print(individual_pdb_filename_list)

Written individual-chain-file: 2uv8_A.pdb
Written individual-chain-file: 2uv8_G.pdb
['2uv8_A.pdb', '2uv8_G.pdb']


In [4]:
# For a part of, or all chains, write info of every pair of chains into combined files
def write_part_or_all_combined_files(pdb_id, chain_set, lines):
    """Write one ``<pdb_id>_<XY>.pdb`` file for every unordered pair of distinct chains.

    Args:
        pdb_id: Prefix used for the output file names.
        chain_set: Iterable of one-character chain identifiers; duplicates
            are ignored.
        lines: Lines of the source PDB file.

    Returns:
        list: Names of the files written, ordered by the sorted chain pairs.
        Each file holds the ``ATOM`` lines whose character at index 21 is
        one of the two chains of the pair.
    """
    pdb_filename_list = []
    # combinations() only yields unique, non-identical pairs when its input
    # has no repeated items, so de-duplicate first.
    unique_chains = sorted(set(chain_set))
    for chain_pair in combinations(unique_chains, 2):
        pdb_filename = f'{pdb_id}_{"".join(chain_pair)}.pdb'
        with open(pdb_filename, 'w') as file:
            # The slice keeps truncated ATOM records from raising IndexError.
            file.writelines(
                line for line in lines
                if line.startswith('ATOM') and line[21:22] in chain_pair
            )
        print(f'Written combined-chain-file: {pdb_filename}')
        pdb_filename_list.append(pdb_filename)
    return pdb_filename_list


In [5]:
# For two designated chains, write info of this pair into a combined file
def write_designated_combined_files(pdb_id, chain_set, lines):
    """Write the ``ATOM`` lines of two designated chains into ``<pdb_id>_<XY>.pdb``.

    Args:
        pdb_id: Prefix used for the output file name.
        chain_set: Exactly two different one-character chain identifiers.
            A sequence keeps its order in the file name; a set (which has no
            order and cannot be indexed) is sorted first.
        lines: Lines of the source PDB file.

    Returns:
        list: A one-element list with the name of the file written, or an
        empty list (after printing an error) when ``chain_set`` does not
        hold two different chains.
    """
    pdb_filename_list = []
    if isinstance(chain_set, (set, frozenset)):
        chains = sorted(chain_set)
    else:
        chains = list(chain_set)

    if len(chains) == 2 and chains[0] != chains[1]:
        chain_pair = (chains[0], chains[1])
        pdb_filename = f'{pdb_id}_{chains[0]}{chains[1]}.pdb'
        with open(pdb_filename, 'w') as file:
            # The slice keeps truncated ATOM records from raising IndexError.
            file.writelines(
                line for line in lines
                if line.startswith('ATOM') and line[21:22] in chain_pair
            )
        print(f"Written combined-chains-file: {pdb_filename}")
        pdb_filename_list.append(pdb_filename)
    else:
        print("Error: Please specify two different chains.")
    return pdb_filename_list

# Hard-coded pair of chains to combine; pdb_id and lines are the values left
# behind by the loading cell.
chain_set = ['A', 'G']
combined_pdb_filename_list = write_designated_combined_files(pdb_id, chain_set, lines)
print(combined_pdb_filename_list)

Written combined-chains-file: 2uv8_AG.pdb
['2uv8_AG.pdb']


In [6]:
# Part 2

# 2.1 Perform naccess on individual-chain-files and combined-chains-files
def perform_naccess_on_files(pdb_filename):
    """Run the ``./naccess`` executable on one PDB file.

    Args:
        pdb_filename: Name of the PDB file handed to ``./naccess``.

    Returns:
        list: ``[<pdb_filename without its last extension>.rsa]`` when the
        command exits successfully, otherwise an empty list. The existence
        of the .rsa file is not checked here.
        NOTE(review): assumes naccess names its output after the input file
        stem - confirm against the installed naccess version.
    """
    rsa_filename_list = []
    # splitext removes only the final extension, so names such as
    # './2uv8_A.pdb' or 'my.model_A.pdb' keep their full stem
    # (split('.')[0] would cut them at the first dot).
    basename, _ = os.path.splitext(pdb_filename)
    rsa_filename = f'{basename}.rsa'
    try:
        subprocess.run(["./naccess", pdb_filename], check=True)
    except subprocess.CalledProcessError as e:
        print(f'Error running "naccess" on {pdb_filename}: {e}')
    except FileNotFoundError as e:
        print(f'The "naccess" program was not found: {e}')
    else:
        rsa_filename_list.append(rsa_filename)
        print(f'"naccess" successfully processed {pdb_filename}\n')
    return rsa_filename_list

# Driver for 2.1: run naccess on every PDB file written by the earlier cells
# and collect the names of the resulting .rsa files. Both input lists
# (individual_pdb_filename_list, combined_pdb_filename_list) are notebook
# globals created by those cells.
individual_rsa_filename_list = []
combined_rsa_filename_list = []
# For individual chain
if individual_pdb_filename_list:  # Check if the list is not empty
    print(f'The individual .pdb files for [naccess] to process: {individual_pdb_filename_list}')
    for individual_pdb_filename in individual_pdb_filename_list:
        # perform_naccess_on_files returns a list, hence extend()
        individual_rsa_filename_list.extend(perform_naccess_on_files(individual_pdb_filename))
    print(f'The individual .rsa files generated: {individual_rsa_filename_list}\n')
# For combined chains
if combined_pdb_filename_list:  # Check if the list is not empty
    print(f'The combined .pdb files for [naccess] to process: {combined_pdb_filename_list}')
    for combined_pdb_filename in combined_pdb_filename_list:
        combined_rsa_filename_list.extend(perform_naccess_on_files(combined_pdb_filename))
    print(f'The combined .rsa files generated: {combined_rsa_filename_list}\n')


The individual .pdb files for [naccess] to process: ['2uv8_A.pdb', '2uv8_G.pdb']
naccess: using vdw.radii in local directory
naccess: using STD FILE in local directory
"naccess" successfully processed 2uv8_A.pdb

naccess: using vdw.radii in local directory
naccess: using STD FILE in local directory
"naccess" successfully processed 2uv8_G.pdb

The individual .rsa files generated: ['2uv8_A.rsa', '2uv8_G.rsa']

The combined .pdb files for [naccess] to process: ['2uv8_AG.pdb']
naccess: using vdw.radii in local directory
naccess: using STD FILE in local directory
"naccess" successfully processed 2uv8_AG.pdb

The combined .rsa files generated: ['2uv8_AG.rsa']



In [7]:
# 2.2 Extract data from .rsa file and return as a pandas DataFrame
def extract_rsa_data_into_df(rsa_filename):
    """Parse the ``RES`` lines of an .rsa file into a DataFrame.

    A line is kept when it matches: ``RES``, a three-character residue name,
    a single-letter chain id, an integer residue number and ten numeric
    fields. Lines that do not match this layout are skipped silently.

    Args:
        rsa_filename: Path of the .rsa file to read.

    Returns:
        tuple: ``(df, df_name)``. ``df`` has one row per matching line and
        14 positionally numbered columns (the literal ``'RES'`` followed by
        the 13 captured fields); every value is kept as a string. ``df_name``
        is ``rsa_filename`` without its last extension.
    """
    # Ten value columns follow the residue number; build the pattern by
    # repetition instead of spelling the same group out ten times.
    value_field = r'\s+(-?[\d\.]+)'
    line_pattern = re.compile(r'^RES\s+(\w{3})\s+([A-Za-z])\s*(\d+)' + value_field * 10)

    data = []
    with open(rsa_filename, 'r') as file:
        for line in file:
            match = line_pattern.match(line)
            if match:
                data.append(('RES',) + match.groups())

    # No column names are assigned: downstream code addresses columns by position.
    df = pd.DataFrame(data, columns=None)

    # splitext removes only the final extension, so a stem that itself
    # contains dots is preserved (split('.')[0] would cut at the first dot).
    df_name, _ = os.path.splitext(rsa_filename)
    return df, df_name


# Driver for 2.2: parse every .rsa file into a DataFrame and store it in a
# dictionary keyed by the name returned by extract_rsa_data_into_df.
# NOTE(review): print(df) writes the whole frame into the cell output;
# df.head() would keep the notebook smaller.
df_individual_dic = {}
df_combined_dic = {}
# For individual chain
for individual_rsa_file in individual_rsa_filename_list:
    df_individual, df_individual_name = extract_rsa_data_into_df(individual_rsa_file)
    df_individual_dic[df_individual_name] = df_individual
    print(f'The DataFrame {df_individual_name} is created, shape: {df_individual.shape}')
    print(df_individual)
# For combined chains
for combined_rsa_file in combined_rsa_filename_list:
    df_combined, df_combined_name = extract_rsa_data_into_df(combined_rsa_file)
    df_combined_dic[df_combined_name] = df_combined
    print(f'The DataFrame {df_combined_name} is created, shape: {df_combined.shape}')
    print(df_combined)
                         

The DataFrame 2uv8_A is created, shape: (1613, 14)
       0    1  2     3       4     5       6     7      8      9      10  \
0     RES  MET  A     1  157.83  81.3   86.87  55.5  70.96  189.2  89.98   
1     RES  LYS  A     2  114.48  57.0  108.26  66.3   6.22   16.6  61.32   
2     RES  PRO  A     3   97.77  71.8   92.91  77.5   4.86   29.9  94.77   
3     RES  GLU  A     4  104.97  60.9   99.83  74.1   5.14   13.7  69.89   
4     RES  VAL  A     5   59.26  39.1   59.10  51.7   0.16    0.4  59.10   
...   ...  ... ..   ...     ...   ...     ...   ...    ...    ...    ...   
1608  RES  SER  A  1743   24.51  21.0   22.95  29.4   1.57    4.1  22.50   
1609  RES  TYR  A  1744   88.21  41.5   85.62  48.3   2.60    7.3  60.77   
1610  RES  ILE  A  1745  108.70  62.1   84.62  61.3  24.09   64.8  84.62   
1611  RES  ASN  A  1746  120.70  83.9   91.59  86.2  29.11   77.2  39.74   
1612  RES  ALA  A  1747   84.00  77.8   27.16  39.1  56.85  147.5  45.74   

         11     12     13  
0      5

In [9]:
# 2.3 Match names
def find_matching_individual_dfs_to_combined_df(df_combined_name):
    """Derive the two individual DataFrame names from a combined DataFrame name.

    Args:
        df_combined_name: Name of the form ``<pdb_id>_<XY>`` where ``X`` and
            ``Y`` are the one-character ids of the two combined chains.

    Returns:
        tuple: ``('<pdb_id>_<X>', '<pdb_id>_<Y>')``.

    Raises:
        ValueError: If the name contains no underscore.
        IndexError: If fewer than two characters follow the last underscore.
    """
    # Split on the LAST underscore only, so a pdb id that itself contains
    # underscores (e.g. 'my_model_AG') is kept intact.
    pdb_id, chain_set = df_combined_name.rsplit('_', 1)
    chain1_id, chain2_id = chain_set[0], chain_set[1]

    df_individual_1_name = f"{pdb_id}_{chain1_id}"
    df_individual_2_name = f"{pdb_id}_{chain2_id}"

    return df_individual_1_name, df_individual_2_name

# Driver for 2.3: look up the two individual DataFrames belonging to each
# combined DataFrame.
# df_chain1 / df_chain2 (and the loop variables df_combined_name and
# df_combined) are reassigned on every iteration, so after the loop they
# refer to the LAST combined entry only. dict.get() yields None when no
# individual DataFrame is stored under the derived name.
for df_combined_name, df_combined in df_combined_dic.items():
    df_chain1_name, df_chain2_name = find_matching_individual_dfs_to_combined_df(df_combined_name)
    # Fetch the corresponding individual DataFrames from df_individual_dic
    df_chain1 = df_individual_dic.get(df_chain1_name)
    df_chain2 = df_individual_dic.get(df_chain2_name)
    print(f'Combined df: {df_combined_name}')
    print(f'df_chain1: {df_chain1_name}')
    print(f'df_chain2: {df_chain2_name}')

Combined df: 2uv8_AG
df_chain1: 2uv8_A
df_chain2: 2uv8_G


In [10]:
# Format decimal places (ABS: 2 decimals, REL: 1 decimal)
def format_decimal_places(df):
    """Format the value columns of ``df`` as fixed-precision strings.

    The first four columns are treated as identifiers and left alone. From
    the fifth column on, columns alternate ABS, REL, ABS, REL, ...: ABS
    values are rendered with two decimals, REL values with one. Values that
    cannot be converted to a number become NaN and stay NaN.

    Args:
        df: DataFrame to format. It is modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """
    abs_columns = df.columns[4::2]  # 5th, 7th, 9th, ... column
    rel_columns = df.columns[5::2]  # 6th, 8th, 10th, ... column

    for columns, spec in ((abs_columns, '.2f'), (rel_columns, '.1f')):
        for col in columns:
            numeric = pd.to_numeric(df[col], errors='coerce')
            # Replace the whole column (df[col] = ...) instead of writing
            # through df.loc[:, col]: storing strings into a numeric column
            # via .loc is an incompatible-dtype setitem that pandas
            # deprecates, whereas column replacement may change the dtype.
            df[col] = numeric.apply(
                lambda x, spec=spec: format(x, spec) if pd.notnull(x) else x
            )
    return df

In [12]:

# 2.4 For each chain, subtract the ABS and REL values found in the combined DataFrame (that chain's rows)
#     from those of the individual-chain DataFrame, and write the results to file
def _write_subtracted_csv(df, filename):
    """Write ``df`` to ``filename`` below the two fixed ``REM`` header rows."""
    header_1 = 'REM,RES_NUM, , ,All-atoms, ,Total-Side, ,Main-Chain, ,Non-polar, ,All-polar\n'
    header_2 = 'REM,AA,CHAIN,NUM,ABS,REL,ABS,REL,ABS,REL,ABS,REL,ABS,REL\n'
    with open(filename, 'w') as file:
        file.write(header_1)
        file.write(header_2)
        df.to_csv(file, sep=',', index=False, header=False)


def _subtract_chain(df_chain, df_combined, chain_id):
    """Return the identifier columns of ``df_chain`` joined with (individual - combined) values for ``chain_id``."""
    individual = df_chain.copy()
    # Rows of the combined frame whose third column equals this chain id.
    combined = df_combined[df_combined.iloc[:, 2] == chain_id].copy().reset_index(drop=True)

    # Rows are paired by position, not by residue number; a length mismatch
    # means some rows are subtracted from the wrong residue or from 0.
    if len(individual) != len(combined):
        print(f'Warning: chain {chain_id} has {len(individual)} rows individually but '
              f'{len(combined)} rows in the combined data; rows are aligned by position.')

    # Only the value columns (everything after the first four identifier
    # columns) take part in the subtraction.
    for frame in (individual, combined):
        for col in frame.columns[4:]:
            frame[col] = pd.to_numeric(frame[col], errors='coerce')

    difference = individual[individual.columns[4:]].subtract(
        combined[combined.columns[4:]], fill_value=0
    )
    identifiers = individual.iloc[:, :4].reset_index(drop=True)
    result = pd.concat([identifiers, difference.reset_index(drop=True)], axis=1)
    # Render ABS with 2 decimals and REL with 1 decimal.
    return format_decimal_places(result)


def subtract_rsa_values_and_write_to_file(df_chain1, df_chain2, df_combined, df_combined_name):
    """Subtract combined-complex values from individual-chain values and save them.

    For each of the two chains named in ``df_combined_name`` the value
    columns of that chain's rows in ``df_combined`` are subtracted from the
    value columns of the individual-chain DataFrame. Each result is written
    to ``<pdb_id>_<chain>-<XY>.csv`` in the working directory.

    Args:
        df_chain1: DataFrame of the first chain alone.
        df_chain2: DataFrame of the second chain alone.
        df_combined: DataFrame of the two chains together; its third column
            holds the chain id.
        df_combined_name: Name of the form ``<pdb_id>_<XY>``; anything from
            the first dot after the chain ids on is ignored.

    Returns:
        tuple: ``(df_subtracted_1, df_subtracted_2, filename_1, filename_2)``.
        The input DataFrames are not modified.
    """
    # Split on the LAST underscore so pdb ids containing underscores work.
    pdb_id, chain_part = df_combined_name.rsplit('_', 1)
    chain_set = chain_part.split('.')[0]
    chain1_id, chain2_id = chain_set[0], chain_set[1]

    df_subtracted_1 = _subtract_chain(df_chain1, df_combined, chain1_id)
    df_subtracted_2 = _subtract_chain(df_chain2, df_combined, chain2_id)

    df_subtracted_1_filename = f'{pdb_id}_{chain1_id}-{chain_set}.csv'
    df_subtracted_2_filename = f'{pdb_id}_{chain2_id}-{chain_set}.csv'

    _write_subtracted_csv(df_subtracted_1, df_subtracted_1_filename)
    print(f'\nWritten subtraction file {df_subtracted_1_filename}, aa: {len(df_subtracted_1)}')
    _write_subtracted_csv(df_subtracted_2, df_subtracted_2_filename)
    print(f'Written subtraction file {df_subtracted_2_filename}, aa: {len(df_subtracted_2)}')

    return df_subtracted_1, df_subtracted_2, df_subtracted_1_filename, df_subtracted_2_filename

# Driver for 2.4. df_chain1, df_chain2, df_combined and df_combined_name are
# the values left behind by the name-matching loop, i.e. only the LAST
# combined DataFrame is processed here.
df_subtracted_1, df_subtracted_2, df_subtracted_1_filename, df_subtracted_2_filename = subtract_rsa_values_and_write_to_file(df_chain1, df_chain2, df_combined, df_combined_name)
print(f'{df_subtracted_1_filename}: \n{df_subtracted_1}')
print(f'{df_subtracted_2_filename}: \n{df_subtracted_2}')


Written subtraction file 2uv8_A-AG.csv, aa: 1613
Written subtraction file 2uv8_G-AG.csv, aa: 2033
2uv8_A-AG.csv: 
       0    1  2     3      4     5      6     7      8     9      10    11  \
0     RES  MET  A     1  78.09  40.2  65.09  41.6  13.01  34.7  65.09  41.2   
1     RES  LYS  A     2  55.29  27.5  51.84  31.7   3.45   9.2  40.15  34.4   
2     RES  PRO  A     3   1.44   1.0   0.00   0.0   1.44   8.8   0.00   0.0   
3     RES  GLU  A     4   1.60   0.9   0.00   0.0   1.60   4.3   0.00   0.0   
4     RES  VAL  A     5  33.46  22.1  33.47  29.3   0.00   0.0  33.47  29.0   
...   ...  ... ..   ...    ...   ...    ...   ...    ...   ...    ...   ...   
1608  RES  SER  A  1743   0.00   0.0   0.00   0.0   0.00   0.0   0.00   0.0   
1609  RES  TYR  A  1744   0.00   0.0   0.00   0.0   0.00   0.0   0.00   0.0   
1610  RES  ILE  A  1745   0.00   0.0   0.00   0.0   0.00   0.0   0.00   0.0   
1611  RES  ASN  A  1746   0.00   0.0   0.00   0.0   0.00   0.0   0.00   0.0   
1612  RES  ALA  

In [13]:
# 2.5 Check for buried residues and write to files
def check_for_buried_residues_and_write_to_file(df_subtracted_1, df_subtracted_2, df_subtracted_1_filename, df_subtracted_2_filename):
    """Select the rows with a positive All-atoms difference and save them per chain.

    A row counts as buried when its fifth column (All-atoms ABS) or its
    sixth column (All-atoms REL) is greater than 0. For each input frame
    that has such rows they are written to
    ``<pdb_id>_<chain>-<XY>_buried_residues.csv`` in the working directory;
    otherwise only a message is printed.

    Args:
        df_subtracted_1: Subtraction result of the first chain.
        df_subtracted_2: Subtraction result of the second chain.
        df_subtracted_1_filename: Name of the form ``<pdb_id>_<chain>-<XY>.csv``
            belonging to ``df_subtracted_1``.
        df_subtracted_2_filename: Same for ``df_subtracted_2``.

    Returns:
        None. The input DataFrames are not modified, so the rows are written
        with the same value formatting they came in with.
    """
    header_1 = 'REM,RES_NUM, , ,All-atoms, ,Total-Side, ,Main-Chain, ,Non-polar, ,All-polar\n'
    header_2 = 'REM,AA,CHAIN,NUM,ABS,REL,ABS,REL,ABS,REL,ABS,REL,ABS,REL\n'

    # Parse both names before writing anything so a malformed name fails
    # early. Each file name is parsed on its own; the last underscore
    # separates the pdb id, so ids containing underscores or dots work.
    jobs = []
    for df_subtracted, source_filename in (
        (df_subtracted_1, df_subtracted_1_filename),
        (df_subtracted_2, df_subtracted_2_filename),
    ):
        pdb_id, tail = source_filename.rsplit('_', 1)
        chain_id, combined_id = tail.split('.')[0].split('-')
        jobs.append((df_subtracted, pdb_id, chain_id, combined_id))

    for df_subtracted, pdb_id, chain_id, combined_id in jobs:
        # Numeric views are used for the comparison only; the frame itself
        # keeps its (string-formatted) values.
        all_atoms_abs = pd.to_numeric(df_subtracted.iloc[:, 4], errors='coerce')
        all_atoms_rel = pd.to_numeric(df_subtracted.iloc[:, 5], errors='coerce')
        buried_residues = df_subtracted.loc[(all_atoms_abs > 0) | (all_atoms_rel > 0)]

        if buried_residues.empty:
            print(f'No residues buried in PDB model {pdb_id} complex {combined_id}, by chain {chain_id}')
            continue

        filename = f'{pdb_id}_{chain_id}-{combined_id}_buried_residues.csv'
        with open(filename, 'w') as file:
            file.write(header_1)
            file.write(header_2)
            buried_residues.to_csv(file, sep=',', index=False, header=False)
        print(f'Written buried residues to file, PDB model {pdb_id} complex {combined_id}, by chain {chain_id}, aa: {len(buried_residues)}')

       
# Driver for 2.5: uses the frames and file names returned by the subtraction cell.
check_for_buried_residues_and_write_to_file(df_subtracted_1, df_subtracted_2, df_subtracted_1_filename, df_subtracted_2_filename)


Written buried residues to file, PDB model 2uv8 complex AG, by chain A, aa: 109
Written buried residues to file, PDB model 2uv8 complex AG, by chain G, aa: 157
