In [None]:
import os
from Bio import SeqIO

def create_individual_fasta_files(fasta_file, output_directory):
    """Split a multi-record FASTA file into one FASTA file per record.

    Every record parsed from ``fasta_file`` is written to
    ``<output_directory>/<record.id>.fasta``. Path separators inside a
    record ID are replaced with ``_`` so the ID always maps to a plain file
    name inside ``output_directory``.

    Args:
        fasta_file: FASTA source handed to ``SeqIO.parse``.
        output_directory: Directory receiving the per-record files; it is
            created (including parents) when missing.
    """
    # exist_ok=True removes the check-then-create race of a separate
    # os.path.exists() test and makes the cell safe to re-run.
    os.makedirs(output_directory, exist_ok=True)

    for record in SeqIO.parse(fasta_file, "fasta"):
        # An ID such as "a/b" would otherwise point into a sub-directory that
        # does not exist and abort the split half-way through.
        safe_id = record.id
        for separator in filter(None, (os.sep, os.altsep)):
            safe_id = safe_id.replace(separator, "_")

        file_name = os.path.join(output_directory, f"{safe_id}.fasta")
        with open(file_name, "w") as output_handle:
            SeqIO.write(record, output_handle, "fasta")

# Using the provided FASTA file
# NOTE(review): both paths below are absolute and machine-specific; adjust them before running elsewhere.
fasta_file = "/home/samith/Downloads/peptides_lengthLT60_pdbAvailable.fasta"
output_directory = "/media/samith/My Passport/Bench/fasta"  # Replace with your desired output directory

# Write one FASTA file per record of fasta_file into output_directory.
create_individual_fasta_files(fasta_file, output_directory)

In [None]:
import csv
import requests
import os
import shutil

# Paths
# Directory in which copy_fasta_file() looks for "<starpep_id>.fasta".
fasta_directory = "/media/samith/My Passport/Bench/fasta"  # Update this path
# CSV read by read_metadata(); it must provide 'Peptide' and 'Metadata' columns.
metadata_file = "/home/samith/Downloads/peptides_lt60_metadata.csv"  # Update this path if necessary
# main() creates one sub-folder per peptide ID below this directory.
# NOTE(review): the chain-extraction cell later in this notebook reads from
# "/media/samith/My Passport/output" (no "Bench" component) — confirm which location is intended.
base_output_directory = "/media/samith/My Passport/Bench/output"  # Replace with your desired output directory

def read_metadata(csv_path):
    """Map every peptide in the metadata CSV to the PDB IDs listed for it.

    The CSV needs a header row with 'Peptide' and 'Metadata' columns. The
    'Metadata' cell is split on ';', and each entry containing 'PDB:'
    contributes the text after its last ':' as one PDB ID. Rows that share a
    peptide are merged.

    Args:
        csv_path: Path of the UTF-8 encoded CSV file.

    Returns:
        dict mapping the stripped peptide string to a set of PDB ID strings.
        A peptide without any PDB entry maps to an empty set.

    Raises:
        KeyError: If the 'Peptide' or 'Metadata' column is absent.
    """
    starpep_to_pdb = {}
    with open(csv_path, mode='r', newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            peptide = row['Peptide'].strip()
            # DictReader fills cells missing from a short row with None.
            metadata = row['Metadata'] or ''
            candidates = (
                entry.split(':')[-1].strip()
                for entry in metadata.split(';')
                if 'PDB:' in entry
            )
            # A blank ID (e.g. "PDB:" with nothing after it) would later be
            # turned into a download request for ".pdb", so drop it here.
            starpep_to_pdb.setdefault(peptide, set()).update(
                pdb_id for pdb_id in candidates if pdb_id
            )

    return starpep_to_pdb

def download_pdb_files(pdb_ids, output_path, timeout=30):
    """Download ``<pdb_id>.pdb`` from files.rcsb.org for every given PDB ID.

    A successful response (HTTP 200) is written to
    ``<output_path>/<pdb_id>.pdb``. Any other status code, or a network-level
    failure, is reported with ``print`` and the loop moves on to the next ID,
    so one bad entry cannot abort the whole batch.

    Args:
        pdb_ids: Iterable of PDB ID strings.
        output_path: Existing directory that receives the downloaded files.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled connection would block the notebook forever.
    """
    for pdb_id in pdb_ids:
        url = f'https://files.rcsb.org/download/{pdb_id}.pdb'
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as error:
            # Connection errors and timeouts are per-ID failures, not fatal.
            print(f"Failed to download {pdb_id}. Error: {error}")
            continue

        if response.status_code == 200:
            file_path = os.path.join(output_path, f'{pdb_id}.pdb')
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded {pdb_id}.pdb")
        else:
            print(f"Failed to download {pdb_id}. Status code: {response.status_code}")

def copy_fasta_file(starpep_id, fasta_directory, output_path):
    """Copy ``<starpep_id>.fasta`` from ``fasta_directory`` to ``output_path``.

    Prints a confirmation after copying, or a "not found" message when the
    source file does not exist (in which case nothing is copied).

    Args:
        starpep_id: Peptide ID; the source file is named ``<starpep_id>.fasta``.
        fasta_directory: Directory searched for the source file.
        output_path: Destination passed to ``shutil.copy``.
    """
    source = os.path.join(fasta_directory, f"{starpep_id}.fasta")

    # Guard clause: report the missing file and leave without copying.
    if not os.path.exists(source):
        print(f"FASTA file for {starpep_id} not found in {fasta_directory}")
        return

    shutil.copy(source, output_path)
    print(f"Copied FASTA file for {starpep_id}")

def main():
    """Build one output folder per peptide with its FASTA file and PDB files.

    Reads the peptide -> PDB ID mapping from ``metadata_file`` and, for each
    peptide, creates ``<base_output_directory>/<peptide id>``, copies the
    peptide's FASTA file from ``fasta_directory`` into it and downloads the
    listed PDB files into it.
    """
    for starpep_id, pdb_ids in read_metadata(metadata_file).items():
        target_dir = os.path.join(base_output_directory, starpep_id)
        # exist_ok keeps re-runs from failing on folders created earlier.
        os.makedirs(target_dir, exist_ok=True)

        copy_fasta_file(starpep_id, fasta_directory, target_dir)
        download_pdb_files(pdb_ids, target_dir)

# Run the download pipeline only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

In [None]:
from Bio import SeqIO
from Bio.PDB import PDBParser, PPBuilder, PDBIO, Select
from Bio import pairwise2
import os

class ChainSelect(Select):
    """Selection filter that accepts only the chain with a given ID."""

    def __init__(self, chain_letter):
        """Store the ID of the single chain that should be accepted."""
        self.chain_letter = chain_letter

    def accept_chain(self, chain):
        """Return whether ``chain``'s ID equals the stored ``chain_letter``."""
        wanted = self.chain_letter
        return wanted == chain.get_id()

def extract_chain_sequences(pdb_file):
    """Return the peptide sequence of every chain in the first model of a PDB file.

    Only the first model is inspected. ``save_chain`` looks a chain up in
    ``structure[0]``, so reporting a chain that exists only in a later model
    would make the subsequent save fail; multi-model files would also be
    processed once per model for no benefit.

    Args:
        pdb_file: Path (or handle) given to ``PDBParser.get_structure``.

    Returns:
        dict mapping chain ID to the concatenation of the sequences of all
        polypeptides that ``PPBuilder`` builds for that chain (an empty
        string when none are built). Empty dict when the structure contains
        no model.
    """
    parser = PDBParser()
    structure = parser.get_structure("PDB_structure", pdb_file)

    first_model = next(iter(structure), None)
    if first_model is None:
        return {}

    ppb = PPBuilder()
    # A chain can be split into several polypeptides; join them in order.
    return {
        chain.id: "".join(str(pp.get_sequence()) for pp in ppb.build_peptides(chain))
        for chain in first_model
    }

def align_sequences(pdb_sequences, fasta_sequence):
    """Find the chain whose sequence aligns best with the FASTA sequence.

    Each chain sequence is globally aligned against ``fasta_sequence`` with
    ``pairwise2.align.globalxx``. Only the alignment score is requested
    (``score_only=True``) because the alignments themselves are never used;
    this avoids building and tracing back every optimal alignment.

    Args:
        pdb_sequences: dict mapping chain ID to chain sequence (str).
        fasta_sequence: Reference sequence; converted with ``str()``.

    Returns:
        dict with keys 'score' and 'chain'. On ties the chain encountered
        first wins. When nothing can be aligned (no chains, or empty
        sequences) the result is ``{'score': -1, 'chain': None}``.
    """
    best_match = {'score': -1, 'chain': None}

    target = str(fasta_sequence)
    if not target:
        return best_match

    for chain_id, chain_seq in pdb_sequences.items():
        # Empty sequences cannot be aligned; skip them explicitly instead of
        # relying on what the aligner returns for them.
        if not chain_seq:
            continue
        score = pairwise2.align.globalxx(chain_seq, target, score_only=True)
        if score > best_match['score']:
            best_match['score'] = score
            best_match['chain'] = chain_id

    return best_match

def save_chain(pdb_file, chain_id, output_file):
    """Write one chain of the first model of ``pdb_file`` to ``output_file``.

    Only the first model is written. Previously the first model was looked
    up but the whole structure was handed to ``PDBIO``, so every model of a
    multi-model file ended up in the output.

    Args:
        pdb_file: Path (or handle) given to ``PDBParser.get_structure``.
        chain_id: ID of the chain to keep.
        output_file: Destination passed to ``PDBIO.save``.

    Raises:
        KeyError: If the structure has no model 0 or that model has no chain
            ``chain_id``.
    """
    parser = PDBParser()
    structure = parser.get_structure("PDB_structure", pdb_file)
    model = structure[0]  # Only the first model is exported
    if chain_id not in model:
        raise KeyError(f"Chain {chain_id!r} not found in the first model of {pdb_file}")

    io = PDBIO()
    # Hand PDBIO the model rather than the structure so that later models
    # are left out; ChainSelect then filters that model down to one chain.
    io.set_structure(model)
    io.save(output_file, ChainSelect(chain_id))

def process_folder(folder_path):
    """Extract, for each PDB file in a folder, the chain best matching the folder's FASTA.

    The folder is expected to hold one ``.fasta`` file and one or more
    ``.pdb`` files. For every PDB file the chain whose sequence aligns best
    with the FASTA sequence is written next to it as
    ``<pdb name>_<chain id>.pdb``.

    Files that are themselves the result of an earlier run (their name is
    ``<another pdb name in the folder>_<something>.pdb``) are not treated as
    inputs, so the function can be re-run on the same folder.

    Args:
        folder_path: Directory to process.
    """
    fasta_file_path = None
    pdb_names = []

    # sorted() makes the processing order, and the choice among several
    # FASTA files (the last one wins), independent of the file system.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith('.fasta'):
            fasta_file_path = os.path.join(folder_path, file)
        elif file.endswith('.pdb'):
            pdb_names.append(file)

    # Outputs are written into this same folder with a ".pdb" suffix; skip
    # them so a second run does not produce "<id>_<chain>_<chain>.pdb".
    stems = [os.path.splitext(name)[0] for name in pdb_names]
    pdb_files = [
        os.path.join(folder_path, name)
        for name, stem in zip(pdb_names, stems)
        if not any(stem.startswith(f"{other}_") for other in stems)
    ]

    if not fasta_file_path or not pdb_files:
        print(f"No FASTA or PDB files found in folder: {folder_path}")
        return

    fasta_sequence = SeqIO.read(fasta_file_path, "fasta").seq

    for pdb_file in pdb_files:
        pdb_sequences = extract_chain_sequences(pdb_file)
        chain_id = align_sequences(pdb_sequences, fasta_sequence)['chain']

        if chain_id is None:
            print(f"No matching chain found in {pdb_file}")
            continue

        pdb_id = os.path.splitext(os.path.basename(pdb_file))[0]
        # The original f-string contained a stray "$", producing names such
        # as "1abc_$A.pdb".
        output_file = f"{pdb_id}_{chain_id}.pdb"
        output_file_path = os.path.join(folder_path, output_file)
        print(f"Saving chain {chain_id} of {pdb_id} to {output_file}")
        save_chain(pdb_file, chain_id, output_file_path)

# Root directory holding one "starPep_*" sub-folder per peptide.
# NOTE(review): the download cell writes its folders below
# "/media/samith/My Passport/Bench/output" — confirm this path is the intended one.
top_level_directory = "/media/samith/My Passport/output"

# Run the chain extraction on every matching sub-folder; anything that is
# not a directory or does not carry the "starPep_" prefix is skipped.
for folder_name in os.listdir(top_level_directory):
    folder_path = os.path.join(top_level_directory, folder_name)
    if not folder_name.startswith("starPep_"):
        continue
    if not os.path.isdir(folder_path):
        continue
    process_folder(folder_path)

print("All folders processed.")

In [None]:
import os

def count_pdb_files_in_subfolders(directory_path):
    """Count the files ending in '.pdb' below ``directory_path``.

    The directory tree is traversed with ``os.walk``, so files in the
    directory itself and in all nested sub-directories are included.

    Args:
        directory_path: Root directory of the search.

    Returns:
        int: Number of file names with the '.pdb' suffix.
    """
    pdb_names = [
        name
        for _root, _dirs, names in os.walk(directory_path)
        for name in names
        if name.endswith('.pdb')
    ]
    return len(pdb_names)

# Specify the top-level directory you want to search
top_level_directory = '/media/samith/My Passport/output'  # Replace with the path to your directory
# The count covers every '.pdb' file at any depth; chain files written by
# process_folder() into the same folders also end in '.pdb' and are included.
number_of_pdb_files = count_pdb_files_in_subfolders(top_level_directory)

print(f"There are {number_of_pdb_files} PDB files in the directory and its subfolders.")