In [3]:
# Python script to count unique IDs in the first column and write them to a file
def count_unique_ids(file_path, output_file="unique_pdb_ids.txt"):
    unique_ids = set()
    
    with open(file_path, 'r') as f:
        # Skip header line
        next(f)
        for line in f:
            if line.strip():
                columns = line.strip().split('\t')
                unique_ids.add(columns[0])
    
    # Write unique IDs to output file
    with open(output_file, 'w') as out_f:
        for pdb_id in sorted(unique_ids):
            out_f.write(f"{pdb_id}\n")
    
    print(f"Total unique IDs found: {len(unique_ids)}")
    print(f"Unique PDB IDs have been written to {output_file}")
    return unique_ids


In [5]:

file_path = "/cosybio/project/mabouzid/EEI_networks/EEI-Conservation-main/data/uniprot_pdb_Ensembl_finalized_mus.txt"
output_file = "/cosybio/project/mabouzid/EEI_networks/EEI-Conservation-main/data/unique_pdb_ids_mus.txt"
unique_ids = count_unique_ids(file_path, output_file)
print("Unique IDs:", sorted(list(unique_ids)))

Total unique IDs found: 216
Unique PDB IDs have been written to /cosybio/project/mabouzid/EEI_networks/EEI-Conservation-main/data/unique_pdb_ids_mus.txt
Unique IDs: ['1abo', '1awc', '1b07', '1bqh', '1dow', '1gbr', '1ikn', '1j19', '1jeg', '1juf', '1k8d', '1kgy', '1kj3', '1kyf', '1leg', '1lez', '1lk2', '1lnu', '1oeb', '1oy3', '1pq1', '1s4z', '1s7q', '1shw', '1uef', '1uj0', '1uti', '1vet', '1xdk', '1y19', '1ymt', '2bzw', '2c1m', '2ckl', '2clv', '2clz', '2ems', '2f4m', '2f8n', '2fnj', '2hdx', '2hmh', '2j3t', '2j3w', '2jkr', '2ld7', '2nla', '2o26', '2pbi', '2psm', '2ptt', '2ql2', '2qpy', '2qxv', '2rm0', '2v87', '2v8c', '2v8f', '2vof', '2vog', '2voh', '2voi', '2w10', '2wd5', '2wp1', '2wp2', '2xl2', '2xl3', '2z32', '2zxx', '3a9j', '3a9k', '3ade', '3aji', '3bp6', '3c7k', '3c8k', '3cqx', '3dmm', '3eg5', '3ejj', '3f5c', '3f7d', '3hs8', '3jv6', '3kio', '3l51', '3mj7', '3ml4', '3mnp', '3o7l', '3oky', '3oux', '3qbq', '3ro3', '3t6q', '3tnp', '3ul0', '3unb', '3unf', '3utm', '3vq2', '3wcy', '3wn7', '3

In [7]:
def compare_pdb_ids(unique_ids_file, dir_list_file):
    """
    Compare PDB IDs from two files and find common entries.
    
    Args:
        unique_ids_file: Path to the file with unique PDB IDs (from previous script)
        dir_list_file: Path to the file with directory names (from ls command)
        output_file: Path to save the common PDB IDs
    """
    # Read unique PDB IDs from first file
    with open(unique_ids_file, 'r') as f:
        unique_ids = set(line.strip() for line in f if line.strip())
    
    # Read directory list from second file
    with open(dir_list_file, 'r') as f:
        dir_ids = set(line.strip() for line in f if line.strip())
    
    # Find common IDs
    common_ids = unique_ids.intersection(dir_ids)
    
    # Print summary
    print(f"Unique PDB IDs in first file: {len(unique_ids)}")
    print(f"Directory names in second file: {len(dir_ids)}")
    print(f"Common PDB IDs found in both files: {len(common_ids)}")
    
    # Return counts for further analysis if needed
    return {
        "unique_ids_count": len(unique_ids),
        "dir_ids_count": len(dir_ids),
        "common_ids_count": len(common_ids),
        "common_ids": common_ids
    }

In [8]:
dir_list = "/cosybio/project/mabouzid/EEI_networks/EEI-Conservation-main/data/unique_pdb_ids_mus.txt"
unique = "/cosybio/project/mabouzid/EEI_networks/EEI-Conservation-main/data/pdb_directory_list.txt"
compare_pdb_ids(unique, dir_list)

Unique PDB IDs in first file: 2608
Directory names in second file: 216
Common PDB IDs found in both files: 2


{'unique_ids_count': 2608,
 'dir_ids_count': 216,
 'common_ids_count': 2,
 'common_ids': {'2j3t', '4p6z'}}