In [1]:
import os
import re
import pandas as pd

In [2]:
def parse_starting_pos(pos_range):
#     print(pos_range)
    return int(pos_range.rstrip().split('-')[0])

def parse_ending_pos(pos_range):
#     print(pos_range)
    return int(pos_range.rstrip().split('-')[1])

def load_accession_pdb_map(fn):
    pdb_map_df = pd.read_csv(fn, sep='\t', header=None, names=['PDB', 'Chain', 'Range', 'Descriptor', 'Pfam Family', 'Accession', 'Position Range'], index_col=None, engine='python')
    pdb_map_df['PDB'] = pdb_map_df['PDB'].str.rstrip(';')
    pdb_map_df['Chain'] = pdb_map_df['Chain'].str.rstrip(';')
    pdb_map_df['Range'] = pdb_map_df['Range'].str.rstrip(';')
    pdb_map_df['Descriptor'] = pdb_map_df['Descriptor'].str.rstrip(';')
    pdb_map_df['Pfam Family'] = pdb_map_df['Pfam Family'].str.rstrip(';')
    pdb_map_df['Accession'] = pdb_map_df['Accession'].str.rstrip(';')
    pdb_map_df['Position Range'] = pdb_map_df['Position Range'].str.rstrip(';')
    print(pdb_map_df)
    pdb_map_df['Start Position'] = pdb_map_df['Position Range'].apply(parse_starting_pos) #lambda x: int(x.rstrip().split('-')[0]))
    pdb_map_df['End Position'] = pdb_map_df['Position Range'].apply(parse_ending_pos) # lambda x: int(x.rstrip().split('-')[1]))
    print(f'There are {len(pdb_map_df["Pfam Family"].unique())} unique families in this mapping.')
    print(f'There are {len(pdb_map_df["PDB"].unique())} unique PDBs in this mapping.')
    print(f'There are {len(pdb_map_df["Accession"].unique())} Uniprot accessions in this mapping.')
    pdb_map_groups = pdb_map_df.groupby('Pfam Family')
    pdb_counts = pdb_map_groups['PDB'].nunique()
    print(f'The highest number of PDBs for a Pfam family is {pdb_counts.max()} and the lowest is {pdb_counts.min()}')
    accession_counts = pdb_map_groups['Accession'].nunique()
    print(f'The highest number of Uniprot accessions for a Pfam family is {accession_counts.max()} and the lowest is {accession_counts.min()}')
    return pdb_map_df

In [3]:
pdb_map_fn = '/media/daniel/ExtraDrive1/Pfam/pdbmap'
pdb_map_df = load_accession_pdb_map(pdb_map_fn)

         PDB Chain Range Descriptor Pfam Family Accession Position Range
0       4ZLP     A   NaN      Notch     PF00066    Q9UM47      1385-1418
1       5CZX     B   NaN      Notch     PF00066    Q9UM47      1387-1418
2       5CZV     A   NaN      Notch     PF00066    Q9UM47      1391-1418
3       5CZX     A   NaN      Notch     PF00066    Q9UM47      1387-1418
4       4ZLP     B   NaN      Notch     PF00066    Q9UM47      1392-1418
...      ...   ...   ...        ...         ...       ...            ...
698484  1GM5     A   NaN     RecG_C     PF19833    Q9WY48        711-755
698485  6OX6     A   NaN     Ntox46     PF15538    Q9I739        266-373
698486  4ZV4     D   NaN     Ntox46     PF15538    Q9I739        266-422
698487  4ZV0     A   NaN     Ntox46     PF15538    Q9I739        282-422
698488  4ZV4     C   NaN     Ntox46     PF15538    Q9I739        266-422

[698489 rows x 7 columns]
There are 9921 unique families in this mapping.
There are 157520 unique PDBs in this mapping.
The

In [4]:
def extract_pfam_alignment(pfam_uniprot_fn, number):
    alignment = []
    with open(pfam_uniprot_fn, 'r') as handle:
        counter = 0
        for line in handle:
            if line.startswith('//'):
                counter += 1
                if counter == number:
                    break
            elif counter == (number - 1):
                alignment.append(line)
            else:
                pass
    return alignment

In [14]:
def process_alignment(aln_line_list):
    header = {}
    aln_dict = {}
    header_line_pattern = re.compile(r'^#=GF\s+([A-Z]{2})\s+(.*)\s*$')
    for line in aln_line_list:
        header_match = header_line_pattern.match(line)
        if header_match:
            if header_match.group(1) in header:
                header[header_match.group(1)] += header_match.group(2)
            else:
                header[header_match.group(1)] = header_match.group(2)
        elif line.startswith('#'):
            pass
        else:
#             print(line)
            line_list = line.split()
#             print(line_list)
#             print(line_list[0])
#             print(type(line_list[0]))
#             print(line_list[1])
#             print(type(line_list[1]))
            aln_dict[line_list[0]] = line_list[1]
#             print(aln_dict)
#             break
    return header, aln_dict

In [6]:
pfam_full_uniprot_fn = '/media/daniel/ExtraDrive1/Pfam/Pfam-A.full.uniprot'

In [7]:
alignment = extract_pfam_alignment(pfam_full_uniprot_fn, 1)
print(alignment[:50])

['# STOCKHOLM 1.0\n', '#=GF ID   1-cysPrx_C\n', '#=GF AC   PF10417.11\n', '#=GF DE   C-terminal domain of 1-Cys peroxiredoxin\n', '#=GF AU   Finn RD;0000-0001-8626-2148\n', '#=GF AU   Coggill P;0000-0001-5731-1588\n', '#=GF SE   Gene3D, pdb_1prx\n', '#=GF GA   21.10 21.10;\n', '#=GF TC   21.10 21.10;\n', '#=GF NC   21.00 21.00;\n', '#=GF BM   hmmbuild HMM.ann SEED.ann\n', '#=GF SM   hmmsearch -Z 57096847 -E 1000 --cpu 4 HMM pfamseq\n', '#=GF TP   Domain\n', '#=GF RN   [1]\n', '#=GF RM   9587003\n', '#=GF RT   Crystal structure of a novel human peroxidase enzyme at 2.0 A\n', '#=GF RT   resolution. \n', '#=GF RA   Choi HJ, Kang SW, Yang CH, Rhee SG, Ryu SE; \n', '#=GF RL   Nat Struct Biol. 1998;5:400-406.\n', '#=GF RN   [2]\n', '#=GF RM   15004285\n', '#=GF RT   Activation of the antioxidant enzyme 1-CYS peroxiredoxin\n', '#=GF RT   requires glutathionylation mediated by heterodimerization with\n', '#=GF RT   pi GST. \n', '#=GF RA   Manevich Y, Feinstein SI, Fisher AB; \n', '#=GF RL   Pr

In [16]:
h1, aln1 = process_alignment(alignment)
print(h1)
print(len(aln1))
print(len(aln1.keys()))
print(list(aln1.keys()))

{'ID': '1-cysPrx_C', 'AC': 'PF10417.11', 'DE': 'C-terminal domain of 1-Cys peroxiredoxin', 'AU': 'Finn RD;0000-0001-8626-2148Coggill P;0000-0001-5731-1588', 'SE': 'Gene3D, pdb_1prx', 'GA': '21.10 21.10;', 'TC': '21.10 21.10;', 'NC': '21.00 21.00;', 'BM': 'hmmbuild HMM.ann SEED.ann', 'SM': 'hmmsearch -Z 57096847 -E 1000 --cpu 4 HMM pfamseq', 'TP': 'Domain', 'RN': '[1][2]', 'RM': '958700315004285', 'RT': 'Crystal structure of a novel human peroxidase enzyme at 2.0 Aresolution. Activation of the antioxidant enzyme 1-CYS peroxiredoxinrequires glutathionylation mediated by heterodimerization withpi GST. ', 'RA': 'Choi HJ, Kang SW, Yang CH, Rhee SG, Ryu SE; Manevich Y, Feinstein SI, Fisher AB; ', 'RL': 'Nat Struct Biol. 1998;5:400-406.Proc Natl Acad Sci U S A. 2004;101:3780-3785.', 'DR': 'INTERPRO; IPR019479;SO; 0000417; polypeptide_domain;', 'CC': "This is the C-terminal domain of 1-Cys peroxiredoxin (1-cysPrx),a member of the peroxiredoxin superfamily which protect cellsagainst membrane ox