# Setup

In [1]:
import os
import re
from collections import defaultdict
from operator import itemgetter

import numpy as np
import pandas as pd

from IPython.display import display

# Directory holding the downloaded input files (and receiving the generated FASTA file).
# Can be overridden with the PFAM_DATA_DIR environment variable so the notebook is not tied
# to one machine's absolute path; the original location remains the default.
DIR = os.environ.get('PFAM_DATA_DIR', r'c://downloads')

# Q1 - Load the dataset

In [2]:
# From: ftp://ftp.ebi.ac.uk/pub/databases/Pfam/releases/Pfam31.0/proteomes/9606.tsv.gz
pfam_tsv_path = os.path.join(DIR, '9606.tsv.gz')

# The separator is a regex (tab, or the '> <' between header fields), which requires the python engine.
# 'No_clan' entries are read as missing values.
human_pfam_records = pd.read_csv(pfam_tsv_path, sep = r'\t|> <', skiprows = 2, na_values = ['No_clan'], engine = 'python')

# Normalize column names: strip '#', '<' and '>', turn spaces/hyphens into underscores, lowercase.
human_pfam_records = human_pfam_records.rename(
    columns = lambda name: re.sub(r'[ \-]', '_', re.sub(r'[#\<\>]', '', name)).lower()
)
display(human_pfam_records)

Unnamed: 0,seq_id,alignment_start,alignment_end,envelope_start,envelope_end,hmm_acc,hmm_name,type,hmm_start,hmm_end,hmm_length,bit_score,e_value,clan
0,A0A024QZ18,69,147,66,147,PF00595,PDZ,Domain,4,82,82,51.3,1.600000e-10,CL0466
1,A0A024QZ33,5,123,4,123,PF09745,DUF2040,Coiled-coil,2,121,121,124.9,2.600000e-33,
2,A0A024QZ42,25,84,22,86,PF13499,EF-hand_7,Domain,4,69,71,41.7,1.800000e-07,CL0220
3,A0A024QZB8,40,436,39,437,PF02487,CLN3,Family,2,398,399,461.1,4.800000e-135,
4,A0A024QZP7,4,287,4,287,PF00069,Pkinase,Domain,1,264,264,258.8,7.400000e-74,CL0016
5,A0A024QZX5,11,380,10,380,PF00079,Serpin,Domain,2,370,370,435.8,2.300000e-127,
6,A0A024R0K5,40,140,38,141,PF07686,V-set,Domain,3,108,109,47.4,2.600000e-09,CL0011
7,A0A024R0K5,239,318,239,318,PF13895,Ig_2,Domain,1,79,79,62.7,4.500000e-14,CL0011
8,A0A024R0K5,418,495,417,496,PF13895,Ig_2,Domain,2,78,79,56.9,3.000000e-12,CL0011
9,A0A024R0K5,596,663,595,674,PF13895,Ig_2,Domain,2,69,79,40.2,4.700000e-07,CL0011


# A

In [3]:
# Number of records for each value of the 'type' column.
record_type_counts = human_pfam_records['type'].value_counts()
display(record_type_counts)

Domain         57203
Family         31138
Repeat          8354
Motif            761
Coiled-coil      508
Disordered       272
Name: type, dtype: int64

# B

In [4]:
# Length of each aligned region; both alignment coordinates are counted (hence the +1).
human_pfam_records['length'] = human_pfam_records['alignment_end'] - human_pfam_records['alignment_start'] + 1
human_pfam_domains = human_pfam_records[human_pfam_records['type'] == 'Domain']

print('Average pfam domain length: %.2f aa' % human_pfam_domains['length'].mean())

# Collect the lengths of all occurrences of each HMM. Kept as a defaultdict(list) because later
# cells look up entries by HMM name. Iterating the two needed columns directly avoids
# DataFrame.iterrows(), which builds a full Series object for every row.
lengths_per_hmm = defaultdict(list)

for hmm_name, length in zip(human_pfam_domains['hmm_name'], human_pfam_domains['length']):
    lengths_per_hmm[hmm_name].append(length)

avg_length_per_hmm = {hmm: np.average(lengths) for hmm, lengths in lengths_per_hmm.items()}

# Sorted from the longest average length to the shortest.
sorted_hmm_and_avg_length = sorted(avg_length_per_hmm.items(), key = itemgetter(1), reverse = True)

for title, entries in [
    ('The five longest domains:', sorted_hmm_and_avg_length[:5]),
    ('The five shortest domains:', sorted_hmm_and_avg_length[-5:]),
]:
    print(title)

    for hmm_name, avg_length in entries:
        print('\t' + '%s: %.2f aa' % (hmm_name, avg_length))

Average pfam domain length: 88.60 aa
The five longest domains:
	Tet_JBP: 626.20 aa
	Glyco_hydro_15: 559.56 aa
	SMC_N: 554.00 aa
	Dynein_heavy: 536.65 aa
	RNA_pol_Rpb1_5: 524.17 aa
The five shortest domains:
	EF-hand_5: 18.68 aa
	OAR: 17.71 aa
	zf-CCCH_2: 17.48 aa
	zf-CCHC: 16.67 aa
	zf-C2H2_10: 16.00 aa


# C

In [5]:
from scipy.stats import ttest_ind

def print_domain_length_statistics(hmm_name, domain_pretty_name = None, lengths_by_hmm = None):
    """
    Print the number of occurrences and the average length of one HMM.

    hmm_name: the key to look up in the lengths mapping.
    domain_pretty_name: the name to print; defaults to hmm_name.
    lengths_by_hmm: mapping from HMM name to a list of lengths; defaults to the
        notebook-level lengths_per_hmm.

    An HMM name that is missing from the mapping is reported with 0 occurrences and
    an average of nan.
    """

    if domain_pretty_name is None:
        domain_pretty_name = hmm_name

    if lengths_by_hmm is None:
        lengths_by_hmm = lengths_per_hmm

    # .get() instead of [] so that looking up an unknown name does not insert an
    # empty entry into a defaultdict as a side effect.
    hmm_lengths = lengths_by_hmm.get(hmm_name, [])

    # np.mean of an empty list emits a RuntimeWarning; produce nan directly instead.
    avg_length = np.mean(hmm_lengths) if len(hmm_lengths) > 0 else float('nan')
    print('%s: %d occurrences, avg size %.2f aa.' % (domain_pretty_name, len(hmm_lengths), avg_length))

# Summary line for each of the two HMMs being compared (None falls back to the HMM name).
for compared_hmm, compared_pretty_name in [('G-alpha', None), ('Pkinase_Tyr', 'Tyrosine kinase')]:
    print_domain_length_statistics(compared_hmm, compared_pretty_name)

# Two-sample t-test between the length lists of the two HMMs.
g_alpha_lengths = lengths_per_hmm['G-alpha']
tyrosine_kinase_lengths = lengths_per_hmm['Pkinase_Tyr']
_statistic, pval = ttest_ind(g_alpha_lengths, tyrosine_kinase_lengths)
print('t-test p-value: %.2f' % pval)

G-alpha: 34 occurrences, avg size 231.71 aa.
Tyrosine kinase: 299 occurrences, avg size 221.08 aa.
t-test p-value: 0.46


G-alpha and tyrosine kinase domains appear to be similar in length, with no significant difference (p = 0.46).

# D

In [6]:
import gzip

from Bio import SeqIO

# Downloaded as FASTA: http://www.uniprot.org/uniprot/?query=taxonomy%3A%22Homo+sapiens+%5B9606%5D%22&sort=score
with gzip.open(os.path.join(DIR, 'uniprot_human_proteins.fasta.gz'), 'rt') as fasta_file:
    # Keyed by the second '|'-separated field of each FASTA record id.
    uniprot_id_to_seq = {
        fasta_record.id.split('|')[1]: str(fasta_record.seq)
        for fasta_record in SeqIO.parse(fasta_file, 'fasta')
    }

# Records whose seq_id is absent from the FASTA file get NaN.
human_pfam_records['protein_seq'] = human_pfam_records['seq_id'].map(uniprot_id_to_seq)

def get_hmm_seq(record):
    """
    Return the part of record['protein_seq'] covered by the record's alignment,
    or NaN when the protein sequence is missing.

    alignment_start and alignment_end are treated as 1-based, inclusive positions.
    """
    full_seq = record['protein_seq']

    if pd.isnull(full_seq):
        return np.nan

    # Convert the 1-based inclusive range into Python's 0-based, end-exclusive slice.
    first = record['alignment_start'] - 1
    return full_seq[first:record['alignment_end']]

# Extract the aligned subsequence of every record (row-wise apply).
human_pfam_records['seq'] = human_pfam_records.apply(get_hmm_seq, axis = 'columns')

# Re-select the domain records so that the selection includes the newly added columns.
is_domain_record = human_pfam_records['type'] == 'Domain'
human_pfam_domains = human_pfam_records[is_domain_record]

display(human_pfam_domains)

Unnamed: 0,seq_id,alignment_start,alignment_end,envelope_start,envelope_end,hmm_acc,hmm_name,type,hmm_start,hmm_end,hmm_length,bit_score,e_value,clan,length,protein_seq,seq
0,A0A024QZ18,69,147,66,147,PF00595,PDZ,Domain,4,82,82,51.3,1.600000e-10,CL0466,79,MPLLWITGPRYHLILLSEASCLRANYVHLCPLFQHRWLETCNAPPQ...,LVRGYAGFGLTLGGGRDVAGDTPLAVRGLLKDGPAQRCGRLEVGDL...
2,A0A024QZ42,25,84,22,86,PF13499,EF-hand_7,Domain,4,69,71,41.7,1.800000e-07,CL0220,60,MFDRENKAGVNFSEFTGVWKYITDWQNVFRTYDRDNSGMIDKNELK...,WQNVFRTYDRDNSGMIDKNELKQALSGFGYRLSDQFHDILIRKFDR...
4,A0A024QZP7,4,287,4,287,PF00069,Pkinase,Domain,1,264,264,258.8,7.400000e-74,CL0016,284,MEDYTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPS...,YTKIEKIGEGTYGVVYKGRHKTTGQVVAMKKIRLESEEEGVPSTAI...
5,A0A024QZX5,11,380,10,380,PF00079,Serpin,Domain,2,370,370,435.8,2.300000e-127,,370,MSAIMDVLAEANGTFALNLLKTLGKDNSKNVFFSPMSMSCALAMVY...,ANGTFALNLLKTLGKDNSKNVFFSPMSMSCALAMVYMGAKGNTAAQ...
6,A0A024R0K5,40,140,38,141,PF07686,V-set,Domain,3,108,109,47.4,2.600000e-09,CL0011,101,MESPSAPPHRWCIPWQRLLLTASLLTFWNPPTTAKLTIESTPFNVA...,STPFNVAEGKEVLLLVHNLPQHLFGYSWYKGERVDGNRQIIGYVIG...
7,A0A024R0K5,239,318,239,318,PF13895,Ig_2,Domain,1,79,79,62.7,4.500000e-14,CL0011,80,MESPSAPPHRWCIPWQRLLLTASLLTFWNPPTTAKLTIESTPFNVA...,APTISPLNTSYRSGENLNLSCHAASNPPAQYSWFVNGTFQQSTQEL...
8,A0A024R0K5,418,495,417,496,PF13895,Ig_2,Domain,2,78,79,56.9,3.000000e-12,CL0011,78,MESPSAPPHRWCIPWQRLLLTASLLTFWNPPTTAKLTIESTPFNVA...,PTISPSYTYYRPGVNLSLSCHAASNPPAQYSWLIDGNIQQHTQELF...
9,A0A024R0K5,596,663,595,674,PF13895,Ig_2,Domain,2,69,79,40.2,4.700000e-07,CL0011,68,MESPSAPPHRWCIPWQRLLLTASLLTFWNPPTTAKLTIESTPFNVA...,PIISPPDSSYLSGANLNLSCHSASNPSPQYSWRINGIPQQHTQVLF...
10,A0A024R0K5,146,219,146,219,PF13927,Ig_3,Domain,1,79,79,35.8,1.400000e-05,CL0011,74,MESPSAPPHRWCIPWQRLLLTASLLTFWNPPTTAKLTIESTPFNVA...,KPSISSNNSKPVEDKDAVAFTCEPETQDATYLWWVNNQSLPVSPRL...
11,A0A024R0K5,502,575,502,575,PF13927,Ig_3,Domain,1,79,79,34.5,3.500000e-05,CL0011,74,MESPSAPPHRWCIPWQRLLLTASLLTFWNPPTTAKLTIESTPFNVA...,KPSISSNNSKPVEDKDAVAFTCEPEAQNTTYLWWVNGQSLPVSPRL...


# E

According to [Pfam](https://pfam.xfam.org/family/PDZ#tabview=tab0): "The PDZ domain is a common structural domain of 80-90 amino-acids found in the signaling proteins of bacteria, yeast, plants, viruses[1] and animals.[2] Proteins containing PDZ domains play a key role in anchoring receptor proteins in the membrane to cytoskeletal components. Proteins with these domains help hold together and organize signaling complexes at cellular membranes. These domains play a key role in the formation and function of signal transduction complexes.[3] PDZ domains also play a highly significant role in the anchoring of cell surface receptors (such as Cftr and FZD7) to the actin cytoskeleton via mediators like NHERF and ezrin.[4]"

In [7]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# NOTE: the Bio.Alphabet module was removed in Biopython 1.78, so it is no longer imported here.
# Seq(seq) without an explicit alphabet is used instead.

# Aligned sequences of all PDZ occurrences; occurrences without a known protein sequence are dropped.
pdz_seqs = human_pfam_domains.loc[human_pfam_domains['hmm_name'] == 'PDZ', 'seq'].dropna()

# The seq_id of each remaining occurrence, in the same order as pdz_seqs
# (assumes the index labels of human_pfam_domains are unique, as each was looked up individually before).
pdz_descriptions = human_pfam_domains.loc[pdz_seqs.index, 'seq_id']

# Iterate the columns directly instead of Series.iteritems(), which was removed in pandas 2.0.
# Each FASTA record is identified by its row index and described by its seq_id.
pdz_records = [
    SeqRecord(Seq(seq), id = str(index), description = seq_id)
    for index, seq, seq_id in zip(pdz_seqs.index, pdz_seqs, pdz_descriptions)
]

# Last expression of the cell: the value returned by SeqIO.write is shown as the cell output.
SeqIO.write(pdz_records, os.path.join(DIR, 'pdz_seqs.fasta'), 'fasta')

656

Uploading this FASTA file to ClustalW has generated the Multiple Sequence Alignment (MSA) results available [here](https://drive.google.com/file/d/14Gwj021gjIT_T3HILhSFYx3dZkI3W5_E/view?usp=sharing).

Uploading this MSA to [Seq2Logo](http://www.cbs.dtu.dk/biotools/Seq2Logo-2.1/) has generated the following logo:

![Generated logo - part 1](https://drive.google.com/uc?id=12AYdw3kmegNJ5hP3WPC9iQ_d0ftq4oco)
![Generated logo - part 2](https://drive.google.com/uc?id=1gcy1Nko99Q3svRU8J0zqLUp1nuG85TOP)

And here is Pfam's logo:

![Pfam's logo](https://pfam.xfam.org/family/PF00595/logo_image)

The two logos show some similar motifs (e.g. a strong D amino-acid often preceded by a G, with a strong I and VNG sequences later on). On the other hand, the two profiles are also remarkably different (most notably, the two profiles are of a different length, with no clear correspondence between each part of one profile to the other).

Such differences are not surprising, and are even expected, mainly due to the following reasons:
1. Pfam actually uses a different method to generate profiles. Its profiles are HMM-based, while ClustalW uses simple MSA.
2. Pfam's profile was created from a seed of sequences from multiple organisms, whereas our profile was based solely on human occurrences.

More generally, we can describe the process that occurred as follows:
1. Pfam selected a seed of sequences.
2. Pfam used these sequences to construct an HMM profile.
3. Pfam then used the HMM profile to detect all the sequences in human (and in other organisms) matching it.
4. We took these human sequences, and generated a new profile from them.

Even if we used the exact same methodology, some differences would still be expected, as moving back and forth from sequences to profiles and from profiles to sequences is a process that might take a while to converge, and it's possible to end up with different profile/sequences on each iteration of this process.

# F

In [8]:
# Count how many PDZ occurrences each protein has.
is_pdz_record = human_pfam_domains['hmm_name'] == 'PDZ'
pdz_seq_ids = human_pfam_domains.loc[is_pdz_record, 'seq_id']
pdz_occurances_per_protein = pdz_seq_ids.value_counts()

# Single-element unpacking: raises unless exactly one protein has 9 PDZ occurrences.
has_9_pdzs = pdz_occurances_per_protein == 9
[protein_with_9_pdzs] = pdz_occurances_per_protein[has_9_pdzs].index

print('The protien with 9 copies of PDZ is: %s' % protein_with_9_pdzs)
print('And here are the 9 occurrences:')

is_selected_protein = human_pfam_domains['seq_id'] == protein_with_9_pdzs
display(human_pfam_domains[is_selected_protein & is_pdz_record])

The protien with 9 copies of PDZ is: Q8NI35
And here are the 9 occurrences:


Unnamed: 0,seq_id,alignment_start,alignment_end,envelope_start,envelope_end,hmm_acc,hmm_name,type,hmm_start,hmm_end,hmm_length,bit_score,e_value,clan,length,protein_seq,seq
81989,Q8NI35,1439,1516,1435,1517,PF00595,PDZ,Domain,3,81,82,65.5,6.1e-15,CL0466,78,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,EISKGRSGLGLSIVGGKDTPLNAIVIHEVYEEGAAARDGRLWAGDQ...
81990,Q8NI35,368,449,365,450,PF00595,PDZ,Domain,4,81,82,58.4,9.8e-13,CL0466,82,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,LVRKDGQSLGIRIVGYVGTSHTGEASGIYVKSIIPGSAAYHNGHIQ...
81991,Q8NI35,1535,1611,1534,1612,PF00595,PDZ,Domain,3,81,82,57.0,2.8e-12,CL0466,77,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,DLQKKAGRGLGLSIVGKRNGSGVFISDIVKGGAADLDGRLIQGDQI...
81992,Q8NI35,254,324,248,325,PF00595,PDZ,Domain,8,81,82,55.0,1.2e-11,CL0466,71,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,DGSGLGFGIVGGKTSGVVVRTIVPGGLADRDGRLQTGDHILKIGGT...
81993,Q8NI35,135,217,134,218,PF00595,PDZ,Domain,2,81,82,53.0,5e-11,CL0466,83,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,IDIERPSTGGLGFSVVALRSQNLGKVDIFVKDVQPGSVADRDQRLK...
81994,Q8NI35,1071,1153,1069,1157,PF00595,PDZ,Domain,4,78,82,52.8,5.7e-11,CL0466,83,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,IFREPNVSLGISIVGGQTVIKRLKNGEELKGIFIKQVLEDSPAGKT...
81995,Q8NI35,1678,1758,1676,1759,PF00595,PDZ,Domain,3,81,82,50.9,2.2e-10,CL0466,81,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,EINRELSDALGISIAGGRGSPLGDIPVFIAMIQASGVAARTQKLKV...
81996,Q8NI35,1241,1317,1239,1319,PF00595,PDZ,Domain,3,80,82,45.3,1.2e-08,CL0466,77,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,ELEKDKNGLGLSLAGNKDRSRMSIFVVGINPEGPAAADGRMRIGDE...
81997,Q8NI35,690,769,686,770,PF00595,PDZ,Domain,5,81,82,30.7,0.00046,CL0466,80,MPENPATDKLQVLQVLDRLKMKLQEKGDTSQNEKLSMFYETLKSPL...,VKDCKGLGFSILDYQDPLDPTRSVIVIRSLVADGVAERSGGLLPGD...


According to [UniProt](https://www.uniprot.org/uniprot/Q8NI35), the InaD-like protein (coded by the PATJ gene in humans) is a "scaffolding protein that may bring different proteins into adjacent positions at the cell membrane (Probable). May regulate protein targeting, cell polarity and integrity of tight junctions (PubMed:11927608, PubMed:16678097). May regulate the surface expression and/or function of ASIC3 in sensory neurons (By similarity). May recruit ARHGEF18 to apical cell-cell boundaries (PubMed:22006950)."