# Finding Closest Pair of Sequences

Template Executed by: [Tony Kabilan Okeke](mailto:tko35@drexel.edu)

Write your function in a file named `"ptns_closestpair.py"`, which is imported in this notebook.

You do not need to change anything in this Jupyter notebook. Run this file to produce the outputs. Then save it as a PDF. Submit both your PDF file, as well as ptns_closestpair.py file on Blackboard.

In [1]:
%load_ext autoreload

In [2]:
# Imports
%autoreload 2
import bmes

from Bio import SeqIO
from ptns_closestpair import ptns_closestpair

## Test 1

In [3]:
ptns = ['ANNA','ALLIE','HANNAH']

result = ptns_closestpair( ptns )
print(result)

{'pair': [1, 3], 'ident': 100}


## Test 2

In [4]:
ptns = ['AHMET','AMY','EMILY'];

result = ptns_closestpair( ptns );
print(result)

{'pair': [2, 3], 'ident': 50}


## Test 3

In [5]:
file = bmes.downloadurl('https://sacan.biomed.drexel.edu/lib/exe/fetch.php?rev=&media=course:binf:data:uteroglobin.blastresults.fasta','uteroglobin.blastresults.fasta')
ptns = [str(fastaptn.seq) for fastaptn in SeqIO.parse(file,'fasta') ]

# Remove any duplicates
ptns = list(set(ptns))

In [6]:
# Select first 10 proteins in list because it would take too long to iterate throug
# every possible pair of proteins
ptns = ptns[:10]

result = ptns_closestpair( ptns );
print(result)

{'pair': [4, 9], 'ident': 80}


## Appendix

In [7]:
# Printing the file here as well for easy reference when grading.
from pathlib import Path
txt = Path('ptns_closestpair.py').read_text()
print(txt)

# Author: Tony Kabilan Okeke <tko35@drexel.edu>
# Date: February 7, 2022

# Imports
from itertools import combinations
from Bio.Align import substitution_matrices
from Bio import pairwise2

def ptns_closestpair(ptns: list):
  """
  This function finds the most similar pair in a list of proteins
  based on their local alignment scores using a BLOSUM62 scoring matirx.
  It returns a dictionary containing the indicies of items in the pair,
  as well as the percent identity of the alignment.

  Parameters
  ----------
  ptns: list
    A list of at least 2 proteins
  """

  # Remove any gap characters from proteins
  for i in range(len(ptns)):
    ptns[i] = ptns[i].replace('-', '')

  # Create list of unique pairs (combinations)
  pairs = [pair for pair in combinations(ptns, 2)]

  # Load and store substituotion matrix
  subs_mat = substitution_matrices.load('BLOSUM62')

  # Loop through pairs
  max_score = 0
  for pair in pairs:
    # Compute alignment score
    score = pairwise2.align.loc