<a href="https://colab.research.google.com/github/NLZT/Deep-Dive-Projects/blob/main/Capstone_NTx_prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip freeze | egrep '^(lxml|pandas|requests|numpy|bs4|urllib|json)'


bs4==0.0.1
jsonschema==4.3.3
lxml==4.9.1
numpy==1.21.6
pandas==1.3.5
pandas-datareader==0.9.0
pandas-gbq==0.17.9
pandas-profiling==1.4.1
requests==2.23.0
requests-oauthlib==1.3.1
urllib3==1.24.3


In [None]:
from lxml import html
import pandas as pd
import requests
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
import json

In [None]:
# Generate mock input in the format the NoRine database expects: each peptide is a
# comma-separated string of monomer names scraped from NoRine's monomer listing.
url = "https://bioinfo.lifl.fr/norine/traitAmino.jsp?code=&keyword=&type=all&molecularFormula=&molecularWeight1=&molecularWeight2=&pdbCode=&pubchem=&boutSubmit=submit"
# Use the response as a context manager so the HTTP connection is closed promptly.
with urlopen(url) as response:
  page = response.read()
soup = BeautifulSoup(page, features="html.parser")
mydivs = soup.find_all("ul", {"class": "arborescence"})
# Only the direct <li> children of the first matching list are used.
ele = mydivs[0].find_all("li", recursive=False)

# Every listed entry except the 'Unclustered' one is a candidate monomer.
monomers = [mono.text for mono in ele if mono.text != 'Unclustered']

# Draw 2000 random peptides of 1-19 monomers each (randint's upper bound is exclusive).
datax = [
    ",".join(np.random.choice(monomers, size=np.random.randint(1, 20)))
    for _ in range(2000)
]
# De-duplicate once, after all peptides exist, instead of on every loop iteration.
data_unique = np.unique(datax)

# Sample 1000 distinct peptides and convert the numpy strings to plain `str`.
data = [str(peptide) for peptide in np.random.choice(data_unique, 1000, replace=False)]

data

['Hpg*,Spd*,Aib*,FA,Dpr*,PKS,Asx,PKS,Bmt*',
 'Carbohydrate,Sta*,Spd*,Dap*,XPro,Aad*,Phe+Trp,Hap*,End*,Spd*,Dap*',
 'Dap*,Orn*,Bz*,Orn*,Lys*,Spd*,Bz*,Hap*,Bmt*,Lac*,XSer,Kyn*,Ala+Gly,Chromophores,Cap*,Ala+Gly,Cys*',
 'Lys*,Carbohydrate,Dpr*,FA,PKS,Cit*,Glx',
 'Dpr*,Hpg*',
 'PKS,Hpg*,Chromophores,Bz*,Glx,Thz*,Kyn*,Hap*,Aib*,Dab*,Kyn*,Thr*,Phe+Trp,XSer,Thz*,Kyn*',
 'Thr*,Dpr*,Hpg*,Kyn*,Bmt*,Hiv*,Aad*,Met*,Dpr*,FA,Aad*,Dil*,His*,FA,Hiv*,Dap*,Aib*,Cap*',
 'Orn*,Asx,Nonpolar,Dpr*,Kyn*,FA,Glx,Chromophores,Ala+Gly,Lys*,Carbohydrate,Lys*,Ala+Gly,Met*,Spd*,XPro',
 'Cap*,Cap*,Vaa*,Cma*,Nonpolar,XArg,XArg',
 'Aad*,XPro,Met*,XTyr,Hiv*,Glx,Aib*,End*,Thr*,Phe+Trp',
 'Cap*,Hpg*,Cma*,XTyr,Lys*,Cma*,XArg,XPro,Cap*',
 'Chromophores,Dab*,Hpg*,Pyr*,Hiv*,Aad*',
 'XSer,Bz*,Aib*,Thz*,Aib*,Nonpolar,Cma*,Hiv*,Orn*,Dil*,Vaa*,Cit*',
 'Cys*,XTyr,His*,Sta*,Thz*,His*,Hiv*,PKS,Pyr*,Lac*,Pyr*,Hap*,Hap*,Kyn*,PKS,XArg,Chromophores,Lys*,Hpg*',
 'Phe+Trp,Hpg*,Aad*,Dpr*,Dap*,Spd*,XPro',
 'Dil*,Dpr*,XPro,Thz*',
 'Cma*,Hiv*,

In [None]:
def divination(peptide, similarity):
  """Search NoRINE for peptides similar to `peptide` and return the matches.

  Sends a GET request to NoRINE's fingerPrintSearch page with the peptide as the
  `nrps1` query parameter, reads the first HTML table of the response, attaches
  the links scraped from that table, and keeps only the rows whose `similarity`
  column (as reported by NoRINE) is at least the requested threshold.

  Args:
    peptide: Peptide string sent to NoRINE, e.g. 'Dpr*,Hpg*'.
    similarity: Minimum value of the `similarity` column for a row to be kept.

  Returns:
    A single-entry dict mapping `peptide` to a list of record dicts, one per
    retained row; the list is empty when no row meets the threshold.

  Raises:
    requests.exceptions.HTTPError: if NoRINE responds with an error status.
    requests.exceptions.Timeout: if NoRINE does not answer within 60 seconds.
  """
  base_url = 'https://bioinfo.lifl.fr'
  url = 'https://bioinfo.lifl.fr/norine/fingerPrintSearch.jsp'
  # A timeout keeps a stalled server from hanging the notebook indefinitely.
  response = requests.get(url, params={'nrps1': peptide}, timeout=60)
  response.raise_for_status()

  # Table of the searched peptide.
  matches = pd.read_html(response.content)[0]

  # Links used in the search table, in document order.
  # NOTE(review): assumes every result row carries exactly two links, the
  # fingerprint link followed by the peptide link — confirm against the page.
  page_tree = html.fromstring(response.content)
  hrefs = [
      href.strip()
      for href in page_tree.xpath(
          '/html/body/div[4]/div/div/form/div/table/tr[*]/td[*]/a//@href')
  ]

  # Even positions become fingerprint links, odd positions peptide links; the
  # 'download' column is dropped from the result.
  matches = matches.assign(
      fingerprint_link=[base_url + href for href in hrefs[0::2]],
      peptide_link=[base_url + href for href in hrefs[1::2]],
  ).drop(columns='download')

  kept = matches[matches['similarity'] >= similarity]
  return {peptide: kept.to_dict(orient="records")}

In [None]:
def foresight(data, similarity=0.4):
  """Run `divination` on every peptide in `data` and collect the results.

  Args:
    data: Iterable of peptide strings to search for.
    similarity: Threshold passed through to `divination`; defaults to 0.4,
      the value this cell previously hard-coded.

  Returns:
    A list with one `divination` result dict per input peptide, in input order.
  """
  return [divination(peptide, similarity) for peptide in data]

# Search only the first 10 generated peptides; each one is a live HTTP request.
# `jlist` is kept at module level because the JSON export cell below reads it.
jlist = foresight(datax[:10])
jlist

[{'Hap*,Chromophores,Thr*,Glx,Asx,Cys*,Orn*,Chromophores,Cap*,FA,Hap*,Bmt*,Cit*,Bz*': []},
 {'XTyr,Hiv*,XSer,Met*,Lac*,Aib*,Bz*,Bmt*,His*,Sta*,Met*,XPro,XSer,Dbu*,Lac*,His*,Kyn*': []},
 {'Pyr*,XPro,Bmt*,Aib*,Dap*,Glx,Spd*,XPro,Carbohydrate,XPro,Glx,Chromophores,Carbohydrate,Sta*,Nonpolar,Bmt*,XTyr,Dap*': []},
 {'Cit*,Ala+Gly,Chromophores,XArg': []},
 {'His*,XSer,Cys*,Orn*,Ala+Gly,Dap*,FA,Lac*,Dab*,Dpr*,Chromophores': [{'similarity': 0.44,
    'peptide': 'pyoverdin BTP16',
    'fingerprint_link': 'https://bioinfo.lifl.fr/norine/transimilarity.jsp;jsessionid=CBF5CA23AED189BDDAB8043FEFDF0A14?similValue=0.4397578142119488&namePep2=pyoverdin+BTP16&idPep2=00206&userGraph=His*%2cXSer%2cCys*%2cOrn*%2cAla%2bGly%2cDap*%2cFA%2cLac*%2cDab*%2cDpr*%2cChromophores&choices=His*%3e0%2cXSer%3eXSer%3eSer%2cCys*%3e0%2cOrn*%3eOrn*%3eD-OH-Orn%2cAla%2bGly%3eAla%2bGly%3eGly%2cDap*%3e0%2cFA%3eFA%3eC4%3a0-OH(3)%2cLac*%3e0%2cDab*%3eDab*%3eD-Dab%2cDpr*%3e0%2cChromophores%3eChromophores%3eChrP&matchGraph=ChrP%2cAs

In [None]:
# Persist the collected search results as indented JSON for inspection.
with open('norine.json', 'w') as json_file:
  json.dump(jlist, json_file, indent=2)

In [None]:
!head -100 norine.json

[
  {
    "Hap*,Chromophores,Thr*,Glx,Asx,Cys*,Orn*,Chromophores,Cap*,FA,Hap*,Bmt*,Cit*,Bz*": []
  },
  {
    "XTyr,Hiv*,XSer,Met*,Lac*,Aib*,Bz*,Bmt*,His*,Sta*,Met*,XPro,XSer,Dbu*,Lac*,His*,Kyn*": []
  },
  {
    "Pyr*,XPro,Bmt*,Aib*,Dap*,Glx,Spd*,XPro,Carbohydrate,XPro,Glx,Chromophores,Carbohydrate,Sta*,Nonpolar,Bmt*,XTyr,Dap*": []
  },
  {
    "Cit*,Ala+Gly,Chromophores,XArg": []
  },
  {
    "His*,XSer,Cys*,Orn*,Ala+Gly,Dap*,FA,Lac*,Dab*,Dpr*,Chromophores": [
      {
        "similarity": 0.44,
        "peptide": "pyoverdin BTP16",
        "fingerprint_link": "https://bioinfo.lifl.fr/norine/transimilarity.jsp;jsessionid=CBF5CA23AED189BDDAB8043FEFDF0A14?similValue=0.4397578142119488&namePep2=pyoverdin+BTP16&idPep2=00206&userGraph=His*%2cXSer%2cCys*%2cOrn*%2cAla%2bGly%2cDap*%2cFA%2cLac*%2cDab*%2cDpr*%2cChromophores&choices=His*%3e0%2cXSer%3eXSer%3eSer%2cCys*%3e0%2cOrn*%3eOrn*%3eD-OH-Orn%2cAla%2bGly%3eAla%2bGly%3eGly%2cDap*%3e0%2cFA%3eFA%3eC4%3a0-OH(3)%2cLac*%3e0%2cDab*%3eDab*%3eD-Dab%

In [None]:
# Pretty-print the first entry only, putting each JSON record on its own line.
# NOTE(review): `dict_o_peps` is not defined in the visible cells — presumably a
# dict of peptide string -> DataFrame from an earlier version; confirm or remove.
for peptide_key, matches in dict_o_peps.items():
  records_json = matches.to_json(orient="records")
  pretty_json = (
      records_json
      .replace("[{", "[\n  {")
      .replace("},{", "},\n  {")
      .replace("]", "\n]")
  )
  print(peptide_key, pretty_json)
  break