### Description:  
This script performs what is described in supp fig "Schematic example of inference of functional motifs in viral proteins", i.e., detecting PPIs mediated through domain-motif interactions

in this script "dmi" refers to "domain-motif interactions"  

* "non-dmi" is a sequence that matches a regular expression (regex) pattern of a certain motif that was dropped for 1 or 2 reasons:
1 Tthe motif des oo't match a domain of the human PPIs of said viral protein
2T the average plddt of said motif is higher tha50n   * 

"dm"is are motfs thaits residues are e disordered and ALSO thits human interactorihas a f mating a domais othe f said viral prote (e.g. a match between an SH3 domain and an SH3-binding motif)i50

The process consists of the following steps:  
1. Defining human proteins and their domains using the ELM database  
2. Defining domain-motif pairs using the ELM database  
3. Defining human protein-protein interactions (PPIs) per viral protein  
4. Finding motifs within the viral proteins using their regular expression (regex)  
5. Integrating the 4 components in order to "couple" a viral protein with an interacting human protein, in which the viral motif and the human domain are known to interact  

In [275]:
import pandas as pd
import os
import requests
import re
import numpy as np
import ast

## databases

In [280]:
#human proteins and their domain from ELM DB

path_in_human_prot_and_domain_csv = # path for elm_human_protein_domains.csv #

In [281]:
#domain and motifs interations from ELM DB

path_in_elm_interaction = #path for elm_motif_domains_interaction.tsv #

In [282]:
#motifs and their regular expression from ELM DB

path_elms_clean = # path for elm_motifs_data_regex.tsv #

In [283]:
#function to get protein sequences from uniprot.
#input: uniprot ID

def get_uniprot_sequence(uniprot_id):
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url)
    
    if response.status_code == 200:
        fasta = response.text
        # Extract sequence (ignoring the header line)
        sequence = "".join(fasta.splitlines()[1:])
        return sequence
    else:
        raise ValueError(f"Failed to retrieve sequence for {uniprot_id}. HTTP Status: {response.status_code}")

##  motif regex dictionary

In [285]:
elm_regex_df = pd.read_csv(path_elms_clean , delimiter='\t', on_bad_lines='skip')
elm_regex_dict=dict(zip(elm_regex_df.ELMIdentifier, elm_regex_df.Regex))

## human protein - domain dictionary

In [287]:
human_domain_df = pd.read_csv(path_in_human_prot_and_domain_csv, skiprows=12, on_bad_lines='skip')

In [288]:
human_domain_df.rename(columns={'UniProt ID': 'UniProt_ID', 'Pfam ID': 'Pfam_ID',}, inplace=True)
human_domain_df["Pfam_ID"] = human_domain_df["Pfam_ID"].fillna(0)
human_domain_dict = dict(zip(human_domain_df.UniProt_ID, human_domain_df.Pfam_ID))


In [289]:
# Parse domains to list of domains
for prot, domains in human_domain_dict.items():
    if type(domains) == str:
        domains = list(domains.split(";"))
        human_domain_dict[prot] = domains
    else:
        continue


In [290]:
human_domain_dict = {x: y for x, y in human_domain_dict.items() if y != 0} ## human_uniprot: pfams list

## domain motif interaction dictionary

In [292]:
elm_int_df = pd.read_csv(path_in_elm_interaction, delimiter='\t', on_bad_lines='skip')

elm_int_dict = dict(zip(elm_int_df.ELM_identifier, elm_int_df.Interaction_Domain_Id))  ## motif_name : pfam_name

## hsv1

In [294]:
hsv_uniprot_path = # path for hsv1_protein_level_table.csv #

In [295]:
hsv_uniprot_df = pd.read_csv(hsv_uniprot_path)

In [297]:
hsv_prots = hsv_uniprot_df['Uniprot ID'].unique()

### find ppi

In [299]:
# Table that only has protein which have human ppi
hsv_uniprot_df_ppi = hsv_uniprot_df.dropna(subset = ['human PPIs'])

In [300]:
hsv_uniprot_df_ppi['human PPIs'] = hsv_uniprot_df_ppi['human PPIs'].apply(ast.literal_eval) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hsv_uniprot_df_ppi['human PPIs'] = hsv_uniprot_df_ppi['human PPIs'].apply(ast.literal_eval)


In [301]:
hsv_ppi_dict = dict(zip(hsv_uniprot_df_ppi['Uniprot ID'],hsv_uniprot_df_ppi['human PPIs']))

In [303]:
hsv_ppi_df = pd.DataFrame({'Uniprot ID': hsv_ppi_dict.keys(), 'human_ppi': hsv_ppi_dict.values()})

### find regex motifs

In [307]:
# Get sequences from uniprot for proteins with ppi
hsv_ppi_df['v_sequence'] = hsv_ppi_df['Uniprot ID'].apply(lambda x: get_uniprot_sequence(x))    

In [308]:
hsv_seq_dict = dict(zip(hsv_ppi_df['Uniprot ID'], hsv_ppi_df['v_sequence']))

In [309]:
# Look for motifs within the sequences using regular expression (re.finditer)
hsv_regex_df_rows = [] 

for name, seq in hsv_seq_dict.items():   
    found_match = False
    for identifier, pattern in elm_regex_dict.items():
        matches = re.finditer(pattern, str(seq))
        for match in matches:
            found_match = True
            start_Regex = match.start() + 1
            end_Regex = match.end()
            extract= seq[start_Regex - 1:end_Regex]
            row = {'v_prot': name,'motif_name': identifier, 'start': start_Regex ,'end': end_Regex ,'seq':extract}
            hsv_regex_df_rows.append(row)                    
    if not found_match:
        print(f"No matches found for protein: {name}")

hsv_regex_df = pd.DataFrame(hsv_regex_df_rows)

### plddt - create plddt dictionary and define disorder motifs

In [313]:
hsv_plddt_path = # path for hsv1_residue_level_table.csv #

In [314]:
hsv_plddt_df = pd.read_csv(hsv_plddt_path)

In [316]:
# Create plldt dictionary: 
# Protein : plddt list (in order of the residues)
hsv_plddt_dict = {}

for protein, sub_df in hsv_plddt_df.groupby('viral_uniprot'):
    plddt_list = sub_df['pLDDT'].tolist()
    hsv_plddt_dict[protein] = plddt_list

In [318]:
# Define motif's disorder score using average plddt
for index, row in hsv_regex_df.iterrows():
    uniprot = row['v_prot']
    motif_start = row['start']
    motif_end = row['end']

    res_scores = hsv_plddt_dict.get(uniprot)
    if res_scores is not None and not np.isnan(res_scores).any():
        motif_scores = res_scores[motif_start -1: motif_end]

        motif_plddt = np.round(np.mean(motif_scores), 5)
        # print( motif_plddt)

        hsv_regex_df.at[index, 'motif_plddt'] = motif_plddt
          

In [464]:
hsv_regex_df.to_csv(r"hsv1_motifs_plddt.csv", index= False)

In [320]:
# Table with only disorder motifs
hsv_regex_df_disorder = hsv_regex_df[hsv_regex_df['motif_plddt'] <= 50]

In [468]:
hsv_regex_df_disorder.to_csv(r"hsv_motifs_plddt_disorder_only.csv", index= False)

In [322]:
# Dictionary of the proteins as keys, and a list of their disordered motifs in a list as values
hsv_viral_motifs_dict = hsv_regex_df_disorder.groupby('v_prot')['motif_name'].apply(list).to_dict()  #viral uniprot: motifs list

### integrate data

In [324]:
# Store rows for nest step - creating a dataframe
hsv_dmi_rows = []

# Iterate through the proteins within the table
for index, row in hsv_uniprot_df.iterrows():
    v_prot = row['Uniprot ID']
    v_motifs_lst = hsv_viral_motifs_dict.get(v_prot, [])
    
    h_ppi_lst = hsv_ppi_dict.get(v_prot)
    
    if h_ppi_lst:
        
        for h_ppi in h_ppi_lst:
            h_domains_lst = human_domain_dict.get(h_ppi) # take only proteins with human PPIs that also have known domains
    
            if h_domains_lst is None:
                continue
    
            # Find "connected" pairs - human ppi that has a domain that matches the viral protein's motif
            connected_pairs = [(motif, elm_int_dict.get(motif)) for motif in v_motifs_lst if elm_int_dict.get(motif) in h_domains_lst]
            if connected_pairs:
                motifs, domains = zip(*connected_pairs)
                hsv_dmi_rows.append({'v_prot': v_prot, 'h_DMI': h_ppi, 'motifs': list(motifs), 'domains': list(domains)})
    else:
        continue


In [325]:
# Create the final DataFrame
hsv_dmi_df = pd.DataFrame(hsv_dmi_rows)

In [328]:
hsv_dmi_df['motif'] = hsv_dmi_df['motifs'].apply(lambda x: list(set(x)))

In [329]:
hsv_dmi_df['domain'] = hsv_dmi_df['domains'].apply(lambda x: list(set(x)))

In [330]:
hsv_dmi_df.drop(['domains','motifs'], axis=1, inplace= True)

In [332]:
hsv_dmi_df.to_csv(r"hsv_dmi_table.csv" , index= False)