### Description:  
This is the first step in the motif conservation analysis (results shown in fig7).

In this step we create 2 tables -> one with dmi motifs and one with non-dmi motifs  
  
**reminder:**  
* "non-dmi" is a sequence that matches the regular expression (regex) pattern of a certain motif but was dropped for one or both of the following reasons:
1. The motif does not match a domain of the human PPIs of said viral protein
2. The average plddt of said motif is higher than 50 

In [426]:
import pandas as pd
import os
import ast

## motif regex pattern dictionary

In [428]:
# Path to the ELM motif table (elm_motifs_data_regex.tsv).
# TODO: point this at the local copy of the file before running the notebook.
elm_motifs_and_regex_clean = r"elm_motifs_data_regex.tsv"

In [429]:
# Load the tab-separated ELM motif table.
# Malformed rows are still skipped, but 'warn' reports each one instead of
# dropping it silently, so missing motifs do not go unnoticed.
elm_df = pd.read_csv(elm_motifs_and_regex_clean, sep='\t', on_bad_lines='warn')

In [430]:
# Lookup table: ELM identifier -> regex pattern
elm_dict = {
    identifier: regex
    for identifier, regex in zip(elm_df['ELMIdentifier'], elm_df['Regex'])
}

## HSV1

In [624]:
# Input table: all of the virus's disordered motifs, produced by the "identify DMIs" script
hsv_disordered_motifs_path = "hsv_motifs_plddt_disorder_only.csv"

In [626]:
# Read the disordered-motifs table into a DataFrame
hsv_disorder_motifs_df = pd.read_csv(filepath_or_buffer=hsv_disordered_motifs_path)

In [434]:
# Preview the first rows of the loaded table
hsv_disorder_motifs_df.head(n=5)

Unnamed: 0,v_prot,motif_name,start,end,seq,motif_plddt
0,P10238,CLV_C14_Caspase3-7,14,18,DLSDS,44.622
1,P10238,CLV_NRD_NRD_1,31,33,RRD,43.14
2,P10238,CLV_NRD_NRD_1,111,113,RRP,41.99333
3,P10238,CLV_NRD_NRD_1,141,143,RRG,45.12
4,P10238,CLV_NRD_NRD_1,145,147,RRG,46.68333


In [435]:
# Keep a single row per (viral protein, motif) pair; all other columns are dropped
hsv_disorder_motifs_df = (
    hsv_disorder_motifs_df
    .loc[:, ['v_prot', 'motif_name']]
    .drop_duplicates()
)

### add regex

In [437]:
# Attach each motif's regular expression (looked up in the ELM dictionary)
# as a new 'motif_regex' column
hsv_disorder_motifs_df = hsv_disorder_motifs_df.assign(
    motif_regex=hsv_disorder_motifs_df['motif_name'].map(elm_dict)
)

In [438]:
# Preview the table after adding the regex column
hsv_disorder_motifs_df.head(n=5)

Unnamed: 0,v_prot,motif_name,motif_regex
0,P10238,CLV_C14_Caspase3-7,[DSTE][^P][^DEWHFYC]D[GSAN]
1,P10238,CLV_NRD_NRD_1,(.RK)|(RR[^KR])
6,P10238,CLV_PCSK_FUR_1,R.[RK]R.
7,P10238,CLV_PCSK_KEX2_1,[KR]R.
12,P10238,CLV_PCSK_PC7_1,R...[KR]R.


### is said motif in dmi data or not?

Motifs in the DMI table are already restricted to those that appear in disordered regions.

In [441]:
# Location of the HSV1 DMI table (CSV)
hsv_dmi_path = "hsv_dmi_table.csv"

In [442]:
# Read the DMI table into a DataFrame
hsv_dmis_df = pd.read_csv(filepath_or_buffer=hsv_dmi_path)

In [443]:
# Preview the first rows of the DMI table
hsv_dmis_df.head(n=5)

Unnamed: 0,v_prot,h_DMI,motif,domain
0,P10238,H3BLV9,"['MOD_CDK_SPK_2', 'MOD_CK1_1', 'MOD_CK2_1', 'M...",['PF00069']
1,P06477,Q15772,"['MOD_GSK3_1', 'MOD_CK2_1', 'MOD_PKA_2']",['PF00069']
2,P04296,Q93009,['DOC_USP7_MATH_1'],['PF00917']
3,P10240,Q9P1W9,"['MOD_CK2_1', 'MOD_ProDKin_1', 'MOD_Plk_4', 'M...",['PF00069']
4,P10240,O00308,['DOC_WW_Pin1_4'],['PF00397']


In [444]:
# Build a lookup from the "identify DMIs" table:
#   viral protein -> list of the unique motif names found in its DMI rows

hsv_dmis_dict = {}

for prot, sub_df in hsv_dmis_df.groupby('v_prot'):
    unique_motifs = set()

    # Each 'motif' cell holds a stringified Python list: parse it and merge
    # its entries into the per-protein set (duplicates collapse automatically)
    for raw_motif_list in sub_df['motif']:
        unique_motifs.update(ast.literal_eval(raw_motif_list))

    hsv_dmis_dict[prot] = list(unique_motifs)

In [None]:
# Receives one row of the motifs table, looks its (protein, motif) pair up in
# a DMI dictionary, and labels the motif as DMI or non-DMI.

def is_dmi(row, dmis_dict=None):
    """Label a motif row as "dmi" or "non dmi".

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``apply(axis=1)``)
        Must provide the keys 'v_prot' and 'motif_name'.
    dmis_dict : dict, optional
        Maps a viral protein to a collection of its DMI motif names.
        When omitted, the notebook-level ``hsv_dmis_dict`` is used, so
        existing one-argument calls behave exactly as before.

    Returns
    -------
    str
        "dmi" if the row's motif is listed for the row's protein in the
        dictionary; "non dmi" otherwise (including when the protein is
        absent from the dictionary).
    """
    if dmis_dict is None:
        # Backward-compatible default: fall back to the global lookup
        dmis_dict = hsv_dmis_dict

    prot = row['v_prot']
    motif = row['motif_name']

    # A protein missing from the dictionary has no DMI motifs -> empty default
    return "dmi" if motif in dmis_dict.get(prot, ()) else "non dmi"

In [446]:
# Label every (protein, motif) row as "dmi" / "non dmi" in a new 'is_dmi' column
hsv_disorder_motifs_df = hsv_disorder_motifs_df.assign(
    is_dmi=hsv_disorder_motifs_df.apply(is_dmi, axis=1)
)

In [447]:
# Preview the table after adding the dmi / non dmi label
hsv_disorder_motifs_df.head(n=5)

Unnamed: 0,v_prot,motif_name,motif_regex,is_dmi
0,P10238,CLV_C14_Caspase3-7,[DSTE][^P][^DEWHFYC]D[GSAN],non dmi
1,P10238,CLV_NRD_NRD_1,(.RK)|(RR[^KR]),non dmi
6,P10238,CLV_PCSK_FUR_1,R.[RK]R.,non dmi
7,P10238,CLV_PCSK_KEX2_1,[KR]R.,non dmi
12,P10238,CLV_PCSK_PC7_1,R...[KR]R.,non dmi


### split to 2 tables - dmi motifs and non-dmi motifs

In [449]:
# Partition the labelled motifs into two frames by their 'is_dmi' value
hsv_motifs_df_dmi, hsv_motifs_df_non_dmi = (
    hsv_disorder_motifs_df.loc[hsv_disorder_motifs_df['is_dmi'] == label]
    for label in ('dmi', 'non dmi')
)

In [452]:
# Write the DMI motifs table to disk (row index not written)
hsv_motifs_df_dmi.to_csv(path_or_buf='hsv_dmi_motifs.csv', index=False)

In [453]:
# Write the non-DMI motifs table to disk (row index not written)
hsv_motifs_df_non_dmi.to_csv(path_or_buf='hsv_non_dmi_motifs.csv', index=False)