### Description:  

This is the third step in the motif conservation analysis (results shown in fig7).  

In this step, for better readability and ease of use, the occurrence tables are converted into ones where each column is a variant of HSV1 and each row is a protein with one of its motifs  
'1' in a certain variant's column means that this variant's ortholog has a match for the HSV1 protein's motif in question  
'0' means that there was no match found  
'NA' means that there was no protein ortholog within this variant  

### We run a similar process for both dmi motifs and non-dmi motifs  

**reminder:**  
* "non-dmi" is a sequence that matches a regular expression (regex) pattern of a certain motif that was dropped for one or both of the following reasons:
1. The motif does not match a domain of the human PPIs of said viral protein
2. The average plddt of said motif is higher than 50

In [219]:
import pandas as pd
import os

## HSV

In [222]:
# Long-format occurrence tables (dmi and non-dmi) from script "02 - check motifs in evolution"
hsv_dmi_motifs_occurence = "hsv_dmi_motifs_OCCURENCES.csv"
hsv_non_dmi_motifs_occurence = "hsv_non_dmi_motifs_OCCURENCES.csv"

In [221]:
# Destination CSVs for the per-variant tables (dmi and non-dmi)
hsv_dmi_motifs_output = "dmi_motifs_OCCURENCES_by_variant.csv"
hsv_non_dmi_motifs_output = "non_dmi_motifs_OCCURENCES_by_variant.csv"

### dmi

In [224]:
# Load the long-format dmi occurrence table (one row per protein / motif / variant).
hsv_occurences_dmi_df = pd.read_csv(filepath_or_buffer=hsv_dmi_motifs_occurence)

In [225]:
# Preview the first five rows of the dmi input table.
hsv_occurences_dmi_df.head(n=5)

Unnamed: 0,viral_uniprot,viral_prot_name,motif_name,variant,ocurrence
0,P10238,UL54,MOD_CDK_SPK_2,UL54_NC_001798,1
1,P10238,UL54,MOD_CDK_SPK_2,UL54_NC_001806,1
2,P10238,UL54,MOD_CDK_SPK_2,UL54_NC_023677,1
3,P10238,UL54,MOD_CDK_SPK_2,UL54_NC_004812,1
4,P10238,UL54,MOD_CDK_SPK_2,UL54_NC_007653,0


In [226]:
# Map each UniProt accession to its protein name, pairing the two columns row by row.
# Zipping the columns' independent .unique() results only lines up when accessions and
# names are strictly one-to-one; if two accessions share a name (or one accession has
# two names) the pairs shift and accessions get the wrong name or are dropped.
# Here the first name seen for each accession is kept.
hsv_dmi_uniprot_dict = (
    hsv_occurences_dmi_df
    .drop_duplicates(subset="viral_uniprot")
    .set_index("viral_uniprot")["viral_prot_name"]
    .to_dict()
)

In [227]:
# dmi occurrences
#
# Pivot the long dmi table into one record per (protein, motif): each variant column
# holds that variant's 'ocurrence' value, or "NA" when no row exists for the variant.

hsv_dmi_occurs_rows = []

# Genome accession -> output column label; dict order is the output column order.
# Single source of truth for both the variant matching and the output columns.
## specific for HSV ##
hsv_variant_columns = {
    "NC_001798": "Human HSV2",
    "NC_001806": "Human HSV1",
    "NC_023677": "Chimp HSV1",
    "NC_004812": "Macacque HSV1",
    "NC_007653": "Baboon HSV2",
    "NC_006560": "Grivet HSV2",
    "NC_014567": "squirrel monkey HSV1",
}
# Same accessions, in the order this list has always had.
our_variants = list(hsv_variant_columns)[::-1]

for uniprot, sub_df in hsv_occurences_dmi_df.groupby('viral_uniprot'):
    prot_name = hsv_dmi_uniprot_dict.get(uniprot)
    for motif in sub_df["motif_name"].unique():
        uniprot_motif_table = sub_df[sub_df["motif_name"] == motif]

        # Keep only the first row recorded for each variant id, then walk those rows once
        # instead of re-filtering the table for every matched variant.
        first_rows = uniprot_motif_table.drop_duplicates(subset="variant")

        variant_occur_dict = {}
        for variant_id, occurrence in zip(first_rows["variant"].array, first_rows["ocurrence"].array):
            # Variant ids look like "<protein>_<accession>", so match by substring.
            for accession in our_variants:
                if accession in variant_id:
                    variant_occur_dict[accession] = occurrence

        hsv_dmi_row = {'viral_uniprot': uniprot, 'prot_name': prot_name, 'motif_name': motif}
        # "NA" marks a variant with no row for this protein/motif.
        # NOTE(review): pandas.read_csv parses the literal "NA" as missing by default,
        # so it comes back as NaN if the output CSV is re-read with pandas.
        for accession, column_label in hsv_variant_columns.items():
            hsv_dmi_row[column_label] = variant_occur_dict.get(accession, "NA")
        hsv_dmi_occurs_rows.append(hsv_dmi_row)
    
            
            
 

In [228]:
# Assemble the per-variant dmi table from the collected row records.
hsv_dmi_occurs_df = pd.DataFrame(data=hsv_dmi_occurs_rows)

In [229]:
# Preview the first five rows of the per-variant dmi table.
hsv_dmi_occurs_df.head(n=5)

Unnamed: 0,viral_uniprot,prot_name,motif_name,Human HSV2,Human HSV1,Chimp HSV1,Macacque HSV1,Baboon HSV2,Grivet HSV2,squirrel monkey HSV1
0,P04288,UL10,LIG_SH2_CRK,1,1,1,1,1,1,1
1,P04288,UL10,LIG_SH2_NCK_1,1,1,1,1,1,1,1
2,P04288,UL10,LIG_SH2_SRC,1,1,1,1,1,1,1
3,P04288,UL10,LIG_SH2_STAP1,1,1,1,1,1,1,1
4,P04288,UL10,LIG_SH2_STAT3,1,1,1,1,1,1,1


In [230]:
# Write the per-variant dmi table to disk without the row index.
hsv_dmi_occurs_df.to_csv(path_or_buf=hsv_dmi_motifs_output, index=False)

### non dmi

In [232]:
# Load the long-format non-dmi occurrence table (one row per protein / motif / variant).
hsv_occurences_non_dmi_df = pd.read_csv(filepath_or_buffer=hsv_non_dmi_motifs_occurence)

In [233]:
# Preview the first five rows of the non-dmi input table.
hsv_occurences_non_dmi_df.head(n=5)

Unnamed: 0,viral_uniprot,viral_prot_name,motif_name,variant,ocurrence
0,P10238,UL54,CLV_C14_Caspase3-7,UL54_NC_001798,1
1,P10238,UL54,CLV_C14_Caspase3-7,UL54_NC_001806,1
2,P10238,UL54,CLV_C14_Caspase3-7,UL54_NC_023677,1
3,P10238,UL54,CLV_C14_Caspase3-7,UL54_NC_004812,1
4,P10238,UL54,CLV_C14_Caspase3-7,UL54_NC_007653,1


In [234]:
# Map each UniProt accession to its protein name, pairing the two columns row by row.
# Zipping the columns' independent .unique() results only lines up when accessions and
# names are strictly one-to-one; if two accessions share a name (or one accession has
# two names) the pairs shift and accessions get the wrong name or are dropped.
# Here the first name seen for each accession is kept.
hsv_non_dmi_uniprot_dict = (
    hsv_occurences_non_dmi_df
    .drop_duplicates(subset="viral_uniprot")
    .set_index("viral_uniprot")["viral_prot_name"]
    .to_dict()
)

In [235]:
# non-dmi occurrences
#
# Pivot the long non-dmi table into one record per (protein, motif): each variant column
# holds that variant's 'ocurrence' value, or "NA" when no row exists for the variant.

hsv_non_dmi_occurs_rows = []

# Genome accession -> output column label; dict order is the output column order.
# Single source of truth for both the variant matching and the output columns.
## specific for HSV ##
hsv_variant_columns = {
    "NC_001798": "Human HSV2",
    "NC_001806": "Human HSV1",
    "NC_023677": "Chimp HSV1",
    "NC_004812": "Macacque HSV1",
    "NC_007653": "Baboon HSV2",
    "NC_006560": "Grivet HSV2",
    "NC_014567": "squirrel monkey HSV1",
}
# Same accessions, in the order this list has always had.
our_variants = list(hsv_variant_columns)[::-1]

for uniprot, sub_df in hsv_occurences_non_dmi_df.groupby('viral_uniprot'):
    prot_name = hsv_non_dmi_uniprot_dict.get(uniprot)
    for motif in sub_df["motif_name"].unique():
        uniprot_motif_table = sub_df[sub_df["motif_name"] == motif]

        # Keep only the first row recorded for each variant id, then walk those rows once
        # instead of re-filtering the table for every matched variant.
        first_rows = uniprot_motif_table.drop_duplicates(subset="variant")

        variant_occur_dict = {}
        for variant_id, occurrence in zip(first_rows["variant"].array, first_rows["ocurrence"].array):
            # Variant ids look like "<protein>_<accession>", so match by substring.
            for accession in our_variants:
                if accession in variant_id:
                    variant_occur_dict[accession] = occurrence

        hsv_non_dmi_row = {'viral_uniprot': uniprot, 'prot_name': prot_name, 'motif_name': motif}
        # "NA" marks a variant with no row for this protein/motif.
        # NOTE(review): pandas.read_csv parses the literal "NA" as missing by default,
        # so it comes back as NaN if the output CSV is re-read with pandas.
        for accession, column_label in hsv_variant_columns.items():
            hsv_non_dmi_row[column_label] = variant_occur_dict.get(accession, "NA")
        hsv_non_dmi_occurs_rows.append(hsv_non_dmi_row)
    
            
            
 

In [236]:
# Assemble the per-variant non-dmi table from the collected row records.
hsv_non_dmi_occurs_df = pd.DataFrame(data=hsv_non_dmi_occurs_rows)

In [237]:
# Preview the first five rows of the per-variant non-dmi table.
hsv_non_dmi_occurs_df.head(n=5)

Unnamed: 0,viral_uniprot,prot_name,motif_name,Human HSV2,Human HSV1,Chimp HSV1,Macacque HSV1,Baboon HSV2,Grivet HSV2,squirrel monkey HSV1
0,P04288,UL10,CLV_NRD_NRD_1,1,1,1,1,1,1,1
1,P04288,UL10,CLV_PCSK_KEX2_1,1,1,1,1,1,1,1
2,P04288,UL10,DOC_USP7_MATH_1,1,1,1,1,1,1,1
3,P04288,UL10,DOC_WW_Pin1_4,1,1,1,0,0,1,1
4,P04288,UL10,LIG_FHA_1,1,1,1,1,1,1,1


In [238]:
# Write the per-variant non-dmi table to disk without the row index.
hsv_non_dmi_occurs_df.to_csv(path_or_buf=hsv_non_dmi_motifs_output, index=False)