In [18]:
from CoDIAC import featureTools, InterPro, UniProt, PDB, IntegrateStructure_Reference, PTM, translateFeatures, jalviewFunctions
import CoDIAC
import pandas as pd
# InterPro accession of the family of interest.
Interpro_ID = 'IPR000980'  # IPR000980 is the SH2 domain superfamily

# We will be creating a lot of files; this is how we would like them to be named.
data_root = 'Data/'
name_root = 'SH2_' + Interpro_ID

# Sub-directories of data_root. Every path below is derived from these so that
# changing data_root relocates all outputs consistently.
uniprot_dir = data_root + 'Uniprot_Reference/'
pdb_dir = data_root + 'PDB_Reference/'

# The files we will make in this process (so that different pieces of code can be run below as needed)
uniprot_reference_file = uniprot_dir + name_root + '_uniprot_reference.csv'  # The uniprot reference
fasta_long_header_file = uniprot_dir + name_root + '_long_header.fasta'
fasta_file = uniprot_dir + name_root + '.fasta'
# Note: in addition to these 3 files, this also makes a mapping file for moving
# between fasta_long_header_file and fasta_file.

# PDB files we'll make in this process
PDB_file = pdb_dir + name_root + '_PDB.csv'
PDB_file_annotated = pdb_dir + name_root + '_PDB_annotated.csv'
PDB_file_filtered = pdb_dir + name_root + '_PDB_reference.csv'  # The final PDB structure file, containing only filtered proteins

# PTM feature directory location (ends with '/', so append sub-directories without a leading slash)
feature_dir = uniprot_dir + 'Features_relative_to_reference/PTM_features/'

# Offsets passed to the FASTA and PTM feature writers below.
# NOTE(review): presumably these shift the N- and C-terminal domain boundaries - confirm against the CoDIAC docs.
N_OFFSET = 0
C_OFFSET = -1


### STEP 1: Get all Uniprot IDs that match a family of interest


In [2]:
# Fetch the UniProt IDs linked to the InterPro family, restricted by the
# REVIEWED flag and to the species 'Homo sapiens'.
uniprot_IDs, species_dict = CoDIAC.InterPro.fetch_uniprotids(
    Interpro_ID,
    REVIEWED=True,
    species='Homo sapiens',
)

Fetched 109 Uniprot IDs linked to IPR000980, where count expected to be 109


### STEP 2: Make a human reference file of the family of interest

In [3]:

# Build the UniProt reference file for the fetched IDs; it is written to
# uniprot_reference_file and the result is also kept in uniprot_df.
uniprot_df = CoDIAC.UniProt.makeRefFile(
    uniprot_IDs,
    uniprot_reference_file,
)

Domain Reference File successfully created!
Adding Interpro Domains
Fetching domains..
Appending domains to file..
Interpro metadata succesfully incorporated


Manually checked the UniProt reference file for errors/issues. UniProt and InterPro generally agree, with InterPro adding additional domains of interest. However, the boundaries of the atypical SH2 domains (JAK family) differ quite a bit between the two. After testing the effects on the alignment, we selected the SMART domain boundaries and changed the InterPro boundaries by hand (since the InterPro column is the one used). In length, the SMART boundaries landed somewhere between the UniProt-defined boundaries and the longer boundaries of the InterPro database.

Changes:
1. JAK1 SH2:IPR000980:437:546 -> SH2:IPR000980:437:544; JAK2 SH2:IPR000980:397:501 -> 397:487; JAK3 SH2:IPR000980:373:477 -> 373:463; TYK2 SH2:IPR000980:450:553 -> 452:539 
2. Manually removed the alpha-helix region termed "PI3K_P85_iSH2:IPR032498" in the Interpro file for P27986,PIK3R1; Q92569,PIK3R3; O00459,PIK3R2
3. Manually removed C lobe SH2 domain in Supt6h, which overlaps with the parent SH2 domain:
Spt6_SH2_C:IPR035018:1424:1515 (removed)


### STEP 3: Get information about all PDB IDs that exist for the reference proteins of interest

In [2]:

# Generate the PDB structure reference file (PDB_file) from the proteins listed
# in the UniProt reference file.
CoDIAC.PDB.generateStructureRefFile_fromUniprotFile(
    uniprot_reference_file,
    PDB_file,
)

Structure Reference File successfully created!
All PDBs successfully fetched


### STEP 4: Annotate the structure file with reference, for domain annotation

In [2]:
# Annotate the structure file with information from the UniProt reference
# (INTERPRO=True) and write the result to PDB_file_annotated.
struct_df_out = CoDIAC.IntegrateStructure_Reference.add_reference_info_to_struct_file(
    PDB_file,
    uniprot_reference_file,
    PDB_file_annotated,
    INTERPRO=True,
    verbose=False,
)

### STEP 5: Reduce the structure file to just those that contain the domain of interest

In [3]:
# Starting from the annotated PDB file, write an output (PDB_file_filtered)
# that contains only the lines that have SH2 domains (matched by Interpro_ID).
CoDIAC.IntegrateStructure_Reference.filter_structure_file(
    PDB_file_annotated,
    Interpro_ID,
    PDB_file_filtered,
)

Made Data/PDB_Reference/SH2_IPR000980_PDB_reference.csv file: 456 structures retained of 1409 total


### STEP 6: Create the FASTA Reference file for SH2 domains

In [19]:
# Given the SH2 domain file, create the FASTA reference file with long headers
# (using INTERPRO as default). APPEND=False is passed explicitly.
CoDIAC.UniProt.print_domain_fasta_file(
    uniprot_reference_file,
    Interpro_ID,
    fasta_long_header_file,
    N_OFFSET,
    C_OFFSET,
    APPEND=False,
)

# Shorten the FASTA headers: they stay unique for each domain/protein pair
# while dropping the redundant information about the domains printed.
key_array_order = ['uniprot', 'gene', 'domain_num', 'start', 'end']

# The translation also creates a mapping file between long and short headers;
# output_fasta and mapping_file are reused by the PTM feature cells below.
output_fasta, mapping_file = CoDIAC.UniProt.translate_fasta_to_new_headers(
    fasta_long_header_file,
    fasta_file,
    key_array_order,
)


n offset is 0 and c offset is -1
Created files: Data/Uniprot_Reference/SH2_IPR000980.fasta and Data/Uniprot_Reference/SH2_IPR000980_mapping.csv


Perform the Promals3D alignment at http://prodata.swmed.edu/promals3d/promals3d.php
using the fasta_file with the shorter headers.
Once complete, replace the _ with | in the Promals3D results.
The result was committed to the alignment file location (see the next cell, where the alignment file can be instantiated for later steps).


In [6]:
# Alignment file location, once created on Promals3D.
# Built from data_root and name_root like the other reference paths so that it
# follows any change to those settings; the resulting value is
# 'Data/Uniprot_Reference/alignment/SH2_IPR000980_promals3d.fasta'.
alignment_file = data_root + 'Uniprot_Reference/alignment/' + name_root + '_promals3d.fasta'


## STEP 7 Create the PTM feature files

#### STEP 7a create the ProteomeScout based Features files

In [20]:
# Output directory for the ProteomeScout-based feature files.
# feature_dir already ends with '/', so no leading slash is added here; the
# previous "feature_dir+'/ProteomeScout/'" produced '...PTM_features//ProteomeScout/'.
feature_dir_prot = feature_dir + 'ProteomeScout/'

# Write one feature file per PTM type; mapping_file comes from the FASTA header
# translation cell above.
ptm_feature_file_list, ptm_count_dict, ptm_feature_dict, mapping_dict = CoDIAC.PTM.write_PTM_features(
    Interpro_ID,
    uniprot_reference_file,
    feature_dir_prot,
    mapping_file,
    N_OFFSET,
    C_OFFSET,
    gap_threshold=0.7,
    num_PTM_threshold=5,
)
print("Wrote these feature files:")
print(ptm_feature_file_list)
print("These belong to the following fasta file:")
print(output_fasta)  # comes from block above - the short header format of the fasta header
print(ptm_count_dict)

n offset is 0 and c offset is -1
n offset is 0 and c offset is -1
Error: PTM record not found by Q7Z4S9
Error: PTM record not found by Q7Z4S9
Error: PTM record not found by Q8TC17
Error: PTM record not found by Q8TC17
Wrote these feature files:
['Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_Phosphotyrosine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_N6-acetyllysine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_Phosphothreonine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_Phosphoserine.feature', 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_Ubiquitination.feature']
These belong to the following fasta file:
Data/Uniprot_Reference/SH2_IPR000980.fasta
{'Phosphotyrosine': 182, 'N6-acetyllysine': 32, 'Phosphothreonine': 61, 'Pho

#### STEP 7b create the PhosphoSite based Features files

In [21]:
# Output directory for the PhosphoSitePlus-based feature files.
feature_dir_psite = feature_dir + 'PHOSPHOSITE_PLUS/'

# Same call as for ProteomeScout, but with PHOSPHOSITE_PLUS=True.
# Note that this rebinds ptm_feature_file_list and the other result names.
ptm_feature_file_list, ptm_count_dict, ptm_feature_dict, mapping_dict = CoDIAC.PTM.write_PTM_features(
    Interpro_ID,
    uniprot_reference_file,
    feature_dir_psite,
    mapping_file,
    N_OFFSET,
    C_OFFSET,
    gap_threshold=0.7,
    num_PTM_threshold=5,
    PHOSPHOSITE_PLUS=True,
)

print("Wrote these feature files:")
print(ptm_feature_file_list)
print("These belong to the following fasta file:")
print(output_fasta)  # comes from block above - the short header format of the fasta header
print(ptm_count_dict)

n offset is 0 and c offset is -1
n offset is 0 and c offset is -1
Found no mods for Q7Z4S9
Found no mods for Q9HBL0
Using an isoform for PTMs Q9HBL0-1, found mods
S2-Phosphoserine;S4-Phosphoserine;T68-Phosphothreonine;T84-Phosphothreonine;S92-Phosphoserine;S170-Phosphoserine;Y210-Phosphotyrosine;Y217-Phosphotyrosine;S257-Phosphoserine;Y327-Phosphotyrosine;T329-Phosphothreonine;S330-Phosphoserine;S338-Phosphoserine;Y339-Phosphotyrosine;S343-Phosphoserine;T357-Phosphothreonine;S364-Phosphoserine;Y366-Phosphotyrosine;S374-Phosphoserine;S378-Phosphoserine;T379-Phosphothreonine;S390-Phosphoserine;T399-Phosphothreonine;S401-Phosphoserine;S403-Phosphoserine;S404-Phosphoserine;S406-Phosphoserine;S433-Phosphoserine;S445-Phosphoserine;Y458-Phosphotyrosine;T460-Phosphothreonine;S465-Phosphoserine;S471-Phosphoserine;S475-Phosphoserine;T498-Phosphothreonine;S512-Phosphoserine;T518-Phosphothreonine;S538-Phosphoserine;S547-Phosphoserine;Y561-Phosphotyrosine;T575-Phosphothreonine;Y582-Phosphotyrosine;

#### Modified PTM feature files to remove modifications for TNS1, which has issues that have yet to be resolved.
This involved Phosphoserine, Phosphothreonine, and Phosphotyrosine for ProteomeScout.
This involved Phosphoserine, Phosphothreonine, Phosphotyrosine, and Ubiquitination for PhosphoSitePlus.


In [32]:
# Explicit lists of the PTM feature files written for each source, so the cells
# below do not depend on which write_PTM_features call ran last.
# The ProteomeScout prefix keeps the '//' exactly as the files were reported.
_psite_prefix = 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_'
_pscout_prefix = 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_'

# Modification names per source, in the order the files were reported.
_psite_mods = ['Phosphotyrosine', 'Ubiquitination', 'Acetylation', 'Phosphothreonine', 'Phosphoserine']
_pscout_mods = ['Phosphotyrosine', 'N6-acetyllysine', 'Phosphothreonine', 'Phosphoserine', 'Ubiquitination']

ptm_feature_file_list_psite = [_psite_prefix + name + '.feature' for name in _psite_mods]
ptm_feature_file_list_pscout = [_pscout_prefix + name + '.feature' for name in _pscout_mods]

# PhosphoSitePlus files first, then ProteomeScout files.
ptm_feature_file_list_all = [*ptm_feature_file_list_psite, *ptm_feature_file_list_pscout]

In [22]:
# Create the annotation tracks for the PTM feature files from both sources.
for feature_file in ptm_feature_file_list_all:
    # Swap only the '.feature' extension for '.ann'. Splitting on the first '.'
    # would truncate the path if any directory or file name contained a dot;
    # this matches how the later cells derive their annotation file names.
    annotation_file = feature_file.replace('.feature', '.ann')
    CoDIAC.jalviewFunctions.print_ann_file(feature_file, alignment_file, annotation_file)




Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphotyrosine.ann
Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Ubiquitination.ann
Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Acetylation.ann
Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphothreonine.ann
Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_Phosphoserine.ann
Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_Phosphotyrosine.ann
Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features//ProteomeScout/IPR000980_N6-acetyllysine.ann
Wrote annotation track at Data

In [34]:
# Create an integration of all PTMs into one file.
# Uses the explicit ptm_feature_file_list_all (PhosphoSitePlus + ProteomeScout).
# ptm_feature_file_list is rebound by every write_PTM_features call, so it only
# holds the files of whichever source was run last and would make this cell
# depend on execution order.
feature_file = feature_dir + 'PTM_all_features.feature'
feature_combined, feature_color_dict = CoDIAC.jalviewFunctions.combine_feature_files(
    feature_file,
    ptm_feature_file_list_all,
)

# Write the matching annotation track for the combined feature file.
annotation_file = feature_dir + 'PTM_all_features_promals3d.ann'
CoDIAC.jalviewFunctions.print_ann_file(feature_file, alignment_file, annotation_file)

Created Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PTM_all_features.feature
Wrote annotation track at Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PTM_all_features_promals3d.ann


### Step 8 combine feature files from ProteomeScout and PhosphoSitePlus and generate annotation tracks.

#paired list



In [24]:
# Base paths of the per-modification feature files from each resource.
proteomescout_base = 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/ProteomeScout/IPR000980_'
PSP_base = 'Data/Uniprot_Reference/Features_relative_to_reference/PTM_features/PHOSPHOSITE_PLUS/IPR000980_'

# ProteomeScout modification name -> name of the matching PhosphoSitePlus file.
# Only acetylation is named differently between the two resources.
psp_name_by_mod = {
    'N6-acetyllysine': 'Acetylation',
    'Phosphoserine': 'Phosphoserine',
    'Phosphothreonine': 'Phosphothreonine',
    'Phosphotyrosine': 'Phosphotyrosine',
    'Ubiquitination': 'Ubiquitination',
}

# For each modification: [ProteomeScout feature file, PhosphoSitePlus feature file].
pairs = {
    mod_name: [proteomescout_base + mod_name + '.feature', PSP_base + psp_name + '.feature']
    for mod_name, psp_name in psp_name_by_mod.items()
}

# NOTE(review): the final cell of this notebook reads from 'Data/Features/PTMs_all/...'
# (lowercase 's'); confirm the intended directory casing on case-sensitive filesystems.
output_dir = 'Data/Features/PTMS_all/'
new_feature_files = {}

# Combine each pair into one feature file per modification and remember where it went.
for mod, source_files in pairs.items():
    feature_file = output_dir + 'SH2_IPR000980_' + mod + '.feature'
    feature_combined, feature_color_dict = CoDIAC.jalviewFunctions.combine_feature_files(feature_file, source_files)
    new_feature_files[mod] = feature_file


Created Data/Features/PTMS_all/SH2_IPR000980_N6-acetyllysine.feature
Created Data/Features/PTMS_all/SH2_IPR000980_Phosphoserine.feature
Created Data/Features/PTMS_all/SH2_IPR000980_Phosphothreonine.feature
Created Data/Features/PTMS_all/SH2_IPR000980_Phosphotyrosine.feature
Created Data/Features/PTMS_all/SH2_IPR000980_Ubiquitination.feature


In [25]:
# Also combine the phosphoserine and phosphothreonine sets into one feature file.
feature_file = output_dir + 'SH2_IPR000980_Phosphoserine_Phosphothreonine.feature'
st_source_files = [
    'Data/Features/PTMS_all/SH2_IPR000980_Phosphothreonine.feature',
    'Data/Features/PTMS_all/SH2_IPR000980_Phosphoserine.feature',
]
feature_combined, feature_color_dict = CoDIAC.jalviewFunctions.combine_feature_files(
    feature_file,
    st_source_files,
)

Created Data/Features/PTMS_all/SH2_IPR000980_Phosphoserine_Phosphothreonine.feature


In [29]:
# Manual step before running this cell: the Phosphoserine and Phosphothreonine
# feature names were changed to Phosphorylation(ST) in the combined file.
feature_file = output_dir + 'SH2_IPR000980_Phosphoserine_Phosphothreonine.feature'
st_annotation_path = output_dir + 'SH2_IPR000980_Phosphoserine_Phosphothreonine.ann'
jalviewFunctions.print_ann_file(
    feature_file,
    alignment_file,
    st_annotation_path,
)

Wrote annotation track at Data/Features/PTMS_all/SH2_IPR000980_Phosphoserine_Phosphothreonine.ann


In [27]:
# Manual step before running this cell: features that overlapped in both
# resources were replaced with their combined/merged form.
# Then write one annotation track per combined feature file.
for mod, feature_file in new_feature_files.items():
    annotation_file = feature_file.replace('.feature', '.ann')
    jalviewFunctions.print_ann_file(
        feature_file,
        alignment_file,
        annotation_file,
    )

Wrote annotation track at Data/Features/PTMS_all/SH2_IPR000980_N6-acetyllysine.ann
Wrote annotation track at Data/Features/PTMS_all/SH2_IPR000980_Phosphoserine.ann
Wrote annotation track at Data/Features/PTMS_all/SH2_IPR000980_Phosphothreonine.ann
Wrote annotation track at Data/Features/PTMS_all/SH2_IPR000980_Phosphotyrosine.ann
Wrote annotation track at Data/Features/PTMS_all/SH2_IPR000980_Ubiquitination.ann


In [39]:
# Combine all PTMs into one final file.
# The files for all PTMs were updated to use just one modification name each;
# those edited files are in PTMs_final.
ptm_feature_all_dir = 'Data/Features/PTMs_all/PTMs_final/'
final_prefix = ptm_feature_all_dir + 'SH2_IPR000980_'

# Annotation track for the combined phosphoserine/phosphothreonine file.
ST_file = final_prefix + 'Phosphoserine_Phosphothreonine.feature'
jalviewFunctions.print_ann_file(ST_file, alignment_file, ST_file.replace('.feature', '.ann'))

# Annotation track for each single-modification file, collecting the feature
# file paths in the order they are later combined.
final_mods = ('N6-acetyllysine', 'Phosphotyrosine', 'Phosphoserine', 'Phosphothreonine', 'Ubiquitination')
feature_file_list = []
for mod in final_mods:
    feature_file = final_prefix + mod + '.feature'
    jalviewFunctions.print_ann_file(feature_file, alignment_file, feature_file.replace('.feature', '.ann'))
    feature_file_list.append(feature_file)

# Merge the single-modification files into one feature file and annotate it.
output_final_PTMs_all = final_prefix + 'PTMs_all.feature'
feature_combined, feature_color_dict = CoDIAC.jalviewFunctions.combine_feature_files(
    output_final_PTMs_all,
    feature_file_list,
)
jalviewFunctions.print_ann_file(
    output_final_PTMs_all,
    alignment_file,
    output_final_PTMs_all.replace('.feature', '.ann'),
)

Wrote annotation track at Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_Phosphoserine_Phosphothreonine.ann
Wrote annotation track at Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_N6-acetyllysine.ann
Wrote annotation track at Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_Phosphotyrosine.ann
Wrote annotation track at Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_Phosphoserine.ann
Wrote annotation track at Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_Phosphothreonine.ann
Wrote annotation track at Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_Ubiquitination.ann
Created Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_PTMs_all.feature
Wrote annotation track at Data/Features/PTMs_all/PTMs_final/SH2_IPR000980_PTMs_all.ann
