Updated mzIdentML/spec.rb

JesseJ · Jul 16, 2010 · 5802e3e · 5802e3e
1 parent 27dbfad
commit 5802e3e
Show file tree

Hide file tree

Showing 16 changed files with 42,138 additions and 531 deletions.
diff --git a/mzIdentML/format.rb b/mzIdentML/format.rb
@@ -113,6 +113,12 @@ def conformScoreName(name, engine)
           "mascot"
         when "OMSSA"
           "OMSSA"
+        when "Tide"
+           "sequest"
+        when "Phenyx"
+          "Phenyx"
+        when "SpectraST"
+          "SpectraST"
       end
 
     [base, name].join(':')

diff --git a/mzIdentML/mascot-key.mzid b/mzIdentML/mascot-key.mzid
diff --git a/mzIdentML/obo_converter.rb b/mzIdentML/obo_converter.rb
@@ -1,12 +1,12 @@
 require 'nokogiri'
 require 'yaml'
 
-#This program will convert the psi-ms.obo file into a format that's easier to parse.
-#Creates the mzid_name and the pepxml_name to allow for conversion from pepxml names to mzid names. Sadly, pepxml names which
-#differ from mzid names will have to be set by hand.
+# This program will convert the psi-ms.obo file into a format that's easier to parse.
+# Creates the mzid_name and the pepxml_name to allow for conversion from pepxml names to mzid names. Sadly, pepxml names which
+# differ from mzid names will have to be set by hand.
 
 
-#Create yaml file
+# Create yaml file
 file = File.new("#{File.dirname($0)}/obo.yaml", "w")
 obo = File.open("#{File.dirname($0)}/psi-ms.obo", "r")
 yml = []

diff --git a/mzIdentML/oboe.yaml b/mzIdentML/oboe.yaml
@@ -4085,6 +4085,9 @@
 - :pepxml_name: xtandem
   :id: MS:1001476
   :mzid_name: xtandem
+- :pepxml_name: X! Tandem
+  :id: MS:1001476
+  :mzid_name: xtandem
 - :pepxml_name: SpectraST
   :id: MS:1001477
   :mzid_name: SpectraST

diff --git a/mzIdentML/omssa-key.mzid b/mzIdentML/omssa-key.mzid
diff --git a/mzIdentML/pepxml.rb b/mzIdentML/pepxml.rb
@@ -2,7 +2,10 @@
 require "#{File.dirname($0)}/natcmp.rb"
 require "ms/fasta.rb"
 
+# The pepXML implementation of Format
 class PepXML < Format
+  # file == a string containing the pepXML file location
+  # database == a string containing the FASTA database that was used by the search engine
   def initialize(file, database)
     super
     @type = "pepxml"
@@ -11,7 +14,7 @@ def initialize(file, database)
     @sequences = 0
     @proteinIndices = []
 
-    #Nokogiri won't parse out the information of an XML file that uses namespaces unless you add xmlns, and vice versa.
+    # Nokogiri XPath queries only match elements of a namespaced XML file when the "xmlns:" prefix is used, and must omit the prefix when the file has no namespace.
     @xmlns = "xmlns:" if hasNamespace
 
     findAllPepLocations
@@ -34,27 +37,27 @@ def database
     @database
   end
 
-  #Retrieves the date in the pepXML file
+  # Retrieves the date in the pepXML file
   def date
     @doc.xpath("#{@xmlns}msms_pipeline_analysis/@date").to_s
   end
 
-  #Retrieves the number of database sequences
+  # Retrieves the number of database sequences
   def numberOfSequences
     @sequences
   end
 
-  #Retrieves the name of the search engine
+  # Retrieves the name of the search engine
   def searchEngine
     @engine
   end
 
-  #Simply returns 0 because I don't know how to obtain the threshold from pepXML
+  # Simply returns 0 because I don't know how to obtain the threshold from pepXML
   def threshold
     0
   end
 
-  #Retrieves all the proteins. Not sure if this is correct.
+  # Retrieves all the proteins. Not sure if this is correct.
   def proteins
     allHits = @doc.xpath("//#{@xmlns}search_hit/@protein|//#{@xmlns}search_hit/@protein_descr")
     pros = []
@@ -70,7 +73,7 @@ def proteins
     @pros
   end
 
-  #Retrieves all the peptides. Not sure if this is correct.
+  # Retrieves all the peptides. Not sure if this is correct.
   def peptides
     allHits = @doc.xpath("//#{@xmlns}search_hit/@peptide")
     peps = []
@@ -96,12 +99,12 @@ def peptides
     peps
   end
 
-  #Retrieves the name of the search database that was used.
+  # Retrieves the name of the search database that was used.
   def databaseName
     @databaseName
   end
 
-  #Retrieves the spectrum queries. Spectrum indexs not guarenteed to be correct.
+  # Retrieves the spectrum queries. Spectrum indexes are not guaranteed to be correct.
   def results
     queries = @doc.xpath("//#{@xmlns}spectrum_query")
     indicies = @doc.xpath("//#{@xmlns}spectrum_query/@spectrum").collect {|index| index.to_s}
@@ -128,7 +131,7 @@ def results
 
   private
 
-  #Checks if the pepXML file used namespaces
+  # Checks if the pepXML file used namespaces
   def hasNamespace
     if @doc.xpath("msms_pipeline_analysis").to_s.length == 0
       true
@@ -137,7 +140,7 @@ def hasNamespace
     end
   end
 
-  #Obtains the result items
+  # Obtains the result items
   def getItem(hit, rank, charge)
     mass = hit.xpath("./@calc_neutral_pep_mass").to_s.to_f
     diff = hit.xpath("./@massdiff").to_s.to_f
@@ -166,7 +169,7 @@ def getItem(hit, rank, charge)
     item
   end
 
-  #Obtains the peptideEvidence
+  # Obtains the peptideEvidence
   def getEvidence(hit, pep, id)
     pre = hit.xpath("./@peptide_prev_aa").to_s
     post = hit.xpath("./@peptide_next_aa").to_s
@@ -186,7 +189,7 @@ def getEvidence(hit, pep, id)
     PepEvidence.new(id, startVal, endVal, pre, post, missedCleavages, false, ref)
   end
 
-  #Gets the start and end location of the peptide
+  # Gets the start and end location of the peptide
   def pepLocation(hit, pro, pep)
     @locations.each do |location|
       if location[0] == pep && location[1] == pro
@@ -197,14 +200,14 @@ def pepLocation(hit, pro, pep)
     return 0, 0    #In case it doesn't find anything
   end
 
-  #Obtains all peptide locations and puts them in an array in the format: [[peptide, protein, start, end]]
+  # Obtains all peptide locations and puts them in an array in the format: [[peptide, protein, start, end]]
   def findAllPepLocations
     hits = @doc.xpath("//#{@xmlns}search_hit")
     all = []
     @locations = []
     i = 0
 
-    #Parses out each peptide and protein
+    # Parses out each peptide and protein
     hits.each do |hit|
       all << [hit.xpath("./@peptide").to_s, proteinID(hit.xpath("./@protein").to_s)]
       i += 1
@@ -231,8 +234,8 @@ def findAllPepLocations
     end
   end
 
-  #Not all pepXML files simply list the protein ID, so this method obtains it.
-  #Are there other cases to cover?
+  # Not all pepXML files simply list the protein ID, so this method obtains it.
+  # Are there other cases to cover?
   def proteinID(protein)
     #If a protein ID contains a "|", then it contains more than just the ID
     if protein.include?('|')
@@ -253,7 +256,7 @@ def proteinID(protein)
   end
 end
 
-#For quickly getting the start and end indexes of a string
+# For quickly getting the start and end indexes of a string
 class String
   def scan_i seq
     pos = 0

diff --git a/mzIdentML/search2mzidentml.rb b/mzIdentML/search2mzidentml.rb
@@ -1,14 +1,14 @@
 require "#{File.dirname($0)}/pepxml.rb"
 require 'nokogiri'
 
-#Creates an mzIdentML file from a file type created by a search engine, using the format classes such as PepXML.
+# Creates an mzIdentML file from a file type created by a search engine, using the format classes such as PepXML.
 class Search2mzIdentML
-  #format == a Format object
+  # format == a Format object
   def initialize(format)
     @format = format
   end
 
-  #Starts the Nokogiri build process. Other methods build the different parts of the file. Root is depth 0
+  # Starts the Nokogiri build process. Other methods build the different parts of the file. Root is depth 0
   def convert(opts={})
     puts "Creating file...\n\n"
 
@@ -43,7 +43,7 @@ def base_file
     end
   end
 
-  #Depth 1
+  # Depth 1
   def cvList(xml)
     xml.cvList {
       xml.cv(:id => "PSI-MS", :fullName => "Proteomics Standards Initiative Mass Spectrometry Vocabularies", :URI => "http://psidev.cvs.sourceforge.net/viewvc/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo", :version => "2.32.0")
@@ -52,7 +52,7 @@ def cvList(xml)
     }
   end
 
-  #Depth 1
+  # Depth 1
   def analysisSoftwareList(xml)
     xml.AnalysisSoftwareList {
       xml.AnalysisSoftware(:id => @format.searchEngine) {
@@ -64,7 +64,7 @@ def analysisSoftwareList(xml)
     }
   end
 
-  #Depth 1
+  # Depth 1
   def provider(xml)
     xml.Provider(:Software_ref => "search2mzIdentML.rb", :id => "PROVIDER") {
       xml.ContactRole(:Contact_ref => "PERSON_DOC_OWNER") {
@@ -75,15 +75,15 @@ def provider(xml)
     }
   end
 
-  #Depth 1
+  # Depth 1
   def sequenceCollection(xml)
     xml.SequenceCollection {
       dBSequences(xml)
       peptides(xml)
     }
   end
 
-  #Depth 2
+  # Depth 2
   def dBSequences(xml)
     proteins = @format.proteins
 
@@ -94,7 +94,7 @@ def dBSequences(xml)
     end
   end
 
-  #Depth 2
+  # Depth 2
   def peptides(xml)
     peptides = @format.peptides
 
@@ -105,24 +105,24 @@ def peptides(xml)
     end
   end
 
-  #Depth 1
+  # Depth 1
   def analysisCollection(xml)
     xml.AnalysisCollection {
       xml.SpectrumIdentification(:id => "SI", :SpectrumIdentificationProtocol_ref => "SIP", :SpectrumIdentificationList_ref => "SIL_1", :activityDate => @format.date) {
-        xml.InputSpectra(:SpectraData_ref => @format.file)
+        xml.InputSpectra(:SpectraData_ref => File.basename(@format.file))
         xml.SearchDatabase(:SearchDatabase_ref => "SDB_1")
       }
     }
   end
 
-  #Depth 1
+  # Depth 1
   def analysisProtocolCollection(xml)
     xml.AnalysisProtocolCollection {
       SpectrumIdentificationProtocol(xml)
     }
   end
 
-  #Depth 2
+  # Depth 2
   def SpectrumIdentificationProtocol(xml)
     xml.SpectrumIdentificationProtocol(:id => "SIP", :AnalysisSoftware_ref => @format.searchEngine) {
       xml.SearchType {
@@ -139,7 +139,7 @@ def SpectrumIdentificationProtocol(xml)
     }
   end
 
-  #Depth 1
+  # Depth 1
   def dataCollection(xml)
     xml.DataCollection {
       inputs(xml)
@@ -151,7 +151,7 @@ def dataCollection(xml)
     }
   end
 
-  #Depth 2
+  # Depth 2
   def inputs(xml)
     xml.Inputs {
       xml.SearchDatabase(:location => @format.database, :id => "SDB_1") {
@@ -162,13 +162,13 @@ def inputs(xml)
     }
   end
 
-  #Depth 4
+  # Depth 4
   def spectrumIdentificationResult(xml)
     results = @format.results
     i = 1
 
     results.each do |result|
-      xml.SpectrumIdentificationResult(:id => "SIR_#{i}", :spectrumID => "index=#{result.index}", :SpectraData_ref => @format.file) {
+      xml.SpectrumIdentificationResult(:id => "SIR_#{i}", :spectrumID => "index=#{result.index}", :SpectraData_ref => File.basename(@format.file)) {
         result.items.each do |item|
           ident = item.ident
           siiID = "SII_#{i}_#{ident.id}"
@@ -189,7 +189,7 @@ def spectrumIdentificationResult(xml)
     end
   end
 
-  #Depth 6
+  # Depth 6
   def spectrumIdentificationItemVals(xml, item, siiID)
     pepEv = item.pepEvidence
 

diff --git a/mzIdentML/search2mzidentml_cl.rb b/mzIdentML/search2mzidentml_cl.rb
@@ -39,9 +39,16 @@
 #  format = PepXML.new(options[:infile], options[:database])
 #end
 
+if ARGV.size != 2
+  puts "\nusage: #{File.basename(__FILE__)} inputFile database"
+  puts "inputFile: The location of the file to turn into mzIdentML (Currently only supports pepXML)"
+  puts "database: The location of the FASTA database\n\n"
+  exit
+end
+
 begin
   format = PepXML.new(ARGV[0], ARGV[1])
   Search2mzIdentML.new(format).convert
-#rescue
-#  $stderr.print "\n\tError: #{$!}\n"
+rescue
+  $stderr.print "\n\tError: #{$!}\n"
 end
diff --git a/mzIdentML/spec.rb b/mzIdentML/spec.rb
@@ -4,22 +4,35 @@
 
 describe 'PepXML2mzIdentML' do
   before do
-    @p2mm = Search2mzIdentML.new(PepXML.new("#{File.dirname($0)}/test.pep.xml", "#{File.dirname($0)}/../databases/uni_human_var_100517_fwd.fasta"))
-    #@p2mo = Search2mzIdentML.new(PepXML.new("#{File.dirname($0)}/test-omssa.pep.xml", "#{File.dirname($0)}/../databases/uni_human_var_100517_fwd.fasta"))
-    #@p2mt = Search2mzIdentML.new(PepXML.new("#{File.dirname($0)}/test-tandem.pep.xml", "#{File.dirname($0)}/../databases/uni_human_var_100517_fwd.fasta"))
+    @dir = File.dirname($0)
+    @p2mm = Search2mzIdentML.new(PepXML.new("#{@dir}/test.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
+    @p2mo = Search2mzIdentML.new(PepXML.new("#{@dir}/test-omssa.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
+    @p2mt = Search2mzIdentML.new(PepXML.new("#{@dir}/test-tandem.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
+    @p2mi = Search2mzIdentML.new(PepXML.new("#{@dir}/test-tide.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
   end
 
   it 'takes a pepXML file and outputs an mzIdentML file' do
     @p2mm.convert
-    #@p2mo.convert
-    #@p2mt.convert
+    @p2mo.convert
+    @p2mt.convert
+    @p2mi.convert
 
-    file1 = File.open("#{File.dirname($0)}/test.mzid")
-    file2 = File.open("#{File.dirname($0)}/test-mascot.mzid")
-
-
-    ok File.exist?(file1)
-    #FileUtils::cmp(file1, file2).is true
+    mascot = File.open("#{@dir}/test.mzid")
+    mascot_key = File.open("#{@dir}/mascot-key.mzid")
+
+    tandem = File.open("#{@dir}/test-tandem.mzid")
+    tandem_key = File.open("#{@dir}/tandem-key.mzid")
+
+    tide = File.open("#{@dir}/test-tide.mzid")
+    tide_key = File.open("#{@dir}/tide-key.mzid")
+
+    omssa = File.open("#{@dir}/test-omssa.mzid")
+    omssa_key = File.open("#{@dir}/omssa-key.mzid")
+
+    FileUtils::cmp(mascot, mascot_key).is true
+    FileUtils::cmp(tandem, tandem_key).is true
+    FileUtils::cmp(tide, tide_key).is true
+    FileUtils::cmp(omssa, omssa_key).is true
   end
 end