Skip to content

Commit

Permalink
Updated mzIdentML/spec.rb
Browse files Browse the repository at this point in the history
  • Loading branch information
JesseJ committed Jul 16, 2010
1 parent 27dbfad commit 5802e3e
Show file tree
Hide file tree
Showing 16 changed files with 42,138 additions and 531 deletions.
6 changes: 6 additions & 0 deletions mzIdentML/format.rb
Expand Up @@ -113,6 +113,12 @@ def conformScoreName(name, engine)
"mascot"
when "OMSSA"
"OMSSA"
when "Tide"
"sequest"
when "Phenyx"
"Phenyx"
when "SpectraST"
"SpectraST"
end

[base, name].join(':')
Expand Down
20,222 changes: 20,222 additions & 0 deletions mzIdentML/mascot-key.mzid

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions mzIdentML/obo_converter.rb
@@ -1,12 +1,12 @@
require 'nokogiri'
require 'yaml'

#This program will convert the psi-ms.obo file into a format that's easier to parse.
#Creates the mzid_name and the pepxml_name to allow for conversion from pepxml names to mzid names. Sadly, pepxml names which
#differ from mzid names will have to be set by hand.
# This program will convert the psi-ms.obo file into a format that's easier to parse.
# Creates the mzid_name and the pepxml_name to allow for conversion from pepxml names to mzid names. Sadly, pepxml names which
# differ from mzid names will have to be set by hand.


#Create yaml file
# Create yaml file
file = File.new("#{File.dirname($0)}/obo.yaml", "w")
obo = File.open("#{File.dirname($0)}/psi-ms.obo", "r")
yml = []
Expand Down
3 changes: 3 additions & 0 deletions mzIdentML/oboe.yaml
Expand Up @@ -4085,6 +4085,9 @@
- :pepxml_name: xtandem
:id: MS:1001476
:mzid_name: xtandem
- :pepxml_name: X! Tandem
:id: MS:1001476
:mzid_name: xtandem
- :pepxml_name: SpectraST
:id: MS:1001477
:mzid_name: SpectraST
Expand Down
562 changes: 562 additions & 0 deletions mzIdentML/omssa-key.mzid

Large diffs are not rendered by default.

39 changes: 21 additions & 18 deletions mzIdentML/pepxml.rb
Expand Up @@ -2,7 +2,10 @@
require "#{File.dirname($0)}/natcmp.rb"
require "ms/fasta.rb"

# The pepXML implementation of Format
class PepXML < Format
# file == a string containing the pepXML file location
# database == a string containing the FASTA database that was used by the search engine
def initialize(file, database)
super
@type = "pepxml"
Expand All @@ -11,7 +14,7 @@ def initialize(file, database)
@sequences = 0
@proteinIndices = []

#Nokogiri won't parse out the information of an XML file that uses namespaces unless you add xmlns, and vice versa.
# Nokogiri won't parse out the information of an XML file that uses namespaces unless you add xmlns, and vice versa.
@xmlns = "xmlns:" if hasNamespace

findAllPepLocations
Expand All @@ -34,27 +37,27 @@ def database
@database
end

#Retrieves the date in the pepXML file
# Returns, as a String, the +date+ attribute of the msms_pipeline_analysis
# element. @xmlns is interpolated in front of the element name so the same
# XPath works whether or not the document was parsed with a namespace.
def date
  date_attribute = @doc.xpath("#{@xmlns}msms_pipeline_analysis/@date")
  date_attribute.to_s
end

#Retrieves the number of database sequences
# Returns the number of database sequences, as stored in @sequences.
# The constructor initialises @sequences to 0; the code that updates it is
# not visible in this excerpt.
def numberOfSequences
@sequences
end

#Retrieves the name of the search engine
# Returns the name of the search engine, as stored in @engine.
# NOTE(review): @engine is assigned outside the visible excerpt; presumably
# it is read from the pepXML file -- confirm.
def searchEngine
@engine
end

#Simply returns 0 because I don't know how to obtain the threshold from pepXML
# Placeholder: always returns 0, because it is not yet known how to obtain
# the threshold from a pepXML file.
def threshold
0
end

#Retrieves all the proteins. Not sure if this is correct.
# Retrieves all the proteins. Not sure if this is correct.
def proteins
allHits = @doc.xpath("//#{@xmlns}search_hit/@protein|//#{@xmlns}search_hit/@protein_descr")
pros = []
Expand All @@ -70,7 +73,7 @@ def proteins
@pros
end

#Retrieves all the peptides. Not sure if this is correct.
# Retrieves all the peptides. Not sure if this is correct.
def peptides
allHits = @doc.xpath("//#{@xmlns}search_hit/@peptide")
peps = []
Expand All @@ -96,12 +99,12 @@ def peptides
peps
end

#Retrieves the name of the search database that was used.
# Returns the name of the search database that was used, as stored in
# @databaseName (assigned outside the visible excerpt).
def databaseName
@databaseName
end

#Retrieves the spectrum queries. Spectrum indexs not guarenteed to be correct.
# Retrieves the spectrum queries. Spectrum indexes are not guaranteed to be correct.
def results
queries = @doc.xpath("//#{@xmlns}spectrum_query")
indicies = @doc.xpath("//#{@xmlns}spectrum_query/@spectrum").collect {|index| index.to_s}
Expand All @@ -128,7 +131,7 @@ def results

private

#Checks if the pepXML file used namespaces
# Checks if the pepXML file used namespaces
def hasNamespace
if @doc.xpath("msms_pipeline_analysis").to_s.length == 0
true
Expand All @@ -137,7 +140,7 @@ def hasNamespace
end
end

#Obtains the result items
# Obtains the result items
def getItem(hit, rank, charge)
mass = hit.xpath("./@calc_neutral_pep_mass").to_s.to_f
diff = hit.xpath("./@massdiff").to_s.to_f
Expand Down Expand Up @@ -166,7 +169,7 @@ def getItem(hit, rank, charge)
item
end

#Obtains the peptideEvidence
# Obtains the peptideEvidence
def getEvidence(hit, pep, id)
pre = hit.xpath("./@peptide_prev_aa").to_s
post = hit.xpath("./@peptide_next_aa").to_s
Expand All @@ -186,7 +189,7 @@ def getEvidence(hit, pep, id)
PepEvidence.new(id, startVal, endVal, pre, post, missedCleavages, false, ref)
end

#Gets the start and end location of the peptide
# Gets the start and end location of the peptide
def pepLocation(hit, pro, pep)
@locations.each do |location|
if location[0] == pep && location[1] == pro
Expand All @@ -197,14 +200,14 @@ def pepLocation(hit, pro, pep)
return 0, 0 #In case it doesn't find anything
end

#Obtains all peptide locations and puts them in an array in the format: [[peptide, protein, start, end]]
# Obtains all peptide locations and puts them in an array in the format: [[peptide, protein, start, end]]
def findAllPepLocations
hits = @doc.xpath("//#{@xmlns}search_hit")
all = []
@locations = []
i = 0

#Parses out each peptide and protein
# Parses out each peptide and protein
hits.each do |hit|
all << [hit.xpath("./@peptide").to_s, proteinID(hit.xpath("./@protein").to_s)]
i += 1
Expand All @@ -231,8 +234,8 @@ def findAllPepLocations
end
end

#Not all pepXML files simply list the protein ID, so this method obtains it.
#Are there other cases to cover?
# Not all pepXML files simply list the protein ID, so this method obtains it.
# Are there other cases to cover?
def proteinID(protein)
#If a protein ID contains a "|", then it contains more than just the ID
if protein.include?('|')
Expand All @@ -253,7 +256,7 @@ def proteinID(protein)
end
end

#For quickly getting the start and end indexes of a string
# For quickly getting the start and end indexes of a string
class String
def scan_i seq
pos = 0
Expand Down
36 changes: 18 additions & 18 deletions mzIdentML/search2mzidentml.rb
@@ -1,14 +1,14 @@
require "#{File.dirname($0)}/pepxml.rb"
require 'nokogiri'

#Creates an mzIdentML file from a file type created by a search engine, using the format classes such as PepXML.
# Creates an mzIdentML file from a file type created by a search engine, using the format classes such as PepXML.
class Search2mzIdentML
#format == a Format object
# format == a Format object (for example a PepXML instance). It is kept in
# @format and queried by the builder methods of this class.
def initialize(format)
@format = format
end

#Starts the Nokogiri build process. Other methods build the different parts of the file. Root is depth 0
# Starts the Nokogiri build process. Other methods build the different parts of the file. Root is depth 0
def convert(opts={})
puts "Creating file...\n\n"

Expand Down Expand Up @@ -43,7 +43,7 @@ def base_file
end
end

#Depth 1
# Depth 1
def cvList(xml)
xml.cvList {
xml.cv(:id => "PSI-MS", :fullName => "Proteomics Standards Initiative Mass Spectrometry Vocabularies", :URI => "http://psidev.cvs.sourceforge.net/viewvc/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo", :version => "2.32.0")
Expand All @@ -52,7 +52,7 @@ def cvList(xml)
}
end

#Depth 1
# Depth 1
def analysisSoftwareList(xml)
xml.AnalysisSoftwareList {
xml.AnalysisSoftware(:id => @format.searchEngine) {
Expand All @@ -64,7 +64,7 @@ def analysisSoftwareList(xml)
}
end

#Depth 1
# Depth 1
def provider(xml)
xml.Provider(:Software_ref => "search2mzIdentML.rb", :id => "PROVIDER") {
xml.ContactRole(:Contact_ref => "PERSON_DOC_OWNER") {
Expand All @@ -75,15 +75,15 @@ def provider(xml)
}
end

#Depth 1
# Depth 1
# Emits the <SequenceCollection> element and fills it by delegating to
# dBSequences and then peptides, in that order.
# xml == the builder object the element is written to
def sequenceCollection(xml)
  xml.SequenceCollection do
    dBSequences(xml)
    peptides(xml)
  end
end

#Depth 2
# Depth 2
def dBSequences(xml)
proteins = @format.proteins

Expand All @@ -94,7 +94,7 @@ def dBSequences(xml)
end
end

#Depth 2
# Depth 2
def peptides(xml)
peptides = @format.peptides

Expand All @@ -105,24 +105,24 @@ def peptides(xml)
end
end

#Depth 1
# Depth 1
def analysisCollection(xml)
xml.AnalysisCollection {
xml.SpectrumIdentification(:id => "SI", :SpectrumIdentificationProtocol_ref => "SIP", :SpectrumIdentificationList_ref => "SIL_1", :activityDate => @format.date) {
xml.InputSpectra(:SpectraData_ref => @format.file)
xml.InputSpectra(:SpectraData_ref => File.basename(@format.file))
xml.SearchDatabase(:SearchDatabase_ref => "SDB_1")
}
}
end

#Depth 1
# Depth 1
# Emits the <AnalysisProtocolCollection> element; its only content is
# produced by the SpectrumIdentificationProtocol method.
# xml == the builder object the element is written to
def analysisProtocolCollection(xml)
  xml.AnalysisProtocolCollection do
    SpectrumIdentificationProtocol(xml)
  end
end

#Depth 2
# Depth 2
def SpectrumIdentificationProtocol(xml)
xml.SpectrumIdentificationProtocol(:id => "SIP", :AnalysisSoftware_ref => @format.searchEngine) {
xml.SearchType {
Expand All @@ -139,7 +139,7 @@ def SpectrumIdentificationProtocol(xml)
}
end

#Depth 1
# Depth 1
def dataCollection(xml)
xml.DataCollection {
inputs(xml)
Expand All @@ -151,7 +151,7 @@ def dataCollection(xml)
}
end

#Depth 2
# Depth 2
def inputs(xml)
xml.Inputs {
xml.SearchDatabase(:location => @format.database, :id => "SDB_1") {
Expand All @@ -162,13 +162,13 @@ def inputs(xml)
}
end

#Depth 4
# Depth 4
def spectrumIdentificationResult(xml)
results = @format.results
i = 1

results.each do |result|
xml.SpectrumIdentificationResult(:id => "SIR_#{i}", :spectrumID => "index=#{result.index}", :SpectraData_ref => @format.file) {
xml.SpectrumIdentificationResult(:id => "SIR_#{i}", :spectrumID => "index=#{result.index}", :SpectraData_ref => File.basename(@format.file)) {
result.items.each do |item|
ident = item.ident
siiID = "SII_#{i}_#{ident.id}"
Expand All @@ -189,7 +189,7 @@ def spectrumIdentificationResult(xml)
end
end

#Depth 6
# Depth 6
def spectrumIdentificationItemVals(xml, item, siiID)
pepEv = item.pepEvidence

Expand Down
11 changes: 9 additions & 2 deletions mzIdentML/search2mzidentml_cl.rb
Expand Up @@ -39,9 +39,16 @@
# format = PepXML.new(options[:infile], options[:database])
#end

# Exactly two positional arguments are required (the input file and the
# FASTA database); with any other count, print the usage text and stop.
unless ARGV.size == 2
  puts "\nusage: #{File.basename(__FILE__)} inputFile database",
       "inputFile: The location of the file to turn into mzIdentML (Currently only supports pepXML)",
       "database: The location of the FASTA database\n\n"
  exit
end

begin
format = PepXML.new(ARGV[0], ARGV[1])
Search2mzIdentML.new(format).convert
#rescue
# $stderr.print "\n\tError: #{$!}\n"
rescue
$stderr.print "\n\tError: #{$!}\n"
end
35 changes: 24 additions & 11 deletions mzIdentML/spec.rb
Expand Up @@ -4,22 +4,35 @@

describe 'PepXML2mzIdentML' do
before do
@p2mm = Search2mzIdentML.new(PepXML.new("#{File.dirname($0)}/test.pep.xml", "#{File.dirname($0)}/../databases/uni_human_var_100517_fwd.fasta"))
#@p2mo = Search2mzIdentML.new(PepXML.new("#{File.dirname($0)}/test-omssa.pep.xml", "#{File.dirname($0)}/../databases/uni_human_var_100517_fwd.fasta"))
#@p2mt = Search2mzIdentML.new(PepXML.new("#{File.dirname($0)}/test-tandem.pep.xml", "#{File.dirname($0)}/../databases/uni_human_var_100517_fwd.fasta"))
@dir = File.dirname($0)
@p2mm = Search2mzIdentML.new(PepXML.new("#{@dir}/test.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
@p2mo = Search2mzIdentML.new(PepXML.new("#{@dir}/test-omssa.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
@p2mt = Search2mzIdentML.new(PepXML.new("#{@dir}/test-tandem.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
@p2mi = Search2mzIdentML.new(PepXML.new("#{@dir}/test-tide.pep.xml", "#{@dir}/../databases/uni_human_var_100517_fwd.fasta"))
end

it 'takes a pepXML file and outputs an mzIdentML file' do
@p2mm.convert
#@p2mo.convert
#@p2mt.convert
@p2mo.convert
@p2mt.convert
@p2mi.convert

file1 = File.open("#{File.dirname($0)}/test.mzid")
file2 = File.open("#{File.dirname($0)}/test-mascot.mzid")


ok File.exist?(file1)
#FileUtils::cmp(file1, file2).is true
mascot = File.open("#{@dir}/test.mzid")
mascot_key = File.open("#{@dir}/mascot-key.mzid")

tandem = File.open("#{@dir}/test-tandem.mzid")
tandem_key = File.open("#{@dir}/tandem-key.mzid")

tide = File.open("#{@dir}/test-tide.mzid")
tide_key = File.open("#{@dir}/tide-key.mzid")

omssa = File.open("#{@dir}/test-omssa.mzid")
omssa_key = File.open("#{@dir}/omssa-key.mzid")

FileUtils::cmp(mascot, mascot_key).is true
FileUtils::cmp(tandem, tandem_key).is true
FileUtils::cmp(tide, tide_key).is true
FileUtils::cmp(omssa, omssa_key).is true
end
end

Expand Down

0 comments on commit 5802e3e

Please sign in to comment.