In [9]:
# imports
import xml.etree.ElementTree as ET
import pandas as pd

In [10]:
# Prefix -> namespace URI map; passed as `namespaces=` to find/findall so that
# element paths can be written with the short 'uniprot:' prefix.
namespaces = dict(
    uniprot='http://uniprot.org/uniprot',
    xsi='http://www.w3.org/2001/XMLSchema-instance',
)

# Parse the UniProt XML file into an in-memory element tree.
tree = ET.parse('data/decompressed/uniprot.xml')

# Sample of the XML elements that hold the protein and gene names:
#   <recommendedName>
#       <fullName evidence="4">Adenosylhomocysteinase</fullName>
#   <gene>
#       <name evidence="4 9" type="primary">ahcY</name>


In [22]:

# Result table kept as a list of rows (later written to the output file).
# The single row defined here holds the column names of that output.
gene_info_list = [
    [
        "ACCESSION",
        "UNIPROT_NAME",
        "GID",
        "GENE_NAME",
        "SUBMITTED_PROTEIN_NAME",
        "RECOMMENDED_PTN_NAME",
    ],
]



In [23]:
# get the root element
root = tree.getroot()

# Find all <entry> elements within the 'uniprot' namespace
entry_elements = root.findall('uniprot:entry', namespaces=namespaces)


def _matched_text(parent, path, index=-1):
    """Return the text of one element matching ``path`` below ``parent``.

    Parameters
    ----------
    parent : xml.etree.ElementTree.Element
        Element to search in.
    path : str
        ElementTree path, using the prefixes defined in ``namespaces``.
    index : int, default -1
        Which match to use when several elements match
        (-1 = last match, 0 = first match).

    Returns
    -------
    str or None
        The ``.text`` of the selected element, or None when nothing matches
        (or when the selected element has no text).
    """
    matches = parent.findall(path, namespaces=namespaces)
    return matches[index].text if matches else None


# Keep only the header row, so that re-running this cell does not append a
# second copy of every entry to gene_info_list.
del gene_info_list[1:]

# Iterate through the <entry> elements, collecting one output row per entry
for entry_element in entry_elements:

    # An entry may list several <accession> elements; UniProt puts the primary
    # accession first (the others are secondary), so take the first match.
    accession = _matched_text(entry_element, 'uniprot:accession', index=0)

    # UniProt entry name (<name> directly below <entry>)
    uniprot_id = _matched_text(entry_element, 'uniprot:name')

    # For the fields below, the last match wins if an entry has several.

    # locus tag, e.g. PP_4976
    ordered_locus = _matched_text(
        entry_element, 'uniprot:gene/uniprot:name[@type="ordered locus"]')

    # primary gene name, if present
    primary_name = _matched_text(
        entry_element, 'uniprot:gene/uniprot:name[@type="primary"]')

    # TODO: need to add check if it is NA!

    # protein names: an entry has a submitted and/or a recommended full name
    # TODO: check different names and also PP_0135
    submitted_fullName = _matched_text(
        entry_element, 'uniprot:protein/uniprot:submittedName/uniprot:fullName')
    rec_fullName = _matched_text(
        entry_element, 'uniprot:protein/uniprot:recommendedName/uniprot:fullName')

    # finally, update the gene information list
    gene_info_list.append([accession, uniprot_id, ordered_locus, primary_name, submitted_fullName, rec_fullName])


In [24]:
# Display the collected rows; the first row is the header.
gene_info_list

[['ACCESSION',
  'UNIPROT_NAME',
  'GID',
  'GENE_NAME',
  'SUBMITTED_PROTEIN_NAME',
  'RECOMMENDED_PTN_NAME'],
 ['A0A140FW83', 'A0A140FW83_PSEPK', 'PP_5529', None, 'Epimerase', None],
 ['A0A140FWI4',
  'A0A140FWI4_PSEPK',
  'PP_5631',
  None,
  'Uncharacterized protein',
  None],
 ['A0A140FWN3',
  'A0A140FWN3_PSEPK',
  'PP_5681',
  None,
  'Uncharacterized protein',
  None],
 ['A0A140FWS3',
  'A0A140FWS3_PSEPK',
  'PP_4976',
  'ahcY',
  None,
  'Adenosylhomocysteinase'],
 ['P59308',
  'ARGC2_PSEPK',
  'PP_3633',
  'argC2',
  None,
  'N-acetyl-gamma-glutamyl-phosphate reductase 2'],
 ['Q88BZ0', 'Q88BZ0_PSEPK', 'PP_5393', None, 'Metal-binding chaperone', None],
 ['Q88BZ9', 'Q88BZ9_PSEPK', 'PP_5384', 'copS', None, 'Sensor protein'],
 ['Q88C68', 'Q88C68_PSEPK', 'PP_5315', 'rubA', None, 'Rubredoxin'],
 ['Q88CB5',
  'Q88CB5_PSEPK',
  'PP_5266',
  None,
  'Acetyl-CoA hydrolase family protein',
  None],
 ['Q88CN9', 'TYSY_PSEPK', 'PP_5141', 'thyA', None, 'Thymidylate synthase'],
 ['Q88CS7',
  

In [25]:
# Build the table directly from the collected rows: the first row of
# gene_info_list supplies the column names, the remaining rows are the data.
# (Passing the header via `columns=` avoids loading it as a data row and then
# having to strip it off again.)
df = pd.DataFrame(gene_info_list[1:], columns=gene_info_list[0])

# Save as a tab-separated file, without the row index.
df.to_csv('GID_names_table.tsv', sep="\t", index=False)


# # save the parsed data in a TSV file
# with open('resources/fGO.tsv', 'w', newline='') as file:
	
# 	writer = csv.writer(file, delimiter="\t")
	
# 	for newEntry in gene_info_list:
# 		print(newEntry)
# 		writer.writerow(newEntry)
