# Script for uploading our rProtein sequences

Uses a pre-generated CSV file with the following columns, in this order:

*Txid*, *Accession*, *Origin database*, *Protein name*, *Description*, and *Full sequence*

Updates tables: **Polymer_Data**, **Polymer_metadata**, and **Residues**

In [1]:
#!/usr/bin/env python3
import csv, sys, getopt, getpass, mysql.connector

def usage():
	'''
	Prints the command-line help text for this script to stdout.
	'''
	help_lines = (
		"USAGE:",
		"./upload_accession.py -c [csv_file_path]-h",
		"\t-c: defines path to csv file with txids, accessions, database, protein name, description, and sequence.\tREQUIRED",
		"\t-h: prints this",
	)
	# One print call; the lines are separated by newlines and print adds the final one.
	print("\n".join(help_lines))

# Parse the command-line options.
# csv_path starts as None so that a missing -c/--csv option can be detected
# below, instead of surfacing later as a NameError when the path is first used.
csv_path = None

try:
	opts, args = getopt.getopt(sys.argv[1:], 'c:h', ['csv=', 'help'])
except getopt.GetoptError:
	# Unknown option, or an option that is missing its argument.
	usage()
	sys.exit(2)

# getopt only returns options named in the spec above (anything else raises
# GetoptError), so no fallback branch is needed here.
for opt, arg in opts:
	if opt in ('-h', '--help'):
		usage()
		sys.exit(2)
	elif opt in ('-c', '--csv'):
		csv_path = arg

# The usage text marks -c as REQUIRED: stop here when it was not given.
if csv_path is None:
	usage()
	sys.exit(2)

# Ask for the database credentials interactively; getpass keeps the password
# from being echoed to the terminal.
uname = input("User name: ")
pw = getpass.getpass("Password: ")

# Module-level connection and cursor used by the query functions in this script.
cnx = mysql.connector.connect(
	host='130.207.36.75',
	database='DESIRE',
	user=uname,
	password=pw,
)
cursor = cnx.cursor()

def read_csv(csv_path):
	'''
	Reads a csv file and returns all of its rows.

	csv_path: path of the csv file to read.
	Returns a list with one list of strings per row, in file order.
	'''
	# newline='' is what the csv module expects from the file object: it lets
	# the reader handle line endings itself, so line breaks embedded inside
	# quoted fields are kept as written instead of being translated.
	with open(csv_path, 'r', newline='') as csv_file:
		return list(csv.reader(csv_file))

def superkingdom_info(ID):
	'''
	Gets the superkingdom for a strain ID

	Runs a query on the module-level cursor that joins Species_TaxGroup,
	TaxGroups and Species, restricted to groupLevel 'superkingdom' and to the
	given strain_id.

	ID: strain id to look up.
	Returns the groupName found in the first result row.
	Raises ValueError when the query returns no rows.
	'''
	query = (
		"SELECT DESIRE.TaxGroups.groupName FROM DESIRE.Species_TaxGroup "
		"INNER JOIN DESIRE.TaxGroups ON DESIRE.Species_TaxGroup.taxgroup_id=DESIRE.TaxGroups.taxgroup_id "
		"INNER JOIN DESIRE.Species ON DESIRE.Species_TaxGroup.strain_id=DESIRE.Species.strain_id "
		"WHERE DESIRE.TaxGroups.groupLevel = 'superkingdom' AND DESIRE.Species.strain_id = %s"
	)
	# The id is bound as a parameter rather than concatenated into the SQL
	# text, so the driver takes care of quoting and escaping.
	cursor.execute(query, (ID,))
	results = cursor.fetchall()
	if not results:
		raise ValueError ("No result for specie "+str(ID)+" in the MYSQL query")
	return results[0][0]

def check_nomo_id(occur, prot_name):
	'''
	Gets nom_id for new name and superkingdom

	Runs a query on the module-level cursor that joins Nomenclature with
	Old_name, restricted to Old_name.N_B_Y_H_A = 'BAN', the given old name and
	the given occurrence.

	occur: value matched against Nomenclature.occurrence.
	prot_name: value matched against Old_name.old_name.
	Returns the nom_id found in the first result row.
	Raises ValueError when the query returns no rows.
	'''
	query = (
		"SELECT DESIRE.Nomenclature.nom_id FROM DESIRE.Nomenclature "
		"INNER JOIN DESIRE.Old_name ON DESIRE.Nomenclature.nom_id=DESIRE.Old_name.nomo_id "
		"WHERE DESIRE.Old_name.old_name = %s AND DESIRE.Old_name.N_B_Y_H_A = 'BAN' "
		"AND DESIRE.Nomenclature.occurrence = %s"
	)
	# Both values are bound as parameters rather than concatenated into the
	# SQL text, so names containing quotes cannot break the statement.
	cursor.execute(query, (prot_name, occur))
	result = cursor.fetchall()
	if not result:
		raise ValueError ("No result for nom_id "+prot_name+" and occurrence "+occur+" in the MYSQL query")
	return result[0][0]

def upload_resi(poldata_id, fullseq):
	'''
	Inserts one row into Residues for every element of a sequence.

	poldata_id: value stored in the PolData_id column of every inserted row.
	fullseq: sequence to upload; each element becomes one row, stored in
		unModResName, with resNum counting up from 1.
	Returns True once all rows have been sent to the module-level cursor.
	'''
	query = "INSERT INTO `DESIRE`.`Residues`(`PolData_id`,`resNum`,`unModResName`) VALUES(%s,%s,%s)"
	# Values are bound as parameters; enumerate supplies the 1-based position.
	for res_num, resi in enumerate(fullseq, start=1):
		cursor.execute(query, (poldata_id, res_num, resi))
	return True

def main():
	'''
	Uploads every row of the csv file given on the command line.

	For each row it looks up the superkingdom of the txid (column 0) and the
	nom_id for the protein name (column 3), inserts a Polymer_Data row with the
	accession (column 1) and description (column 4), inserts a
	Polymer_metadata row with the full sequence (column 5), and finally
	uploads the sequence element by element through upload_resi. Column 2 is
	not read. Empty csv rows are skipped.

	Raises ValueError (from the lookup helpers) when a txid or protein name
	has no match in the database.
	'''
	polymer_data_query = (
		"INSERT INTO `DESIRE`.`Polymer_Data`(`GI`,`strain_ID`,`nomgd_id`, `GeneDescription`) "
		"VALUES(%s,%s,%s,%s)"
	)
	polymer_metadata_query = (
		"INSERT INTO `DESIRE`.`Polymer_metadata`(`polymer_id`,`accession_type`,`polymer_type`, `Fullseq`) "
		"VALUES(%s,'LDW-prot','protein',%s)"
	)
	csv_list = read_csv(csv_path)
	for entry in csv_list:
		# csv.reader yields an empty list for a blank line; nothing to upload.
		if not entry:
			continue
		txid, accession = entry[0], entry[1]
		prot_name, description, fullseq = entry[3], entry[4], entry[5]
		superK = superkingdom_info(txid)
		# Only the first character of the superkingdom name is used for the lookup.
		nom_id = check_nomo_id(superK[0], prot_name)
		# Values are bound as parameters, so free text such as a description
		# containing an apostrophe cannot break the statement.
		polymer_data_values = (accession, str(txid), str(nom_id), description)
		print(polymer_data_query, polymer_data_values)
		cursor.execute(polymer_data_query, polymer_data_values)
		# Id generated for the Polymer_Data row; links the metadata and residue rows to it.
		lastrow_id = str(cursor.lastrowid)
		cursor.execute(polymer_metadata_query, (lastrow_id, fullseq))
		upload_resi(lastrow_id, fullseq)
	

# Run the upload, then always release the cursor and the connection, even
# when main() raises part-way through.
try:
	if __name__ == "__main__":
		main()

	# NOTE(review): the commit is disabled here. Unless autocommit is enabled
	# on the connection, the inserts above are presumably discarded when the
	# connection closes - confirm whether this is an intentional dry run.
	#cnx.commit()
finally:
	cursor.close()
	cnx.close()
# Only reached when no exception was raised above.
print("Success!")

USAGE:
./upload_accession.py -c [csv_file_path]-h
	-c: defines path to csv file with txids, accessions, database, protein name, description, and sequence.	REQUIRED
	-h: prints this


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
