In [1]:
# Import libraries
import re
import math
import pandas as pd
import gspread

# Define functions

# Checks if a string is NaN
def isNaN_(string):
    return string != string

# This function takes a string as input and returns a list of all the numbers
# found in the string. It works by first removing any spaces and parenthesis 
# from the string, and then using regular expressions to find all the sequences 
# of digits in the string. Note that this function assumes that the input string 
# only contains numbers and separators such as "or", "and", and "(", ")".
def extract_numbers(string):
    # Remove any spaces and parenthesis from the string
    string = string.replace("(", "").replace(")", "").replace(" ", "")
    # Use regular expressions to extract the numbers
    numbers = re.findall(r'\d+', string)
    return numbers

# Checks if the non NaN string are equal
def compare_strings(strings):
    list_of_strings = [s for s in strings if s]
    non_zero_strings_ = [s for s in list_of_strings if not(isNaN_(s))]
    non_zero_strings = [s for s in non_zero_strings_ if s!="-----"]

    first_string = non_zero_strings[0] if non_zero_strings else ""
    if isNaN_(non_zero_strings) :
        s = []
    else:
        s = first_string

    # for s in non_zero_strings:
    #     if s != first_string and not(isNaN_(s)):
    #         s = []
    #         return False, s  

    if all(element == first_string for element in non_zero_strings):
        return True, s
    else:
        return False, s
          
    # return True, s

# Removed the spaces and returns the pure number if and only if is inbetween
# a set of parenthesis
def extract_number_from_string(string):
    stripped = string.strip()
    if stripped[0] == "(" and stripped[-1] == ")":
        if stripped[1:-1].isdigit():
            return int(stripped[1:-1])
        else:
            return None

# Returns a whole string but without the parenthesis on a number
def remove_parentheses_from_number(string, number):
    return string.replace("(" + str(number) + ")", str(number))

# Returns the gene dataset but it convertes (100761706) or (100768654) or (100757658) 
# to 100761706 or 100768654 or 100757658
def curate_genes(genes):
    gpr_set = [0] * len(genes)
    
    for counter, g in enumerate(genes):
    
        if isNaN_(g):
            gpr_set[counter] = "-----"
        else:

                


            gp=str(g)

            if gp[0] == '(' and gp[-1] == ')':
                gpr = gp[1:-1]
            elif "-" in gp:
                gpr = str(gp.replace("-", ""))
            else:
                gpr = str(gp)



            
            # Removes the dashes from the gene name            
            # if "-" in gp:
            #     gpr = str(gp.replace("-", ""))
            # else:
            #     gpr = str(gp)


            gpr_or = (gpr.split('or'))
            # print(1, gpr_or)

            for gpr_or_and in gpr_or:
                # print(gpr_or_and)
                if gpr_or_and.split('and') == gpr_or_and:
                    1
                    # print(i, 1)
                else:
                    genes_with_parenthesis = gpr_or_and.split('and')
                    # print(genes_with_parenthesis)
                    for gene_with_parenthesis in genes_with_parenthesis:
                        gpr = remove_parentheses_from_number(gpr, extract_number_from_string(gene_with_parenthesis))
                        gpr_set[counter] = gpr
                        # print(genes[counter])
        

    return gpr_set

In [4]:
#Read excel file
GEM = pd.read_excel('../../CHO Network Reconstruction.xlsx', sheet_name = "Rxns")

In [5]:
# Raw Data
Hef_Genes = GEM.iloc[:,6]
Fou_Genes = GEM.iloc[:,7]
Yeo_Genes = GEM.iloc[:,8]
Recon3D_Genes = GEM.iloc[:,9]
Final_Genes = GEM.iloc[:,10]

# Curated Data
Hef_Genes_Curate = curate_genes(Hef_Genes)
Fou_Genes_Curate = curate_genes(Fou_Genes)
Yeo_Genes_Curate = curate_genes(Yeo_Genes)
Recon3D_Genes_Curate = curate_genes(Recon3D_Genes)
# Final_Genes_Curate = curate_genes(Final_Genes)


In [6]:
# Prints all the reactions that have at least 2 of the same GPRs in the datasets
for i in range(Final_Genes.size):
    # print("Final1 = ",Final_Genes[i])
    flag = (compare_strings(list([Hef_Genes_Curate[i], Fou_Genes_Curate[i], Yeo_Genes_Curate[i], Recon3D_Genes_Curate[i]])))
    if flag[0]:
        Final_Genes[i] = flag[1]
    # else:
    #     Final_Genes[i] = "NaN"
        # input("Press Enter to continue...")
    # print("Final2 = ",Final_Genes[i])

# Perctentage of rxns that have no GPR in the Final Gene dataset
c = 0
for g in Final_Genes:
    if not isNaN_(g):
        c +=1
print((Final_Genes.size - c)/Final_Genes.size)

# Write the lists to a txt file
with open('Output.txt', 'w') as f:
    # write each item in the list to the file
    for item in Final_Genes:
        f.write(str(item) + '\n')

# with open('Output.txt', 'w') as f:
#     # write each item in the list to the file
#     for item in Fou_Genes_Curate:
#         f.write(str(item) + '\n')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_Genes[i] = flag[1]


0.28548780487804876


In [7]:
# Isolate all the genes, and create a dataframe containing them
genes = [0] * len(Hef_Genes_Curate)
counter = 0 
for i in range(len(genes)):
    genes_set = extract_numbers(Hef_Genes_Curate[i])
    for j in genes_set:
        if j not in genes:     
            genes[counter] = j
            counter += 1
    genes_set = extract_numbers(Fou_Genes_Curate[i])
    for j in genes_set:
        if j not in genes:     
            genes[counter] = j
            counter += 1
    genes_set = extract_numbers(Yeo_Genes_Curate[i])
    for j in genes_set:
        if j not in genes:     
            genes[counter] = j
            counter += 1
    genes_set = extract_numbers(Recon3D_Genes_Curate[i])
    for j in genes_set:
        if j not in genes:     
            genes[counter] = j
            counter += 1

Non_Zero_Genes = [element for element in genes if element != 0]

# Write the Genes to a txt file
with open('GeneOutput.txt', 'w') as f:
    # write each item in the list to the file
    for item in Non_Zero_Genes:
        f.write(str(item) + '\n') 