In [1]:
## Import libraries
import pandas as pd
# from sklearn.preprocessing import MinMaxScaler
from Bio import Seq

class UCAY():
    
    def __init__(self, targetDNA, k):
        self.targetDNA = str(targetDNA)
        self.k = int(k)
        
        self.SEQ_PROMOTOR = 'GGG'
        self.SEQ_SPACER_1 = 'CCUC'
        self.SEQ_UCAYPROTEIN_1 = 'AUGUAUUCAUAUACAU'
        self.SEQ_SPACER_2 = 'CUCCUCCUCCUCCUCU'
        self.SEQ_UCAYPROTEIN_2 = 'AUGUAUUCAUAUACAU'
        self.SEQ_SPACER_3 = 'CGAGAGGAGGAGGAGGAGGAGG'

        self.len_common = len(self.SEQ_PROMOTOR+self.SEQ_SPACER_1+self.SEQ_UCAYPROTEIN_1+self.SEQ_SPACER_2+self.SEQ_UCAYPROTEIN_2+self.SEQ_SPACER_3)

        # Read DNA
        f_targetDNA = open('../data/'+self.targetDNA+'.txt', 'r', encoding='utf-8')
        self.dna_sense = f_targetDNA.read()
        f_targetDNA.close()
        
    def convert_dna_to_rna(self):
        old_chars = ['A', 'T', 'G', 'C']
        new_chars = ['A', 'U', 'G', 'C']
        self.rna = ''.join([new_chars[old_chars.index(c)] if c in old_chars else c for c in self.dna_sense])

    def extract_all_kmers(self):
        self.convert_dna_to_rna()
        self.kmers = [self.rna[i:i+self.k] for i in range(len(self.rna)-self.k+1)]

    def export_fasta(self):
        self.extract_all_kmers()
        text = ''
        i = 0
        for kmer in self.kmers:
            text += '>' + str(i) + '\n' + self.SEQ_PROMOTOR + kmer + self.SEQ_SPACER_1 + self.SEQ_UCAYPROTEIN_1 + self.SEQ_SPACER_2 + self.SEQ_UCAYPROTEIN_2 + self.SEQ_SPACER_3 + '\n'
            i += 1
        text = text[:-1]
        f_fasta = open('../analysis/'+self.targetDNA+'_'+str(self.k)+'mer.fa', 'w')
        f_fasta.write(text)
        f_fasta.close()

    def set_sequence(self):
        self.extract_all_kmers()
        self.score['sequence'] = self.kmers

    def set_structure_mxfold2(self):
        with open('../analysis/'+self.targetDNA+'_'+str(self.k)+'mer_mxfold2.txt', 'r') as f_structure:
            lines = f_structure.readlines()
        lines_rstrip = [line.rstrip("\n") for line in lines]
        for i in range(len(self.score)):
            self.score['structure_mxfold2'][i] = lines_rstrip[3*i+2][0:self.k+self.len_common]

    def score_cc9_A(self):
        for i in range(len(self.score)):
            self.score['cc9_A'][i] = self.score['sequence'][i].count('A')/len(self.score['sequence'][i])
    
    def score_cc9_U_G(self):
        for i in range(len(self.score)):
            self.score['cc9_U_G'][i] = -1 * (self.score['sequence'][i].count('U') + self.score['sequence'][i].count('G') ) / len(self.score['sequence'][i])

    def score_cc9_A_middle(self):
        for i in range(len(self.score)):
            self.score['cc9_A_middle'][i] = (self.score['sequence'][i][int(len(self.score['sequence'][i])/4) : int(3*len(self.score['sequence'][i])/4)].count('A')) / (len(self.score['sequence'][i])/2)
    
    def score_cc9_AG_CA_AC_UA(self):
        for i in range(len(self.score)):
            k2mers = [self.score['sequence'][i][j:j+2] for j in range(len(self.score['sequence'][i])-2+1)]
            count = 0
            for j in k2mers:
                count += 1 if j == 'AG' or j == 'CA' or j == 'AC' or j == 'UA' else 0
            self.score['cc9_AG_CA_AC_UA'][i] = count/(len(self.score['sequence'][i])-1)
    
    def score_cc9_UU_GC(self):
        for i in range(len(self.score)):
            k2mers = [self.score['sequence'][i][j:j+2] for j in range(len(self.score['sequence'][i])-2+1)]
            count = 0
            for j in k2mers:
                count += 1 if j == 'UU' or j == 'GC' else 0
            self.score['cc9_UU_GC'][i] = -1 * (count / (len(self.score['sequence'][i])-1))
    
    def score_cc9_C_3(self):
        for i in range(len(self.score)):
            self.score['cc9_C_3'][i] = 1 if self.score['sequence'][i][2] == 'C' else 0
    
    def score_cc9_GG_GGG(self):
        for i in range(len(self.score)):
            k2mers = [self.score['sequence'][i][j:j+2] for j in range(len(self.score['sequence'][i])-2+1)]
            count_GG = 0
            for j in k2mers:
                count_GG += 1 if j == 'GG' else 0
            k3mers = [self.score['sequence'][i][j:j+3] for j in range(len(self.score['sequence'][i])-3+1)]
            count_GGG = 0
            for j in k3mers:
                count_GGG += 1 if j == 'GGG' else 0
            self.score['cc9_GG_GGG'][i] = -1 * ((count_GG + count_GGG) / ((len(self.score['sequence'][i]) - 1) + (len(self.score['sequence'][i])-2)))
            
    def score_pr_G_C(self):
        for i in range(len(self.score)):
            self.score['pr_G_C'][i] = -1 * abs(((self.score['sequence'][i].count('G') + self.score['sequence'][i].count('C')) - len(self.score['sequence'][i])/2)) / (len(self.score['sequence'][i])/2)
    
    def score_pr_GC_VS_AU(self):
        for i in range(len(self.score)):
            count_AUrich = 0
            count_GCrich = 0
            for j in range(int(len(self.score['sequence'][i])/10)):
                count_AUrich += 1 if (self.score['sequence'][i][10*i:10*(j+1)].count('A')+self.score['sequence'][i][10*j:10*(j+1)].count('U')) \
                > (self.score['sequence'][i][10*j:10*(j+1)].count('G')+self.score['sequence'][i][10*j:10*(j+1)].count('C')) else 0
                count_GCrich += 1 if (self.score['sequence'][i][10*i:10*(j+1)].count('A')+self.score['sequence'][i][10*j:10*(j+1)].count('U')) \
                < (self.score['sequence'][i][10*j:10*(j+1)].count('G')+self.score['sequence'][i][10*j:10*(j+1)].count('C')) else 0
            self.score['pr_GC_VS_AU'][i] = -1*abs(count_AUrich-count_GCrich)/(len(self.score['sequence'][i])/10)
    
    def score_pr_G_C_3p(self):
        for i in range(len(self.score)):
            self.score['pr_G_C_3p'][i] = 1 if self.score['sequence'][i][-1] == 'G' or self.score['sequence'][i][-1] == 'C' else 0
    
    def score_pr_U_3p(self):
        for i in range(len(self.score)):
            self.score['pr_U_3p'][i] = -1 if self.score['sequence'][i][-1] == 'U' else 0

    def score_pr_comp(self):
        def complementary_sequence_rna(sequence):
            old_chars = ['A', 'U', 'G', 'C']
            new_chars = ['U', 'A', 'C', 'G']
            return (''.join([new_chars[old_chars.index(c)] if c in old_chars else c for c in sequence]))[::-1]
        
        for i in range(len(self.score)):
            count = 0
            k3mer = [self.score['sequence'][i][j:j+3] for j in range(len(self.score['sequence'][i])-3+1)]
            kr3mer = [self.score['sequence'][i][::-1][j:j+3] for j in range(len(self.score['sequence'][i])-3+1)]

            for j in range(len(k3mer)-2):
                for k in range(len(kr3mer)-2):
                    count += 1 if complementary_sequence_rna(k3mer[j])==kr3mer[k] else 0
                    
            self.score['pr_comp'][i] = -1*(count/((len(self.score['sequence'][i])-2)*(len(self.score['sequence'][i])-2)/2))
    
    def score_both_AAAA_CCCC_GGGG_UUUU(self):
        for i in range(len(self.score)):
            k4mer = [self.score['sequence'][i][j:j+4] for j in range(len(self.score['sequence'][i])-4+1)]
            count = 0
            for j in k4mer:
                count += 1 if j=='AAAA' or j=='CCCC' or j=='GGGG' or j=='UUUU' else 0
            self.score['both_AAAA_CCCC_GGGG_UUUU'][i] = -1 * (count/(len(self.score['sequence'][i])-3))

    def score_both(self):
        self.score['score_both'] = self.score['cc9_A'] + self.score['cc9_U_G'] + self.score['cc9_A_middle'] + self.score['cc9_AG_CA_AC_UA'] + self.score['cc9_UU_GC'] + \
                                self.score['cc9_C_3'] + self.score['cc9_GG_GGG'] + self.score['pr_G_C'] + self.score['pr_GC_VS_AU'] + self.score['pr_G_C_3p'] + \
                                self.score['pr_U_3p'] + self.score['pr_comp'] + self.score['both_AAAA_CCCC_GGGG_UUUU']
        self.score['score_both'] = (self.score['score_both'] - min(self.score['score_both']))/(max(self.score['score_both'])-min(self.score['score_both']))
    
    def score_cc9(self):
        self.score['score_cc9'] = self.score['cc9_A'] + self.score['cc9_U_G'] + self.score['cc9_A_middle'] + self.score['cc9_AG_CA_AC_UA'] + self.score['cc9_UU_GC'] + \
                                self.score['cc9_C_3'] + self.score['cc9_GG_GGG'] + self.score['both_AAAA_CCCC_GGGG_UUUU']
        self.score['score_cc9'] = (self.score['score_cc9'] - min(self.score['score_cc9']))/(max(self.score['score_cc9'])-min(self.score['score_cc9']))

    def score_pr(self):
        self.score['score_pr'] = self.score['pr_G_C'] + self.score['pr_GC_VS_AU'] + self.score['pr_G_C_3p'] + self.score['pr_U_3p'] + self.score['pr_comp'] + self.score['both_AAAA_CCCC_GGGG_UUUU']
        self.score['score_pr'] = (self.score['score_pr'] - min(self.score['score_pr']))/(max(self.score['score_pr'])-min(self.score['score_pr']))

    def score_mxfold2(self):
        structure_ideal = '.'*len(self.SEQ_PROMOTOR) + '.'*self.k + '('*len(self.SEQ_SPACER_1) + '((((((....))))))' + '('*len(self.SEQ_SPACER_2) + '((((((....))))))' + \
                            '..' + ')'*len(self.SEQ_SPACER_1+self.SEQ_SPACER_2)
        for i in range(len(self.score)):
            count = 0
            for j in range(len(self.score['structure_mxfold2'][i])):
                count += 1 if self.score['structure_mxfold2'][i][j] == structure_ideal[j] else 0
            self.score['score_mxfold2'][i] = count
        self.score['score_mxfold2'] = (self.score['score_mxfold2']-min(self.score['score_mxfold2']))/(max(self.score['score_mxfold2'])-min(self.score['score_mxfold2']))

    def score_both_mxfold2(self):
        self.score['score_both_mxfold2'] = self.score['score_both'] + self.score['score_mxfold2']
    
    def score(self):
        # Define score talbes
        self.score = pd.DataFrame(columns=['sequence', 'structure_rnafold', 'structure_mxfold2', 'structure_ufold', 'cc9_A', 'cc9_U_G', 'cc9_A_middle', 'cc9_AG_CA_AC_UA', \
                                           'cc9_UU_GC', 'cc9_C_3', 'cc9_GG_GGG', 'pr_G_C', 'pr_GC_VS_AU', 'pr_G_C_3p', 'pr_U_3p', 'pr_comp', 'both_AAAA_CCCC_GGGG_UUUU', \
                                           'score_structure', 'score_cc9', 'score_pr', 'score_both', 'score_mxfold2', 'score_both_mxfold2'])
        self.set_sequence()
        self.set_structure_mxfold2()
        self.score_cc9_A()
        self.score_cc9_U_G()
        self.score_cc9_A_middle()
        self.score_cc9_AG_CA_AC_UA()
        self.score_cc9_UU_GC()
        self.score_cc9_C_3()
        self.score_cc9_GG_GGG()
        self.score_pr_G_C()
        self.score_pr_GC_VS_AU()
        self.score_pr_G_C_3p()
        self.score_pr_U_3p()
        self.score_pr_comp()
        self.score_both_AAAA_CCCC_GGGG_UUUU()

        self.score_both()
        self.score_cc9()
        self.score_pr()

        self.score_mxfold2()
        self.score_both_mxfold2()
        # Export to csv
        self.score.to_csv('../analysis/score_'+self.targetDNA+'_'+str(self.k)+'mer.csv')

In [2]:
gRNA_eGFP_30 = UCAY('eGFP', 30)
# gRNA_eGFP_30.export_fasta()
gRNA_eGFP_30.score()

gRNA_eGFP_50 = UCAY('eGFP', 50)
# gRNA_eGFP_50.export_fasta()
gRNA_eGFP_50.score()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  self.score['structure_mxfold2'][i] = lines_rstrip[3*i+2][0:self.k+self.len_common]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will nev