In [1]:
import os
import re
import pandas as pd
import numpy as np

stats = 'TBFV_ss.csv'
df = pd.read_csv(stats, skiprows=13, delimiter=";")
print(df)


    name  xrRNA-no  ST-I IL-I BL-I BR-I ST-II HL-II BL-II BR-II  ...  BL-III  \
0   ALKV         1    11    4    -    -     4     4     -     -  ...       -   
1    DTV         1    12    -    -    -     3     9     -     -  ...       -   
2    DTV         2    10    -    -    1     3     3     -     -  ...       -   
3    GGV         1    12    -    -    -     3     7     1     -  ...       -   
4    GGV         2    12    -    1    2     7     4     -     -  ...       -   
5   KFDV         1    12    2    -    -     4     4     -     -  ...       -   
6   KSIV         1    14    2    -    -     4     3     -     -  ...       -   
7   KSIV         2    13  3+2    -    -     4     9     -     -  ...       -   
8   LGTV         1    12  2+2    -    -     5     3     -     -  ...       -   
9   LGTV         2    14    -    -    1     5     7     -     -  ...       -   
10   LIV         1    12    2    1    -     3     4     -     -  ...       -   
11   LIV         2    13    -    1    1 

In [2]:
#only want xrRNA1
xrRNA1_df = df[df['xrRNA-no'] == 1]
#only want samples with both pseudoknots
xrRNA1_df_clean = xrRNA1_df[xrRNA1_df['PKs?'] == '1+2']
xrRNA1_df_clean = xrRNA1_df_clean.drop(columns=['PKs?', 'xrRNA-no', 'numSeqs'])

#only want samples with all 3 stems
xrRNA1_df_clean = xrRNA1_df_clean[xrRNA1_df_clean['ML-I/III'] == "-"]
xrRNA1_df_clean = xrRNA1_df_clean.drop(columns=['ML-I/III'])


#remove columns which have only 0s / -
col_no_values = xrRNA1_df_clean.columns[(xrRNA1_df_clean == "-").all()]
print("Columns have no data:", list(col_no_values))
xrRNA1_df_clean = xrRNA1_df_clean.drop(columns=col_no_values)

#remove columns which have only 0s / -
same_value_columns = xrRNA1_df_clean.columns[xrRNA1_df_clean.nunique() == 1]
print("Columns have only same value:")
for i in list(same_value_columns):
    print("\t", i, "=", xrRNA1_df_clean[i][0])
print("\n")
xrRNA1_df_clean = xrRNA1_df_clean.drop(columns=same_value_columns)

# change - to 0s
xrRNA1_df_clean = xrRNA1_df_clean.replace('-', 0)

# add Interloop 1 add up, (have some with 2 loops, add to one)
xrRNA1_df_clean = xrRNA1_df_clean.replace('2+2', 4)

temp_df = xrRNA1_df_clean.drop(columns='name').astype(int)
xrRNA1_df_clean['IB-I'] = temp_df['IL-I'] + temp_df['BL-I']

#double the nucleotides in Stems because we counted basepairs not nucleotides
xrRNA1_df_clean['ST-I'] = temp_df['ST-I'] * 2
xrRNA1_df_clean['ST-II'] = temp_df['ST-II'] * 2
xrRNA1_df_clean['ST-III'] = temp_df['ST-III'] * 2

print(xrRNA1_df_clean, '\n')

Columns have no data: ['BR-I', 'BR-II', 'BL-III', 'BR-III']
Columns have only same value:
	 ML-I/II = 6
	 ML-II/III = 1
	 ML-III/I = 1


    name  ST-I IL-I BL-I  ST-II HL-II BL-II  ST-III  HL-III  uPK1 PK1bPK2  \
0   ALKV    22    4    0      8     4     0      10      10     8       0   
1    DTV    24    0    0      6     9     0       8      11    12       2   
3    GGV    24    0    0      6     7     1      14       4    15       2   
5   KFDV    24    2    0      8     4     0      10      10     8       2   
6   KSIV    28    2    0      8     3     0       8      12     8       2   
8   LGTV    24    4    0     10     3     0      10      10    11       0   
10   LIV    24    2    1      6     4     0       8      12    11       0   
15  NEGV    26    0    1      6     4     0       8      12    11       0   
17  OHFV    24    2    2      6     4     0       8      12    11       0   
19  POWV    24    0    0      6     9     0       8      11    14       2   
21  SGEV    26  

In [3]:
xrRNA1_df_clean.describe()

Unnamed: 0,ST-I,ST-II,ST-III,HL-III,uPK1,IB-I
count,12.0,12.0,12.0,12.0,12.0,12.0
mean,24.333333,6.833333,9.0,10.666667,10.833333,2.333333
std,1.669694,1.337116,1.809068,2.269695,2.208798,1.775251
min,22.0,6.0,8.0,4.0,8.0,0.0
25%,24.0,6.0,8.0,10.0,9.5,0.75
50%,24.0,6.0,8.0,11.5,11.0,2.5
75%,24.5,8.0,10.0,12.0,11.25,4.0
max,28.0,10.0,14.0,12.0,15.0,5.0


In [4]:
xrRNA1_df_clean.drop(columns=['name']).median()

ST-I       24.0
IL-I        2.0
BL-I        0.0
ST-II       6.0
HL-II       4.0
BL-II       0.0
ST-III      8.0
HL-III     11.5
uPK1       11.0
PK1bPK2     0.0
IB-I        2.5
dtype: object

In [5]:
xrRNA1_df_clean.drop(columns=['name']).describe()


Unnamed: 0,ST-I,ST-II,ST-III,HL-III,uPK1,IB-I
count,12.0,12.0,12.0,12.0,12.0,12.0
mean,24.333333,6.833333,9.0,10.666667,10.833333,2.333333
std,1.669694,1.337116,1.809068,2.269695,2.208798,1.775251
min,22.0,6.0,8.0,4.0,8.0,0.0
25%,24.0,6.0,8.0,10.0,9.5,0.75
50%,24.0,6.0,8.0,11.5,11.0,2.5
75%,24.5,8.0,10.0,12.0,11.25,4.0
max,28.0,10.0,14.0,12.0,15.0,5.0


In [18]:
df = pd.read_csv(stats, skiprows=13, delimiter=";")
df_merge = df.copy()
temp_df = df.drop(columns=['name', 'PKs?', 'numSeqs']).replace('-', 0)
temp_df = temp_df.replace('3+2', 5).replace('2+2', 4).replace('4+2', 6)

# want to count nucleotides not basepairs --> multiply stems by two

temp_df = temp_df.astype(int)
temp_df['ST-I'] = temp_df['ST-I'] * 2
temp_df['ST-II'] = temp_df['ST-II'] * 2
temp_df['ST-III'] = temp_df['ST-III'] * 2

df_merge['I'] = temp_df['ST-I'] + temp_df['IL-I'] + temp_df['BL-I']
df_merge['II'] = temp_df['ST-II'] + temp_df['HL-II'] + temp_df['BL-II']
df_merge['III'] = temp_df['ST-III'] + temp_df['HL-III']
df_merge['HL'] = temp_df['HL-II'] + temp_df['HL-III']
df_merge['sum'] = temp_df[list(temp_df.columns)].sum(axis=1)
df_merge['name_'] = [f'{i} xrRNA {j}'for i, j in zip(df_merge['name'], df_merge['xrRNA-no'])]
temp_df['name_'] = [f'{i} xrRNA {j}'for i, j in zip(df_merge['name'], df_merge['xrRNA-no'])]
df_merge = df_merge[['name', 'xrRNA-no', 'name_', 'I', 'II', 'III', 'HL', 'sum']]
df_merge = df_merge.merge(temp_df, on='name_')
# print(df_merge)
df_merge = df_merge.drop(index= [28, 29, 30, 31, 26, 23, 12, 13,14]) # remove XiFV xrRNA 2 - 5, TYUV, SREV, MPFV

  temp_df = df.drop(columns=['name', 'PKs?', 'numSeqs']).replace('-', 0)


In [19]:
df_merge

Unnamed: 0,name,xrRNA-no_x,name_,I,II,III,HL,sum,xrRNA-no_y,ST-I,...,ST-III,HL-III,BL-III,BR-III,ML-I/II,ML-II/III,ML-III/I,ML-I/III,uPK1,PK1bPK2
0,ALKV,1,ALKV xrRNA 1,26,12,20,14,75,1,22,...,10,10,0,0,6,1,1,0,8,0
1,DTV,1,DTV xrRNA 1,24,15,19,20,81,1,24,...,8,11,0,0,6,1,1,0,12,2
2,DTV,2,DTV xrRNA 2,20,9,20,13,71,2,20,...,10,10,0,0,6,1,1,0,11,0
3,GGV,1,GGV xrRNA 1,24,14,18,11,82,1,24,...,14,4,0,0,6,1,1,0,15,2
4,GGV,2,GGV xrRNA 2,25,18,19,15,87,2,24,...,8,11,0,0,6,1,1,0,11,2
5,KFDV,1,KFDV xrRNA 1,26,12,20,14,77,1,24,...,10,10,0,0,6,1,1,0,8,2
6,KSIV,1,KSIV xrRNA 1,30,11,20,15,80,1,28,...,8,12,0,0,6,1,1,0,8,2
7,KSIV,2,KSIV xrRNA 2,31,17,20,19,85,2,26,...,10,10,0,0,6,1,1,0,7,0
8,LGTV,1,LGTV xrRNA 1,28,13,20,13,81,1,24,...,10,10,0,0,6,1,1,0,11,0
9,LGTV,2,LGTV xrRNA 2,28,17,20,17,84,2,28,...,10,10,0,0,6,1,1,0,8,0


In [20]:
def calculate_relations(df):
    st2_hl2 = []
    st3_hl3 = []
    sI_sII = []
    sII_sIII = []
    sI_sIII = []
    names = []
    for i in df_merge.index:
        st1 = df.loc[i, "ST-I"]
        st2 = df.loc[i, "ST-II"]
        st3 = df.loc[i, "ST-III"]
        hl2 = df.loc[i, "HL-II"]
        hl3 = df.loc[i, "HL-III"]

        sI = df.loc[i, "I"]
        sII = df.loc[i, "II"]
        sIII = df.loc[i, "III"]
        
        try:
            st2_hl2.append(float(st2) * 2/float(hl2))
        except:
            st2_hl2.append(np.NaN)
        try:
            st3_hl3.append(float(st3) * 2/float(hl3))
        except:
            st3_hl3.append(np.NaN)

        try:
            sI_sII.append(float(sI)/float(sII))
        except:
            sI_sII.append(np.NaN)

        try:
            sII_sIII.append(float(sII)/float(sIII))
        except:
            sII_sIII.append(np.NaN)

        try:
            sI_sIII.append(float(sI)/float(sIII))
        except:
            sI_sIII.append(np.NaN)


        names.append(df.loc[i, "name_"])
    relations = {"name": names, "STII/HLII": st2_hl2, "STIII/HLIII": st3_hl3, 'SI/SII': sI_sII, 'SII/SIII': sII_sIII, 'SI/SIII': sI_sIII, }
    df_relations = pd.DataFrame(relations)
    df_relations = df_relations.replace(0, np.NaN)
    return df_relations



In [21]:
df_rel = calculate_relations(df_merge)
df_rel

Unnamed: 0,name,STII/HLII,STIII/HLIII,SI/SII,SII/SIII,SI/SIII
0,ALKV xrRNA 1,4.0,2.0,2.166667,0.6,1.3
1,DTV xrRNA 1,1.333333,1.454545,1.6,0.789474,1.263158
2,DTV xrRNA 2,4.0,2.0,2.222222,0.45,1.0
3,GGV xrRNA 1,1.714286,7.0,1.714286,0.777778,1.333333
4,GGV xrRNA 2,7.0,1.454545,1.388889,0.947368,1.315789
5,KFDV xrRNA 1,4.0,2.0,2.166667,0.6,1.3
6,KSIV xrRNA 1,5.333333,1.333333,2.727273,0.55,1.5
7,KSIV xrRNA 2,1.777778,2.0,1.823529,0.85,1.55
8,LGTV xrRNA 1,6.666667,2.0,2.153846,0.65,1.4
9,LGTV xrRNA 2,2.857143,2.0,1.647059,0.85,1.4


In [22]:
df_rel.describe()

Unnamed: 0,STII/HLII,STIII/HLIII,SI/SII,SII/SIII,SI/SIII
count,23.0,23.0,23.0,23.0,23.0
mean,3.273982,1.929225,2.01722,0.683457,1.305088
std,1.461432,1.149805,0.545799,0.164315,0.17375
min,1.333333,1.230769,1.0,0.45,0.705882
25%,2.857143,1.333333,1.594118,0.5,1.292857
50%,3.0,2.0,1.823529,0.705882,1.35
75%,4.0,2.0,2.572222,0.85,1.4
max,7.0,7.0,2.9,0.947368,1.55
