In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats 
from scipy.stats import ttest_ind
import statsmodels.stats.multitest as multi
import statistics

In [67]:
circRNA_HCC=pd.read_table('HCC_circRNAs.txt').set_index ('circID')
circRNA_Healthy=pd.read_table('Healthy_circRNAs.txt').set_index ('circID')
circRNA=pd.concat([circRNA_HCC,circRNA_Healthy],axis=1,ignore_index=True)

longRNA_HCC=pd.read_table('HCC_longRNAs.txt').set_index('Gene.symbol')
longRNA_Healthy=pd.read_table('Healthy_longRNAs.txt').set_index('Gene.symbol')
longRNA=pd.concat([longRNA_HCC,longRNA_Healthy],axis=1,ignore_index=True)

Data=pd.concat([circRNA, longRNA],axis=0)
c=pd.concat([circRNA_Healthy, longRNA_Healthy],axis=0)
h=pd.concat([circRNA_HCC, longRNA_HCC],axis=0)

In [76]:
Data=Data[(~Data.astype('bool')).mean(axis=1)<0.8]
con=c.loc[Data.index]
hcc=h.loc[Data.index]


In [6]:
Con=np.log2(con+1) 
HCC=np.log2(hcc+1) 
Con_mean=Con.mean(axis=1)
HCC_mean=HCC.mean(axis=1)

In [7]:
Log2Fc=HCC_mean-Con_mean
fc=2**Log2Fc
Fc=[]
for i in fc:
    if (i<1):
        Fc.append(-1/i)
    else:
        Fc.append(i)
Fc

[2.7512445439519206,
 1.2969174922492555,
 1.9903216997870574,
 2.022537151426123,
 2.430943356815129,
 1.5646377352973897,
 1.0166586943405134,
 1.81209752230878,
 3.307628930278089,
 -1.5556397969737372,
 2.224109768054199,
 2.4890273102017004,
 1.7513479565197063,
 1.611904729730809,
 1.747306895279301,
 2.201864742099258,
 1.227096098277177,
 1.974961700994865,
 2.097721858041872,
 1.1944246369294795,
 1.4752987652965996,
 1.7706178547171243,
 1.2244002244120826,
 1.3135190735084579,
 2.778548957448857,
 2.0598686952298744,
 1.1519671004536427,
 1.3156447217551954,
 1.0493820361577475,
 2.2129266475530605,
 1.3557253096655872,
 3.3378734914809622,
 -1.0060682159360967,
 2.5979342952236997,
 1.9313590451091758,
 3.0773492610458186,
 1.7046249161949043,
 1.5256631020057303,
 1.2738103720944227,
 2.679903021623936,
 2.227167918589643,
 2.1290064225627416,
 2.0725881931755366,
 -1.096190983274973,
 1.0182422795290416,
 -1.7614743257581833,
 2.1460618704663026,
 -1.2075329157998238,
 1.

In [8]:
Log2Fc.index

Index(['exo_circ_29569', 'exo_circ_29604', 'exo_circ_29616', 'exo_circ_29626',
       'exo_circ_29632', 'exo_circ_29695', 'exo_circ_29708', 'exo_circ_29721',
       'exo_circ_29747', 'exo_circ_29771',
       ...
       'AL645939.5', 'AL133444.1', 'AC009879.3', 'AC125232.2', 'AL162458.1',
       'AL512598.2', 'AC010285.3', 'AC083855.2', 'AP002762.1', 'AL135905.2'],
      dtype='object', length=19022)

In [45]:
result=[]
for i in Con.index:
    x1 = Con.loc[i]
    x2 = HCC.loc[i]
    ttest_result=scipy.stats.mannwhitneyu(x=x1, y=x2, alternative = 'two-sided')
    result.append(
        {
            'Gene': i,
            'T': ttest_result.statistic,
            'p value':  ttest_result.pvalue
        })


In [46]:
Result=pd.DataFrame(result)

p_adjusted=multi.fdrcorrection(Result['p value'], alpha=0.05, method='indep', is_sorted=False)
Result['Fc']=Fc
Result['p adjusted']=p_adjusted[1]
Result['abs fc']=abs(np.array(fc))
Result['Average Normalized Values in Healthy']=np.array(Con_mean)
Result['Average Normalized Values in HCC']=np.array(HCC_mean)
Result=Result.sort_values(by='abs fc', ascending=False).reset_index()
DGE=Result[(Result['Fc']>1.2) & (Result['p adjusted']<0.05) ]
#DGE.to_csv('DGE.csv')

In [52]:
Summary = Result[['Gene','Average Normalized Values in HCC','Average Normalized Values in Healthy','Fc','p value','p adjusted']]
#.to_excel('Exosome_DGE_Summary.xlsx')

In [None]:
goi=['PPBP','S100A11','S100A9','ACTB','FTL','MTRNR2L8','TMSB4X','exo_circ_22106','exo_circ_79050','GSE1','TXLNGY']
GOI=pd.DataFrame()
x=pd.DataFrame()
y=pd.DataFrame()
z=pd.DataFrame()
for i in goi:
    GOI=pd.concat([GOI, Result[Result['Gene'].str.startswith(i)]],ignore_index=False)
    x=pd.concat([x,Log2Fc[Log2Fc.index.str.startswith(i)]], axis=0, ignore_index=True)
    y=pd.concat([y,Con_mean[Con_mean.index.str.startswith(i)]],ignore_index=True)
    z=pd.concat([z,HCC_mean[HCC_mean.index.str.startswith(i)]],ignore_index=True)


In [None]:
fc=2**(np.array(x[0]))
Fc=[]
for i in fc:
    if (i<1):
        Fc.append(-1/i)
    else:
        Fc.append(i)
        
GOI['Fc']=Fc
GOI['Log2Fc']=np.array(x[0])

GOI['Average Normalized Values in HCC']=np.array(z[0])
GOI['Average Normalized Values in Healthy']=np.array(y[0])
GOI[['Gene','Average Normalized Values in HCC','Average Normalized Values in Healthy','Fc','pvalue','p_adjusted']]

In [81]:
Hcc=hcc.reset_index()
Hcc=Hcc.rename(columns = {'index':'Gene'})
control=con.reset_index()
control=control.rename(columns = {'index':'Gene'})

In [87]:

Summary=Summary.merge(Hcc,on='Gene').merge(control,on='Gene')
Summary.to_excel('Exosome_ALL.xlsx')

In [88]:
Summary

Unnamed: 0,Gene,Average Normalized Values in HCC,Average Normalized Values in Healthy,Fc,p value,p adjusted,HCC001,HCC002,HCC003,HCC004,...,Healthy109,Healthy110,Healthy111,Healthy112,Healthy113,Healthy114,Healthy115,Healthy116,Healthy117,Healthy118
0,exo_circ_79050,8.054189,4.667711,10.457584,2.051423e-08,4.022904e-06,2450.525,0.000,2627.288,1524.414,...,722.916,332.260,8.410,0.000,165.141,6.646,673.834,1414.950,603.913,1155.178
1,exo_circ_79066,6.982648,3.927270,8.313055,1.191299e-06,1.259586e-04,760.508,0.000,1014.677,1866.629,...,635.898,1386.822,0.000,0.000,353.061,0.000,1057.115,2047.294,991.827,155.889
2,exo_circ_71478,6.807846,3.963024,7.184169,6.492658e-06,4.525081e-04,16.900,502.086,199.311,388.881,...,575.655,411.713,0.000,0.000,0.000,405.399,278.188,0.000,4.408,0.000
3,exo_circ_11335,6.421432,3.596733,7.084664,1.947738e-06,1.852493e-04,152.102,0.000,398.623,139.997,...,0.000,0.000,0.000,0.000,324.588,0.000,0.000,0.000,476.077,247.824
4,exo_circ_68977,6.773099,3.982198,6.920616,5.585783e-06,4.128792e-04,0.000,103.880,144.954,295.550,...,0.000,0.000,454.140,1387.292,0.000,0.000,544.012,338.085,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19017,XIST,0.420218,1.402049,-1.974970,6.494307e-06,4.525081e-04,0.837,6.385,0.000,0.000,...,0.000,0.000,0.735,31.646,0.000,0.475,0.000,0.113,0.007,0.020
19018,exo_circ_30857,4.174062,5.169666,-1.993916,2.762581e-02,1.222942e-01,152.102,17.313,18.119,233.329,...,0.000,787.311,647.570,891.831,5.695,332.294,302.916,281.738,0.000,167.880
19019,exo_circ_17924,3.949179,5.096801,-2.215485,4.126304e-02,1.561379e-01,33.800,17.313,0.000,0.000,...,629.204,0.000,681.210,0.000,239.170,398.753,6.182,1014.256,908.073,323.769
19020,LINC02280,1.016726,2.258550,-2.364974,2.159133e-05,1.061267e-03,1.187,5.066,0.498,0.000,...,23.731,4.258,3.714,0.000,21.709,8.641,4.443,0.000,16.298,2.987
