# I] Import

In [1]:
import pandas, numpy, math, seaborn
import scipy, scipy.stats
import matplotlib, matplotlib.pyplot as plt
# this is a trick to make figures look nicer
matplotlib.rcParams.update({'font.size':20, 'font.family':'FreeSans', 'xtick.labelsize':20, 'ytick.labelsize':10, 'figure.figsize':(12, 8)})

In [2]:
# pip install matplotlib-venn

In [3]:
from matplotlib_venn import venn2

In [4]:
# Root folders for the input data and for the generated outputs.
# NOTE(review): both are absolute, machine-specific paths, and the input path
# contains a doubled "/" ("PhD ATG7//0 in_silico") - TODO confirm it is intended.
input_file_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7//0 in_silico/Python/1)data_input/'
output_file_directory = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 in_silico/Python/3)output/'

## functions & options


In [5]:
def split(L):
  """Return the text of ``L`` located before its first "."; ``L`` itself when it has no dot."""
  before_dot, _dot, _after_dot = L.partition(".")
  return before_dot

# II] Dataframe settings

## A) Isoform data

In [6]:
%%time
path = input_file_directory + "xenabrowser_brut_data/xena_surv_ATG7.tsv"
df_ori= pandas.read_csv(path, sep = "\t")
print(df_ori.shape)
df_ori.head()

(19131, 31)
Wall time: 123 ms


Unnamed: 0,sample,samples,_sample_type,_primary_site,ENSG00000197548.12,ENST00000451513.5,ENST00000435760.5,ENST00000451830.5,ENST00000460444.5,ENST00000470474.1,...,ENST00000434066.6,ENST00000478638.5,ENST00000461278.1,ENST00000460291.1,ENST00000467121.1,ENST00000414717.5,ENST00000427759.5,ENST00000446110.1,OS,OS.time
0,TARGET-20-PARUBT-40,TARGET-20-PARUBT-40,Recurrent Blood Derived Cancer - Peripheral Blood,White blood cell,4.401,-9.966,0.3685,2.656,-0.7108,-1.086,...,-9.966,-9.966,-0.6193,-5.012,-9.966,0.8568,-9.966,-9.966,,
1,TARGET-20-PATJHJ-40,TARGET-20-PATJHJ-40,Recurrent Blood Derived Cancer - Peripheral Blood,White blood cell,2.82,-9.966,-0.1345,-1.595,-2.932,-9.966,...,-9.966,-9.966,-9.966,-3.308,-9.966,-2.053,-2.727,-0.2328,,
2,TARGET-10-PASLZM-40,TARGET-10-PASLZM-40,Recurrent Blood Derived Cancer - Peripheral Blood,White blood cell,1.158,-9.966,-2.635,-1.994,-9.966,-9.966,...,-9.966,-9.966,-9.966,-3.626,-9.966,-1.086,-9.966,-9.966,,
3,TARGET-21-PATAIJ-42,TARGET-21-PATAIJ-42,Post treatment Blood Cancer - Blood,White blood cell,4.332,0.2642,1.085,-1.086,-9.966,-9.966,...,-9.966,-9.966,-9.966,-1.595,-9.966,0.3115,-0.6643,1.723,,
4,TARGET-21-PASVJS-41,TARGET-21-PASVJS-41,Post treatment Blood Cancer - Bone Marrow,White blood cell,5.156,-9.966,1.628,-1.732,-2.826,-0.6643,...,-0.1187,-9.966,-1.355,0.4125,-9.966,0.2522,-9.966,2.431,,


In [7]:
# Columns kept from the raw table (in this order) and the short name given to each.
xena_to_short = {
    'sample': 'sample',
    '_sample_type': 'Sample_Type',
    '_primary_site': 'Primary_Site',
    'OS': 'OS',
    'OS.time': 'OS_time',
    'ENSG00000197548.12': 'ATG7',
    'ENST00000354449.7': 'ATG7_1',
    'ENST00000354956.9': 'ATG7_2',
    'ENST00000446450.6': 'ATG7_3',
}

df = (
    df_ori[list(xena_to_short)]
    # Title-case the site names so "Adrenal gland" and "Adrenal Gland" become one category.
    .assign(_primary_site=lambda t: t['_primary_site'].str.title())
    .rename(columns=xena_to_short)
    # Undo the log2 transform: 2**value - 0.001.
    # The preview shows that the floor value (-9.966) maps to about -1.5e-07, not exactly 0.
    .assign(ATG7_total=lambda t: 2 ** t['ATG7'] - 0.001,
            ATG7_1tpm=lambda t: 2 ** t['ATG7_1'] - 0.001,
            ATG7_2tpm=lambda t: 2 ** t['ATG7_2'] - 0.001,
            ATG7_3tpm=lambda t: 2 ** t['ATG7_3'] - 0.001)
    # log2(value + 1) of the back-transformed values.
    .assign(log2_p1_ATG7_total=lambda t: numpy.log2(t['ATG7_total'] + 1),
            log2_p1_ATG7_1=lambda t: numpy.log2(t['ATG7_1tpm'] + 1),
            log2_p1_ATG7_2=lambda t: numpy.log2(t['ATG7_2tpm'] + 1))
    # Sum of the three isoforms, then the share (in %) of each one in that sum.
    .assign(ATG7_prot_tot=lambda t: t['ATG7_1tpm'] + t['ATG7_2tpm'] + t['ATG7_3tpm'])
    .assign(ATG7_1perc=lambda t: (t['ATG7_1tpm'] * 100) / t['ATG7_prot_tot'],
            ATG7_2perc=lambda t: (t['ATG7_2tpm'] * 100) / t['ATG7_prot_tot'],
            ATG7_3perc=lambda t: (t['ATG7_3tpm'] * 100) / t['ATG7_prot_tot'])
    # Isoforms 1 and 2 normalised on the gene-level value.
    .assign(ATG7_1norm=lambda t: t['ATG7_1tpm'] / t['ATG7_total'],
            ATG7_2norm=lambda t: t['ATG7_2tpm'] / t['ATG7_total'])
)

df.head()

Unnamed: 0,sample,Sample_Type,Primary_Site,OS,OS_time,ATG7,ATG7_1,ATG7_2,ATG7_3,ATG7_total,...,ATG7_3tpm,log2_p1_ATG7_total,log2_p1_ATG7_1,log2_p1_ATG7_2,ATG7_prot_tot,ATG7_1perc,ATG7_2perc,ATG7_3perc,ATG7_1norm,ATG7_2norm
0,TARGET-20-PARUBT-40,Recurrent Blood Derived Cancer - Peripheral Blood,White Blood Cell,,,4.401,1.39,1.522,-1.732,21.125765,...,0.3000343,4.467655,1.855905,1.952665,5.79071,45.241201,49.577495,5.181305,0.124009,0.135895
1,TARGET-20-PATJHJ-40,Recurrent Blood Derived Cancer - Peripheral Blood,White Blood Cell,,,2.82,1.118,-1.086,-9.966,7.060624,...,-1.495113e-07,3.010892,1.664236,0.555881,2.639524,82.19128,17.808726,-6e-06,0.307262,0.066576
2,TARGET-10-PASLZM-40,Recurrent Blood Derived Cancer - Peripheral Blood,White Blood Cell,,,1.158,0.0158,-3.171,-3.816,2.230479,...,0.07000183,1.691748,1.007204,0.150597,1.190042,84.871949,9.245753,5.882299,0.452823,0.049329
3,TARGET-21-PATAIJ-42,Post treatment Blood Cancer - Blood,White Blood Cell,,,4.332,1.345,1.975,-9.966,20.139115,...,-1.495113e-07,4.401843,1.823465,2.30167,6.469584,39.249848,60.750154,-2e-06,0.126088,0.195157
4,TARGET-21-PASVJS-41,Post treatment Blood Cancer - Bone Marrow,White Blood Cell,,,5.156,1.804,3.769,0.5069,35.653197,...,1.419994,5.195867,2.166995,3.871026,18.542569,18.826249,73.515731,7.658019,0.097912,0.382342


In [8]:
# Which sample types are present, and how many samples of each?
df['Sample_Type'].value_counts()

Primary Tumor                                        9185
Normal Tissue                                        7429
Solid Tissue Normal                                   738
Cell Line                                             433
Metastatic                                            393
Primary Solid Tumor                                   286
Primary Blood Derived Cancer - Peripheral Blood       239
Primary Blood Derived Cancer - Bone Marrow            237
Recurrent Blood Derived Cancer - Bone Marrow          104
Recurrent Tumor                                        45
Recurrent Solid Tumor                                  13
Post treatment Blood Cancer - Bone Marrow              12
Additional - New Primary                               11
Recurrent Blood Derived Cancer - Peripheral Blood       3
Post treatment Blood Cancer - Blood                     1
Control Analyte                                         1
Additional Metastatic                                   1
Name: Sample_T

In [9]:
# Which primary sites (tissues) are present, and how many samples of each?
df['Primary_Site'].value_counts()

Brain                         1846
Lung                          1410
Breast                        1391
Skin                          1282
Kidney                        1193
Esophagus                      848
Prostate                       648
Colon                          639
Stomach                        624
Blood Vessel                   606
White Blood Cell               595
Thyroid Gland                  571
Head And Neck Region           564
Liver                          531
Adipose Tissue                 515
Ovary                          515
Blood                          444
Bladder                        435
Muscle                         396
Heart                          377
Pancreas                       350
Testis                         319
Cervix                         309
Thyroid                        279
Nerve                          278
Soft Tissue,Bone               264
Adrenal Gland                  205
Endometrium                    204
Paraganglia         

In [10]:
# Creation of two dataframes for blood only.
#  - Normal tissue: Primary_Site is "Blood Vessel" or "Blood", cell lines excluded.
#  - Primary tumour: primary blood derived cancer from bone marrow or peripheral blood.
normal_sites = ['Blood Vessel', 'Blood']
primary_types = ['Primary Blood Derived Cancer - Bone Marrow',
                 'Primary Blood Derived Cancer - Peripheral Blood']

# `&` binds tighter than `|`, so the two tests are built separately: the
# cell-line exclusion then applies to both sites and not to "Blood" alone.
is_normal_site = df['Primary_Site'].isin(normal_sites)
is_not_cell_line = df['Sample_Type'] != 'Cell Line'

# isin() yields plain booleans even when a label is missing.
# .copy() makes both cohorts independent frames, safe to modify in later cells.
df_ATG7_Normal = df[is_normal_site & is_not_cell_line].copy()
df_ATG7_Primary = df[df['Sample_Type'].isin(primary_types)].copy()

print('normal_tissue' , len(df_ATG7_Normal))
print('primary_tumors' , len(df_ATG7_Primary))

normal_tissue 943
primary_tumors 476


In [11]:
# Preview the first row of the primary-tumour cohort.
df_ATG7_Primary.head(1)

Unnamed: 0,sample,Sample_Type,Primary_Site,OS,OS_time,ATG7,ATG7_1,ATG7_2,ATG7_3,ATG7_total,...,ATG7_3tpm,log2_p1_ATG7_total,log2_p1_ATG7_1,log2_p1_ATG7_2,ATG7_prot_tot,ATG7_1perc,ATG7_2perc,ATG7_3perc,ATG7_1norm,ATG7_2norm
16,TARGET-21-PASLZE-09,Primary Blood Derived Cancer - Bone Marrow,White Blood Cell,,,5.59,2.23,3.584,0.7916,48.166896,...,1.729993,5.619615,2.508515,3.69944,18.41133,25.475291,65.128358,9.396351,0.097377,0.248947


In [12]:
# For the tumour cohort the two label columns are exchanged so that they line up
# with the normal cohort: Sample_Type becomes the disease label and Primary_Site
# becomes the tissue the sample was taken from (Bone Marrow / Peripheral Blood).
tumour_prefix = 'Primary Blood Derived Cancer - '

# Only swap while Sample_Type still carries the "<disease> - <tissue>" labels;
# this keeps the cell idempotent (running it twice would otherwise swap back).
if df_ATG7_Primary['Sample_Type'].str.startswith(tumour_prefix).any():
  df_ATG7_Primary = df_ATG7_Primary.rename(columns = {'Sample_Type' : 'Primary_Site',
                                                      'Primary_Site' : 'Sample_Type'})
  # Replace texts to have a better table. regex=False: both patterns are literal
  # text. assign() builds a new frame instead of writing into the existing one.
  df_ATG7_Primary = df_ATG7_Primary.assign(
      Sample_Type=df_ATG7_Primary['Sample_Type'].str.replace('White Blood Cell', 'Primary Blood Derived Cancer', regex=False),
      Primary_Site=df_ATG7_Primary['Primary_Site'].str.replace(tumour_prefix, '', regex=False))

df_ATG7_Primary.head(2)

Unnamed: 0,sample,Primary_Site,Sample_Type,OS,OS_time,ATG7,ATG7_1,ATG7_2,ATG7_3,ATG7_total,...,ATG7_3tpm,log2_p1_ATG7_total,log2_p1_ATG7_1,log2_p1_ATG7_2,ATG7_prot_tot,ATG7_1perc,ATG7_2perc,ATG7_3perc,ATG7_1norm,ATG7_2norm
16,TARGET-21-PASLZE-09,Bone Marrow,Primary Blood Derived Cancer,,,5.59,2.23,3.584,0.7916,48.166896,...,1.729993,5.619615,2.508515,3.69944,18.41133,25.475291,65.128358,9.396351,0.097377,0.248947
17,TARGET-21-PANZLR-09,Bone Marrow,Primary Blood Derived Cancer,,,5.048,0.4125,3.216,0.6969,33.081584,...,1.620018,5.09092,1.220324,3.363321,12.241078,10.864977,75.900749,13.234275,0.040203,0.280853


In [13]:
# Stack the normal and primary cohorts into one frame numbered 0..n-1.
df_ATG7_NormPrim = pandas.concat([df_ATG7_Normal, df_ATG7_Primary], axis=0, ignore_index=True)

In [14]:
# Number of samples per sample type in the combined normal + primary frame.
df_ATG7_NormPrim['Sample_Type'].value_counts()

Normal Tissue                   943
Primary Blood Derived Cancer    476
Name: Sample_Type, dtype: int64

In [15]:
# Number of samples per primary site in the combined normal + primary frame.
df_ATG7_NormPrim['Primary_Site'].value_counts()

Blood Vessel        606
Blood               337
Peripheral Blood    239
Bone Marrow         237
Name: Primary_Site, dtype: int64

## B) Whole gene expression

In [13]:
%%time
# Load the protein-coding gene expression tables for normal tissue and primary tumours.
# NOTE(review): input_file_directory_gene_expr is assigned but not read in this cell - TODO confirm it is still needed.
input_file_directory_gene_expr = '/Users/kja11/OneDrive - Háskóli Íslands/PhD ATG7/0 Python Analysis/gene_expr_preparation/'
# output_file_directory already ends with "/", so this path contains a doubled "/".
path = output_file_directory + "/ATG7/Organ_specific_subsets/blood/"

# `directory` holds a full file path (not a folder); it is reused for both files.
directory = path + "ensembl_normal_blood_protcoding_expr.tsv"
normal_blood_protcoding= pandas.read_csv(directory, sep = "\t")
print("Blood Vessel, normal tissue shape:", normal_blood_protcoding.shape)

directory = path + "ensembl_primary_blood_protcoding_expr.tsv"
primary_blood_protcoding= pandas.read_csv(directory, sep = "\t")
print("White Blood Cell, Primary Blood Derived Cancer (Peripheral Blood) shape:", primary_blood_protcoding.shape)
primary_blood_protcoding.head(2)

Blood Vessel, normal tissue shape: (606, 19383)
White Blood Cell, Primary Blood Derived Cancer (Peripheral Blood) shape: (239, 19383)
Wall time: 11.9 s


Unnamed: 0,sample,ENSG00000198888,ENSG00000198763,ENSG00000198804,ENSG00000198712,ENSG00000228253,ENSG00000198899,ENSG00000198938,ENSG00000198840,ENSG00000212907,...,ENSG00000160678,ENSG00000160679,ENSG00000143553,ENSG00000214193,ENSG00000196182,ENSG00000181817,ENSG00000116885,ENSG00000116898,ENSG00000119535,ENSG00000142694
0,TARGET-10-PAKSWW-03,13.8296,13.9389,13.9181,14.1603,15.0383,14.14,14.2213,12.7133,13.481,...,7.0239,5.2495,4.3378,-1.8314,4.4938,3.136,0.6517,5.5072,4.8465,2.128
1,TARGET-10-PANCVR-03,14.1665,13.6526,14.7652,13.7944,15.4894,14.1342,15.422,12.7528,13.364,...,5.4631,4.8885,3.1112,-2.2447,3.2328,4.5015,0.1519,5.2277,-2.114,0.9115


In [88]:
# Report the shape of each expression table and how many column names occur more than once.
duplicate_checks = [('normal shape:', normal_blood_protcoding),
                    ('primary shape is:', primary_blood_protcoding)]

for position, (shape_label, expr_table) in enumerate(duplicate_checks):
  if position > 0:
    print()  # blank line between the two reports
  print(shape_label, expr_table.shape)
  print('number of duplicated genes:', expr_table.columns.duplicated().sum())

normal shape: (606, 19383)
number of duplicated genes: 0

primary shape is: (239, 19383)
number of duplicated genes: 0


In [None]:
# TODO: finish this section (original note: "FINIR CA").

In [89]:
# Preparation of the global dataframes: ATG7 columns + whole gene expression, per cohort.
expression_tables = [['normal', normal_blood_protcoding],
                     ['primary', primary_blood_protcoding]]

df_final = []

# The loop variables are deliberately not called `df`, so the isoform frame `df`
# built earlier in the notebook is not overwritten by this cell.
for cohort_name, expr_table in expression_tables:
  # Inner join on the sample identifier: only samples present in both tables are kept.
  df_merged = pandas.merge(df_ATG7_NormPrim, expr_table, on= 'sample')

  # An inner join drops unmatched samples silently, so make an empty result visible.
  if df_merged.empty:
    print('WARNING: no sample of the', cohort_name,
          'expression table matches df_ATG7_NormPrim on "sample"')

  # Remove, by position, the 7th to 20th columns of the merged frame. With the
  # column order shown for df_ATG7_NormPrim this leaves sample, Sample_Type,
  # Primary_Site, OS, OS_time, ATG7, ATG7_1norm, ATG7_2norm and the genes.
  # NOTE(review): 'ATG7_1' and 'ATG7_2' are among the removed columns although
  # later cells index them on these frames - TODO confirm the intended selection.
  df_merged = df_merged.drop(df_merged.columns[6:20], axis=1)

  #save in list
  df_final.append(df_merged)

normal_blood_protcoding_atg7_expr = df_final[0]
primary_blood_protcoding_atg7_expr = df_final[1]

print(normal_blood_protcoding_atg7_expr.shape)
print(primary_blood_protcoding_atg7_expr.shape)
normal_blood_protcoding_atg7_expr.head()

(0, 19390)
(239, 19390)


Unnamed: 0,sample,Sample_Type,Primary_Site,OS,OS_time,ATG7,ATG7_1norm,ATG7_2norm,ENSG00000198888,ENSG00000198763,...,ENSG00000160678,ENSG00000160679,ENSG00000143553,ENSG00000214193,ENSG00000196182,ENSG00000181817,ENSG00000116885,ENSG00000116898,ENSG00000119535,ENSG00000142694


In [None]:
# Keep the first three columns of each merged frame; they serve as the sample
# labels used to pick the matching rows of the complete ATG7 dataframe.
normal_samples_labels = normal_blood_protcoding_atg7_expr.iloc[:,0:3]
primary_samples_labels = primary_blood_protcoding_atg7_expr.iloc[:,0:3]
normal_samples_labels

In [None]:
# Subset: attach the ATG7 columns to the sample labels selected above.
# NOTE(review): df_ATG7 is not defined in the cells shown before this one - TODO confirm which frame is meant.
# merge() is called without `on`, so pandas joins on every column name shared by the two frames.
df_ATG7_Normal_blood = normal_samples_labels.merge(df_ATG7)
df_ATG7_Primary_blood = primary_samples_labels.merge(df_ATG7)
# This bare expression is not the last statement of the cell, so it displays nothing.
df_ATG7_Primary_blood
print(df_ATG7_Normal_blood.shape)
print(df_ATG7_Primary_blood.shape)
df_ATG7_Primary_blood.head(2)

In [None]:
# Stack the normal and primary blood subsets into one frame numbered 0..n-1.
df_Norm_Prim = pandas.concat([df_ATG7_Normal_blood, df_ATG7_Primary_blood], axis=0, ignore_index=True)

In [None]:
# Split each cohort into "Low" and "High" groups for isoform 1 and for isoform 2,
# using the cohort median of the isoform as the cut-off.
# Samples whose value equals the median belong to neither group (strict < and >).

Data = [[df_ATG7_Normal_blood, 'Normal_blood'], [df_ATG7_Primary_blood, 'Primary_blood']]
isoforms = [["ATG7_1", "iso1"], ["ATG7_2", "iso2"]]

df_LoHi = [] 

for cohort, _cohort_name in Data:
  groups = []

  for column, short_name in isoforms:
    xMed = cohort[column].median()

    # assign() returns a new frame, so the label is never written into a slice of `cohort`.
    low = cohort.loc[cohort[column] < xMed].assign(**{'L/H': "Low_" + short_name})
    high = cohort.loc[cohort[column] > xMed].assign(**{'L/H': "High_" + short_name})

    # Collect the groups of every isoform; previously only those of the last
    # isoform reached the result because the merge ran after the inner loop.
    groups.extend([low, high])

  # One frame per cohort; a sample appears once per isoform it was grouped for.
  concat = pandas.concat(groups, axis=0)
  # Move the group label to the 4th column.
  concat.insert(3, 'L/H', concat.pop('L/H'))
  df_LoHi.append(concat)

normal_LoHi = df_LoHi[0]
primary_LoHi = df_LoHi[1]

# III] ATG7 expression

## A) Distribution


In [None]:
# Density plots of total ATG7, isoform 1 and isoform 2: normal blood versus primary blood.
all_dfs = [[df_ATG7_Normal_blood, 'in blood', df_ATG7_Primary_blood, 'in primary blood (periph)']]

isoforms = [['log2_+1_ATG7(total)', 'ATG7'], ['log2_+1_ATG7(1)', 'ATG7(1)'], ['log2_+1_ATG7(2)', 'ATG7(2)']]


for normal_df, title_suffix, primary_df, _primary_label in all_dfs:
  for column, display_name in isoforms:
    # One filled density curve per cohort, drawn on the same axes.
    curves = ((normal_df[column], 'mediumaquamarine'),
              (primary_df[column], 'sandybrown'))
    for values, colour in curves:
      seaborn.kdeplot(values, color = colour, fill = True)

    plt.xlabel(f'log2 {display_name} +1')
    plt.title(f'{display_name} expression {title_suffix}')
    plt.legend(labels=["Normal Blood","Primary Blood (Periph) "])
    plt.show()
    plt.clf()

## B) Percentage of samples that do not express ATG7 isoforms

In [None]:
# Percentage of samples in each cohort whose value is below 0.1 for each ATG7 transcript.
# NOTE(review): 0.1 is presumably a TPM threshold - confirm the unit of these columns.
all_dfs = [[df_ATG7_Normal_blood, 'all Normal Blood'], [df_ATG7_Primary_blood, 'all primary tumor']]

isoforms = [['ATG7(total)', 'ATG7'], ['ATG7(1)', 'ATG7(1)'], ['ATG7(2)', 'ATG7(2)'], ['ATG7(3)', 'ATG7(3)']]

for cohort, cohort_label in all_dfs:
  print(cohort_label)
  for column, display_name in isoforms:
    values = cohort[column]

    # len() gives the number of samples as an integer. The .shape tuple used
    # before turned the division into an array operation, and the name `all`
    # hid the built-in function for the rest of the notebook.
    n_samples = len(values)
    no_exp = (values < 0.1).sum()
    perc_no_exp = float(no_exp * 100 / n_samples) if n_samples else float('nan')

    # "samples" rather than "tumor": the first cohort is normal tissue.
    print(round(perc_no_exp,1),'% of samples do not express', display_name)
  print()


## C) Expression figures

### 1) Value TPM

In [None]:
# Distribution: Data preparation + graph
data = [[df_Norm_Prim, 'all tissue']]

for frame, tissue_label in data:
   
  shape = len(frame)
  print(shape)

  # Columns to plot, selected by position, and the x-axis label given to each.
  # NOTE(review): positions 7-9 are assumed to hold total ATG7, ATG7(1) and
  # ATG7(2) - confirm against the column order of df_Norm_Prim.
  value_positions = [7, 8, 9]
  series_labels = ['ATG7', 'ATG7(1)', 'ATG7(2)']

  # Long format built in one go (no element-wise loop, no chained assignment,
  # which does nothing when pandas Copy-on-Write is active):
  #  y           - the selected columns stacked one after the other
  #  x           - label of the column each value came from
  #  Sample Type - second column of the frame, repeated for each block (used for hue)
  data_distrib = pandas.DataFrame({
      'y': pandas.concat([frame.iloc[:, position] for position in value_positions], ignore_index=True),
      'x': numpy.repeat(series_labels, shape),
      'Sample Type': pandas.concat([frame.iloc[:, 1]] * len(series_labels), ignore_index=True),
  })

  #graph
  seaborn.catplot(x="x", y="y", hue="Sample Type", aspect= 1.2, kind="box", palette = 'Set2', data = data_distrib, 
                  boxprops={'lw':2}, medianprops={'lw':2}, whiskerprops={'lw':2}, showcaps=True, showfliers=False)
  
  plt.ylabel('Expression of ATG7 (TPM)')
  plt.xlabel('')
  plt.title('ATG7 expression in '+ tissue_label, x=.55)

In [None]:
# Distribution (isoforms only): Data preparation + graph
data = [[df_Norm_Prim, 'all tissue']]

for frame, tissue_label in data:
   
  shape = len(frame)
  print(shape)

  # Columns to plot, selected by position, and the x-axis label given to each.
  # NOTE(review): positions 8-9 are assumed to hold ATG7(1) and ATG7(2) -
  # confirm against the column order of df_Norm_Prim.
  value_positions = [8, 9]
  series_labels = ['ATG7(1)', 'ATG7(2)']

  # Long format built in one go (no element-wise loop, no chained assignment,
  # which does nothing when pandas Copy-on-Write is active):
  #  y           - the selected columns stacked one after the other
  #  x           - label of the column each value came from
  #  Sample Type - second column of the frame, repeated for each block (used for hue)
  data_distrib = pandas.DataFrame({
      'y': pandas.concat([frame.iloc[:, position] for position in value_positions], ignore_index=True),
      'x': numpy.repeat(series_labels, shape),
      'Sample Type': pandas.concat([frame.iloc[:, 1]] * len(series_labels), ignore_index=True),
  })

  #graph
  seaborn.catplot(x="x", y="y", hue="Sample Type", aspect= 1.2, kind="box", palette = 'Set2', data = data_distrib, 
                  boxprops={'lw':2}, medianprops={'lw':2}, whiskerprops={'lw':2}, showcaps=True, showfliers=False)
  
  plt.ylabel('Expression of ATG7 (TPM)')
  plt.xlabel('')
  plt.title('ATG7 expression in '+ tissue_label, x=.55)

In [None]:
# Distribution: medians and Mann-Whitney test, normal versus primary, for each transcript.

all_dfs = [[df_ATG7_Normal_blood, 'all Normal Blood', df_ATG7_Primary_blood, 'in primary blood (periph)']]

isoforms = ['ATG7', 'ATG7(1)', 'ATG7(2)']

for normal_df, normal_label, primary_df, primary_label in all_dfs:
  for isoform in isoforms:
    normal_values = normal_df[isoform]
    primary_values = primary_df[isoform]

    # Medians shown next to each cohort name, 3 decimals.
    n1 = f"{numpy.median(normal_values):.3f}"
    n2 = f"{numpy.median(primary_values):.3f}"

    # Mann-Whitney analysis
    statistic, pvalue = scipy.stats.mannwhitneyu(normal_values, primary_values)
    Pvalue = f"{pvalue:.3E}"
    print(f"the pvalue between {normal_label}({n1}) and {primary_label}({n2}) for {isoform} is {Pvalue}")
  print()


In [None]:
# Mean (not median) of each column in the raw combined table, used as the
# reference to compare with the reshaped plotting table.
print(df_Norm_Prim['ATG7(total)'].mean())
print(df_Norm_Prim['ATG7(1)'].mean())
print(df_Norm_Prim['ATG7(2)'].mean())

In [None]:
# Mean of the reshaped (long-format) table per x label, to check that the right
# columns were selected when building it.
# NOTE(review): data_distrib is the table built by the most recently executed
# plotting cell; the cell just above labels only 'ATG7(1)' and 'ATG7(2)', so the
# 'ATG7' subset is empty when the notebook is run top to bottom.
data_distribtest = data_distrib[data_distrib['x'] == 'ATG7']
data_distribtest1 = data_distrib[data_distrib['x'] == 'ATG7(1)']
data_distribtest2 = data_distrib[data_distrib['x'] == 'ATG7(2)']
# numeric_only=True: the table also holds the text columns 'x' and 'Sample Type',
# which cannot be averaged.
print(data_distribtest.mean(numeric_only=True))
print(data_distribtest1.mean(numeric_only=True))
print(data_distribtest2.mean(numeric_only=True))

### 2) Normalized

In [None]:
# NORMALIZED Distribution: Data preparation + graph

data = [[df_Norm_Prim, 'all_tissue']]

for frame, tissue_label in data:
   
  shape = len(frame)
  print(shape)

  # Columns to plot, selected by position, and the x-axis label given to each.
  # NOTE(review): positions 18-19 are assumed to hold the isoform/total ratios -
  # confirm against the column order of df_Norm_Prim.
  value_positions = [18, 19]
  series_labels = ['ATG7(1)/ATG7', 'ATG7(2)/ATG7']

  # Long format built in one go (no element-wise loop, no chained assignment,
  # which does nothing when pandas Copy-on-Write is active):
  #  y           - the selected columns stacked one after the other
  #  x           - label of the column each value came from
  #  Sample Type - second column of the frame, repeated for each block (used for hue)
  data_distrib = pandas.DataFrame({
      'y': pandas.concat([frame.iloc[:, position] for position in value_positions], ignore_index=True),
      'x': numpy.repeat(series_labels, shape),
      'Sample Type': pandas.concat([frame.iloc[:, 1]] * len(series_labels), ignore_index=True),
  })

  #graph
  seaborn.catplot(x="x", y="y", hue="Sample Type", aspect= 1.2, kind="box", palette = 'Set2', data = data_distrib, 
                  boxprops={'lw':2}, medianprops={'lw':2}, whiskerprops={'lw':2}, showcaps=True, showfliers=False)
  
  plt.ylabel('ATG7 isoforms/total ATG7 in TPM')
  plt.xlabel('')
  plt.title('ATG7 proportion in '+ tissue_label, x=0.55)


In [None]:
# NORMALIZED distribution: medians and Mann-Whitney test, normal versus primary.

all_dfs = [[df_ATG7_Normal_blood, 'all Normal Blood', df_ATG7_Primary_blood, 'in primary blood (periph)']]

isoforms = ['ATG7(1)norm', 'ATG7(2)norm']
print('NORMALIZED')

for normal_df, normal_label, primary_df, primary_label in all_dfs:
  for isoform in isoforms:
    normal_values = normal_df[isoform]
    primary_values = primary_df[isoform]

    # Medians shown next to each cohort name, 3 decimals.
    n1 = f"{numpy.median(normal_values):.3f}"
    n2 = f"{numpy.median(primary_values):.3f}"

    # Mann-Whitney analysis
    statistic, pvalue = scipy.stats.mannwhitneyu(normal_values, primary_values)
    Pvalue = f"{pvalue:.3E}"
    print(f"the pvalue between {normal_label}({n1}) and {primary_label}({n2}) for {isoform} is {Pvalue}")
  print()


# IV] Expression correlation 

## 1) between the two isoforms

In [None]:
# Isoform correlation in blood.
# Primary samples with ATG7(1) >= 30 are left out: these few very high values
# stretch the axis and make the graph hard to read.
below_cutoff = df_ATG7_Primary_blood['ATG7(1)'] < 30
df_ATG7_Primary_blood2 = df_ATG7_Primary_blood[below_cutoff]
print("shape:",df_ATG7_Primary_blood.shape)
print("shape after removing extrem dots:",df_ATG7_Primary_blood2.shape)


tissues = [[df_ATG7_Normal_blood, "Normal Blood"], [df_ATG7_Primary_blood2, "primary tumors"]]

for frame, tissue_name in tissues:
  x = frame['ATG7(1)'].to_list()
  y = frame['ATG7(2)'].to_list()

  # Rank correlation between the two isoforms, then a binned regression plot.
  r_value, p_value = scipy.stats.spearmanr(x, y)
  print("The correlation between iso1 and iso2 in",tissue_name,"is:", r_value, p_value)

  seaborn.regplot(x=x,y=y, x_bins=800, x_ci = 0, marker="+")
  plt.title("Correlation between the two isoforms in " + tissue_name)
  plt.xlabel("ATG7(1) expression in tpm")
  plt.ylabel('ATG7(2) expression tpm')
  plt.show()
  plt.clf()
  print()


## 2) between all protein coding

In [None]:
# Look up the row of the gene annotation table for a given HGNC symbol (here MFAP3).
# NOTE(review): protein_coding_list is not defined in the cells shown before this one - TODO confirm where it is loaded.
protein_coding_list.loc[protein_coding_list['hgnc_symbol'] == 'MFAP3']

In [None]:
# Spearman correlation of a few genes of interest with ATG7 and its two isoforms.
all_tissue_used = [[normal_blood_protcoding_atg7_expr,'Normal Blood'], [primary_blood_protcoding_atg7_expr,'Primary Blood (Periph)']]

HK2 = 'ENSG00000159399'
YAP1 = 'ENSG00000137693'
AJUBA = 'ENSG00000129474'
YBX1 = 'ENSG00000065978'
TGFB1 = 'ENSG00000105329'
ATG7_1 = 'ATG7_1'
ATG7_2 = 'ATG7_2'
MTDH = 'ENSG00000147649'
MAPK1 = 'ENSG00000100030'
KPNA1 = 'ENSG00000114030'
TNPO1 = 'ENSG00000083312'

for frame, tissue_name in all_tissue_used:
  # [column in the frame, name printed in the report]
  genes_x = [[KPNA1,'KPNA1'], [MAPK1, 'MAPK1'], [TNPO1,'TNPO1'], [MTDH,'MTDH'], 
             [HK2, "HK2"], [YAP1, 'YAP1'], [AJUBA,'AJUBA'], [YBX1,'YBX1'], 
             [TGFB1,'TGFB1'], [ATG7_1,'ATG7_1'], [ATG7_2,'ATG7_2']]

  print(tissue_name)

  for column, symbol in genes_x:
    x = frame[column].to_list()
    y = frame['ATG7'].to_list()
    y1 = frame['ATG7_1'].to_list()
    y2 = frame['ATG7_2'].to_list()

    # Same report line for the gene-level value and for each isoform.
    for target_name, target_values in (('ATG7', y), ('ATG7_1', y1), ('ATG7_2', y2)):
      r_value, p_value = scipy.stats.spearmanr(x, target_values)
      print(f'  correlation between {target_name} and', symbol,  'is:', 'r_value = {:.2f} for a p_value of {:.2e}'.format(r_value, p_value))
    print()

In [None]:
# Preview the merged normal-blood frame (ATG7 columns + gene expression).
normal_blood_protcoding_atg7_expr.head()

In [None]:
%%time
# Spearman Correlation, all genes WITHOUT cutoff
all_tissue_used = [[normal_blood_protcoding_atg7_expr,'Normal_blood'], [primary_blood_protcoding_atg7_expr,'Primary_Blood_Periph']]

isoforms = ['ATG7_1', 'ATG7_2']

all_positiv_corr_spearman = []
all_negativ_corr_spearman = []

for tissue in all_tissue_used:
  print(tissue[1], tissue[0].shape)

  #create a new row with max() for each gene and transpose
  tissue[0].loc['max()'] = tissue[0].max()
  tissue[0] = tissue[0].T

  #select all row where max() > 2.3 (= 5 tpm)
  nb_before = tissue[0]['max()'].count()
  tissue[0] = tissue[0].drop(tissue[0].index[0:3])
  tissue[0] = tissue[0][tissue[0]['max()'] > 2.3]
  nb_after = tissue[0]['max()'].count()
  print("number of genes removed:", nb_before-nb_after)
  print('the minimum of TPM is', tissue[0]['max()'].min())
  
  #delete the column max() and tranpose back
  tissue[0] =  tissue[0].drop('max()', axis = 1) 
  tissue[0] = tissue[0].T
  print('nombre genes after removing < 4tpm', tissue[0].shape)

  for isoform in isoforms:

    genepos = []
    geneneg = []
    rvaluepos = []
    rvalueneg = []
    pvaluepos = []
    pvalueneg = []

    print(isoform)
    x = tissue[0][isoform].to_list()
    genes = tissue[0].columns[3:]

    for gene in genes:
      y = tissue[0][gene].to_list()
      r_value, p_value = scipy.stats.spearmanr(x, y)

      if r_value > 0:
        genepos.append(gene)
        rvaluepos.append(r_value)
        pvaluepos.append(p_value)
        
      elif r_value < 0:
        geneneg.append(gene)
        rvalueneg.append(r_value)
        pvalueneg.append(p_value)

    #create a dictionnary
    d = dict(ensembl_gene_id= genepos, r_value=rvaluepos,  p_value=pvaluepos)
    d2 = dict(ensembl_gene_id= geneneg, r_value=rvalueneg,  p_value=pvalueneg)

    #Convert to dataframe
    df_positiv_corr = pandas.DataFrame.from_dict(d, orient='index')
    df_negativ_corr = pandas.DataFrame.from_dict(data=d2, orient='index')

    #Transpose to have a better table
    df_positiv_corr = df_positiv_corr.transpose()
    df_negativ_corr = df_negativ_corr.transpose()

    #name
    positiv_corr = 'pos_spearman_corr_'+tissue[1]+'_'+isoform
    negativ_corr = 'neg_spearman_corr_'+tissue[1]+'_'+isoform

    print(positiv_corr, df_positiv_corr.shape)
    print(negativ_corr, df_negativ_corr.shape)

    #scending orde, save to excel
    df_positiv_corr['info'] = tissue[1]+'_'+isoform
    df_positiv_corr = protein_coding_list.merge(df_positiv_corr)
    df_positiv_corr = df_positiv_corr.drop(['entrezgene_id', 'transcript_biotype'], axis = 1)
    df_positiv_corr = df_positiv_corr.sort_values(['r_value'], ascending = False)
    df_positiv_corr.to_excel(positiv_corr+'.xlsx', index=False)

    df_negativ_corr['info'] = tissue[1]+'_'+isoform
    df_negativ_corr = protein_coding_list.merge(df_negativ_corr)
    df_negativ_corr = df_negativ_corr.drop(['entrezgene_id', 'transcript_biotype'], axis = 1)
    df_negativ_corr = df_negativ_corr.sort_values(['r_value'], ascending = False)
    df_negativ_corr.to_excel(negativ_corr+'.xlsx', index=False)

    all_positiv_corr_spearman.append(df_positiv_corr)
    all_negativ_corr_spearman.append(df_negativ_corr)
  print()

In [None]:
%%time
#Graphic representation, two graphs in one, remove the extremities to have better graph

all_tissue_used = [[normal_blood_protcoding_atg7_expr,'Normal Blood', 'blue'], 
                   [primary_blood_protcoding_atg7_expr,'Primary Blood (Periph)', 'chocolate']]

HK2 = 'ENSG00000159399'
YAP1 = 'ENSG00000137693'
AJUBA = 'ENSG00000129474'
YBX1 = 'ENSG00000065978'
TGFB1 = 'ENSG00000105329'
MAPK1 = 'ENSG00000100030'
MTDH = 'ENSG00000147649'
KPNA1 = 'ENSG00000114030'
TNPO1 = 'ENSG00000083312'
ATG7 = 'ATG7'
ATG7_1 = 'ATG7_1'
 
genes = [[ATG7,'ATG7'], [ATG7_1,'ATG7_1'], [MAPK1,'MAPK1'], [MTDH,'MTDH'], [KPNA1,'KPNA1'], 
         [TNPO1,'TNPO1'], [HK2,'HK2'], [YAP1,'YAP1'], [AJUBA,'AJUBA'], [YBX1,'YBX1'], [TGFB1,'TGFB1']]

for gene in genes:
    for df in all_tissue_used:
        #sort by value, remove the extremities
        print(df[1], ':', df[0].shape)
        for_genes = df[0].sort_values([gene[0]], ascending = False) 
        lenght = len(for_genes)
        without_05perc = int(lenght*0.005)
        without_head = without_05perc
        without_tail = lenght - without_05perc
        print('', without_05perc, 'are removed from head and tail')
                
        df_no_extrem = for_genes.iloc[without_head:without_tail,:]
        print(' without extremities:', df_no_extrem.shape)
        print()
        
        print(df[1])
        r_value, p_value = scipy.stats.spearmanr(x, y1)
        print('  correlation between ATG7_1 and', gene[1],  'is:', 'r_value = {:.2f} for a p_value of {:.2e}'.format(r_value, p_value))
        r_value, p_value = scipy.stats.spearmanr(x, y2)
        print('  correlation between ATG7_2 and', gene[1],  'is:', 'r_value = {:.2f} for a p_value of {:.2e}'.format(r_value, p_value))
        
        #graph
        fig, ax = plt.subplots(1,2, figsize=(18,8))
        name = gene[1]+ " correlation with the two isoforms"
        name2 = gene[1]+ " expression (log2 tpm)"

        fig.suptitle(name, fontsize="x-large")
 
        x = df_no_extrem[gene[0]].to_list()
        y1 = df_no_extrem['ATG7_1'].to_list()
        y2 = df_no_extrem['ATG7_2'].to_list()

        fig = seaborn.regplot(x=x,y=y1, ax=ax[0], x_bins=500, x_ci = 0, marker="+", label = df[1], color = df[2])
        fig.set_title("ATG7(1)")
        fig.set_xlabel(name2)
        fig.set_ylabel('ATG7(1) expression (log2 tpm)')

        fig = seaborn.regplot(x=x,y=y2, ax=ax[1], x_bins=500, x_ci = 0, marker="+", label = df[1], color = df[2])
        fig.set_title("ATG7(2)")
        fig.set_xlabel(name2)
        fig.set_ylabel('ATG7(2) expression (log2 tpm)')
        plt.legend()
        plt.show()
        plt.clf()
    print()

# V] Distribution of all rvalue to select cutoff

In [None]:
%%time
#Merge Negative and Positive data
data =  [
         [all_negativ_corr_spearman[0], all_positiv_corr_spearman[0], 'Normal Blood, ATG7(1)'], 
         [all_negativ_corr_spearman[1], all_positiv_corr_spearman[1], 'Normal Blood, ATG7(2)'],
         [all_negativ_corr_spearman[2], all_positiv_corr_spearman[2], 'Primary Blood (Periph), ATG7(1)'], 
         [all_negativ_corr_spearman[3], all_positiv_corr_spearman[3], 'Primary Blood (Periph), ATG7(2)']
         ]

all_corr_negpos_merged = []
for dfs in data:
  print(dfs[2])
  print('shape positiv', dfs[0].shape)
  print('shape negativ', dfs[1].shape)

  df_merged = pandas.concat([dfs[0], dfs[1]], axis=0)
  df_merged = df_merged.reset_index(drop = True)
  print('shape after merge', df_merged.shape)
  all_corr_negpos_merged.append(df_merged)
  print()

#Merge ATG7(1) and ATG7(2)
data = [
        [all_corr_negpos_merged[0], all_corr_negpos_merged[1], 'Normal Blood'],
        [all_corr_negpos_merged[2], all_corr_negpos_merged[3], 'Primary Blood (Periph)']
        ]

all_corr_isoforms_merged = []
for dfs in data:
  print('shape normal', dfs[0].shape)
  print('shape primary', dfs[1].shape)

  df_merged = pandas.concat([dfs[0], dfs[1]], axis=0)
  df_merged = df_merged.reset_index(drop = True)
  print('shape after merge', df_merged.shape)
  all_corr_isoforms_merged.append(df_merged)
  print()

#Distribution of correlation
# Each entry is [merged ATG7(1)+ATG7(2) table, tissue label used in the title].
data = [
        [all_corr_isoforms_merged[0], 'Normal Blood'],
        [all_corr_isoforms_merged[1], 'Primary Blood (Periph)']
        ]

for dfs in data:
  # One r_value distribution per tissue, coloured by the 'info' column.
  seaborn.displot(dfs[0], x="r_value", hue='info', aspect = 2.6)
  label_title = 'Distribution of correlation in ' + dfs[1]
  plt.title(label_title, fontsize = 22)
  plt.show()
  # Close the figure instead of only clearing it: plt.clf() keeps a (now empty)
  # figure open, whereas plt.close() releases it.
  plt.close()
  print()

In [None]:
# Next step: keep only the 1% at each extremity of the r_value ranking, i.e. the most interesting genes.
# The merged tables are ordered as listed in the printed legend.
print(len(all_corr_negpos_merged), '= norm iso1, norm iso2, prim iso1, prim iso2')

In [None]:
# For every merged table keep only both ends of the r_value ranking
# (first 1% and last 1% of the rows) and save that selection to an Excel file.
all_corr_negpos_extremities = []

for merged in all_corr_negpos_merged:
  # rank from the highest to the lowest r_value, with row labels 0..n-1
  ranked = merged.sort_values(['r_value'], ascending = False).reset_index(drop = True)

  n_rows = len(ranked)
  n_extreme = int(n_rows*0.01)  # 1% of the rows, truncated to an integer

  highest = ranked.iloc[:n_extreme, :]
  lowest = ranked.iloc[n_rows - n_extreme:, :]
  both_ends = pandas.concat([highest, lowest], axis=0)

  # the file name is derived from the 'info' value found at row label 1
  file_stem = 'corr_1%extremities'+ '_' + ranked['info'][1]
  both_ends.to_excel(file_stem+'.xlsx', index=False)

  all_corr_negpos_extremities.append(both_ends)

# VI] Venn diagrams

In [None]:
# Sanity check: number of extremity tables collected (one is appended per merged table).
len(all_corr_negpos_extremities)

In [None]:
# Display the first extremity table (index 0) to inspect its content.
all_corr_negpos_extremities[0]

In [None]:
# Split every extremity table by the sign of r_value.
# Rows whose r_value is exactly 0 go to neither list.
all_corr_positiv_extremities = [
    table[table['r_value']>0] for table in all_corr_negpos_extremities
]
all_corr_negativ_extremities = [
    table[table['r_value']<0] for table in all_corr_negpos_extremities
]
len(all_corr_positiv_extremities)

In [None]:
# Number of negatively correlated rows kept in the second extremity table (index 1).
len(all_corr_negativ_extremities[1])

In [None]:
#VENN DIAGRAMS FOR the 1% of the genes
# Each entry is [ATG7(1) table, ATG7(2) table, tissue label, correlation sign].
data_pos =  [
         [all_corr_positiv_extremities[0], all_corr_positiv_extremities[1], 'Normal_Blood', 'pos'],
         [all_corr_positiv_extremities[2], all_corr_positiv_extremities[3], 'Primary_Blood_(Periph)', 'pos']
         ]

data_neg =  [
         [all_corr_negativ_extremities[0], all_corr_negativ_extremities[1], 'Normal_Blood', 'neg'],
         [all_corr_negativ_extremities[2], all_corr_negativ_extremities[3], 'Primary_Blood_(Periph)', 'neg']
         ]

data = data_pos + data_neg

# Gene sets per comparison, in the order of `data`
# (normal pos, primary pos, normal neg, primary neg).
# They are collected here because the cells right after this one read
# `list_only1` and `commun_both`.
list_only1 = []
commun_both = []

for dfs in data:
    set1 = set(dfs[0]['hgnc_symbol'])
    set2 = set(dfs[1]['hgnc_symbol'])
    subset = (set1, set2)

    # genes found for one isoform only, and genes shared by both
    only1 = set1 - set2
    only2 = set2 - set1
    intersection = set1 & set2
    list_only1.append(only1)
    commun_both.append(intersection)

    # Sets have no stable order, so sort (by text form, in case of non-string
    # values) to make the exported columns reproducible from run to run.
    with_only1 = sorted(only1, key=str)
    with_only2 = sorted(only2, key=str)
    in_both = sorted(intersection, key=str)

    #create a dictionary: one entry per future column
    d = dict(Only_ATG7_1 = with_only1, Only_ATG7_2 = with_only2, Intersection= in_both)

    #Convert to dataframe (one row per key, padded where the lists are shorter)
    df_only_and_intersection = pandas.DataFrame.from_dict(d, orient='index')
    name_file = 'only_and_intersection_isos'+dfs[2]+dfs[3]

    #Transpose to have one column per category, then save
    df_only_and_intersection = df_only_and_intersection.transpose()
    df_only_and_intersection['info'] = dfs[2]+dfs[3]
    df_only_and_intersection.to_excel(name_file+'.xlsx', index=False)

    #Venn Diagram
    venn2(subset, set_labels=('ATG7(1)', 'ATG7(2)'), set_colors=('grey', 'darksalmon'))
    label_title = dfs[2] + ' ' + dfs[3]
    plt.title(label_title)
    plt.show()
    # close the figure instead of only clearing it, so no empty figure stays open
    plt.close()
    print()

In [None]:
# Number of comparisons currently held in `data`.
len(data)

In [None]:
# Number of "ATG7(1) only" gene sets, followed by a legend of the expected entries.
# NOTE(review): `list_only1` is (re)built by the 100%-of-genes Venn cell further down;
# confirm it already exists when this cell runs after a kernel restart.
print(len(list_only1), '= norm_pos', 'prim_pos', 'norm_neg', 'prim_neg')

In [None]:
#Only one gene is commun in positive correlation. Which one?
print('normal_positive:',commun_both[0])
print('primaryl_positive:',commun_both[1])

In [None]:
#VENN DIAGRAMS FOR 100% of the genes
# Each entry is [ATG7(1) table, ATG7(2) table, tissue label, correlation sign].

data_pos =  [
         [all_positiv_corr_spearman[0], all_positiv_corr_spearman[1], 'Normal Blood', 'positive'],
         [all_positiv_corr_spearman[2], all_positiv_corr_spearman[3], 'Primary Blood (Periph)', 'positive']
         ]

data_neg =  [
         [all_negativ_corr_spearman[0], all_negativ_corr_spearman[1], 'Normal Blood', 'negative'],
         [all_negativ_corr_spearman[2], all_negativ_corr_spearman[3], 'Primary Blood (Periph)', 'negative']
         ]

data = data_pos + data_neg

# Gene ids found for one isoform only; one set per comparison, in the order of
# `data` (normal positive, primary positive, normal negative, primary negative).
list_only1 = []
list_only2 = []

for dfs in data:
  set1 = set(dfs[0]['ensembl_gene_id'])
  set2 = set(dfs[1]['ensembl_gene_id'])
  subset = (set1, set2)

  # a plain set difference gives the ids that belong to one isoform only
  with_only1 = set1 - set2
  list_only1.append(with_only1)

  with_only2 = set2 - set1
  list_only2.append(with_only2)

  #Venn Diagram
  venn2(subset, set_labels=('ATG7(1)', 'ATG7(2)'), set_colors=('grey', 'darksalmon'))
  label_title = dfs[2] + ' ' + dfs[3]
  plt.title(label_title)
  plt.show()
  # close the figure instead of only clearing it, so no empty figure stays open
  plt.close()
  print()



In [None]:
# The legend follows the order in which the sets were appended (data_pos + data_neg):
# normal positive, primary positive, normal negative, primary negative.
print(len(list_only2), '= norm_pos_iso2, prim_pos_iso2, norm_neg_iso2, prim_neg_iso2')

In [None]:
# Genes positively correlated with ATG7(2) only, in normal blood → list_only2[0]
# (the original note reports only 232 of them).
# Turn that set of ids into a one-column table so it can be merged with the
# full positive-correlation table of normal blood ATG7(2).
iso2_only_ids = (
    pandas.DataFrame.from_dict(dict.fromkeys(list_only2[0],0), orient='index')
    .reset_index()
    .rename(columns = {'index' : 'ensembl_gene_id'})
)
dfmerged = iso2_only_ids.merge(all_positiv_corr_spearman[1])
dfmerged = dfmerged.assign(Type = 'Normal Blood', Isoform = 'ATG7(2) only')
# Positional drop: column 1 is the placeholder column of zeros created above;
# which column sits at position 5 depends on the merged table layout - TODO confirm.
dfmerged = dfmerged.drop(columns = dfmerged.columns[[1, 5]])

#remove duplicates in genes (the first row of each gene id is kept)
print(dfmerged.shape)
print('number of duplicated genes:', dfmerged.duplicated('ensembl_gene_id').sum())
dfmerged = dfmerged.drop_duplicates("ensembl_gene_id")
print(dfmerged.shape)

# highest r_value first; keep an independent copy under an explicit name
dfmerged = dfmerged.sort_values(['r_value'], ascending = False)
only_iso2_normal_pos = dfmerged.copy()
only_iso2_normal_pos

In [None]:
# Distribution of r_value for the genes positively correlated with ATG7(2) only (normal blood).
seaborn.displot(only_iso2_normal_pos, x="r_value", aspect = 2.6)
label_title = 'Distribution of pos correlation in iso2 only, normal'
plt.title(label_title, fontsize = 22)
# Render explicitly, like the other plotting cells, so the cell does not end on
# the Text object returned by plt.title().
plt.show()

In [None]:
#Save to Excel (disabled: uncomment the line below to write the file)
# only_iso2_normal_pos.to_excel('Correlated_only_iso2_normal_pos.xlsx', index=False)