<a href="https://colab.research.google.com/github/JoshJingtianWang/TCGA-Splicing-Data-Cleaning-and-Filtering/blob/main/ORAI2PSIcompare.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create a violin plot of ORAI2 PSI for Normal vs. All Cancer vs. SF3B1-mutant vs. SF3B1 K700E samples

In [None]:
# NOTE(review): these are shell commands, not Python; presumably they are run in
# a terminal on the cluster before the Python cells below — confirm.
# Start an interactive bash shell through srun with 4 CPUs (-c 4), a
# pseudo-terminal (--pty) and X11 forwarding (--x11).
srun -c 4 --pty --x11 /bin/bash -i #enable x forwarding
# Load the Python environment module.
module load python

In [None]:
import pandas as pd
import datetime as dt
import os
import matplotlib.pyplot as plt
# Work from the project data directory so the relative file names used by the
# following cells resolve. The location can be overridden by setting the
# SPLICING_TCGA_WORKDIR environment variable; when it is unset the original
# cluster path is used, so existing runs behave exactly as before.
os.chdir(os.environ.get('SPLICING_TCGA_WORKDIR',
                        '/dfs5/bio/jingtiw2/SpitaleLab/SplicingTCGA/workingfiles'))

In [None]:
# Collect the ORAI2 PSI values of the SF3B1 K700E samples.

# The six non-sample columns that are read alongside the barcodes and later
# skipped with .iloc[:, 6:].
list1 = ['event_id', 'event_type', 'event_chr', 'event_coordinates',
         'alt_region_coordinates', 'gene_name']

# K700E TCGA barcodes come from a tab-separated table generated in R.
IDbarcode = pd.read_csv('K700Ebarcodes.txt', sep='\t')
K700Ebarcodes = IDbarcode.loc[:, 'submitter_id'].tolist()
# This case is not found in the Ratsch data, so it cannot be requested as a column.
K700Ebarcodes.remove('TCGA-AB-2929')
# Final column selection: annotation columns first, then the sample barcodes.
K700Ebarcodes = list1 + K700Ebarcodes

# Read only the selected columns of the PSI table and report the elapsed time.
# This will take one to two minutes to complete on the interactive node (4 cores).
start = dt.datetime.now()
elapsed = (dt.datetime.now() - start).seconds
print('{} seconds: Starting...'.format(elapsed))
PSIK700E = pd.read_csv(
    'threeprimePSItrimmedheader',
    sep='\t',
    usecols=K700Ebarcodes,
    low_memory=True,
)
elapsed = (dt.datetime.now() - start).seconds
print('{} seconds: completed'.format(elapsed))

# Keep the row of event alt_3prime_227154, skip the first six columns, drop
# sample columns holding a missing value, and flatten what is left to a list.
is_orai2 = PSIK700E['event_id'] == 'alt_3prime_227154'
orai2_samples = PSIK700E.loc[is_orai2].iloc[:, 6:]
K700EORAI2PSI = orai2_samples.dropna(axis=1).values.flatten().tolist()

In [None]:
# Collect the ORAI2 PSI values of the normal samples.

# The normal-only PSI table is read in chunks of `chunksize` rows and the
# chunks are concatenated back into one frame with a fresh index.
# Takes about 17 seconds to complete on the interactive node (4 cores).
chunksize = 20000
start = dt.datetime.now()
elapsed = (dt.datetime.now() - start).seconds
print('{} seconds: Starting...'.format(elapsed))
tp = pd.read_csv(
    'normaltablewID.txt',
    sep='\t',
    encoding='utf-8',
    chunksize=chunksize,
    iterator=True,
)
normdf = pd.concat([chunk for chunk in tp], ignore_index=True)
elapsed = (dt.datetime.now() - start).seconds
print('{} seconds: completed'.format(elapsed))

# Keep the row of event alt_3prime_227154, skip the first six columns, drop
# sample columns holding a missing value, and flatten what is left to a list.
is_orai2 = normdf['event_id'] == 'alt_3prime_227154'
orai2_samples = normdf.loc[is_orai2].iloc[:, 6:]
NormORAI2PSI = orai2_samples.dropna(axis=1).values.flatten().tolist()

In [None]:
#get ORAI2 PSI from all (except normal) samples

# Non-sample annotation columns, as listed in the K700E and SF3B1 cells.
annotation_cols = ['event_id', 'event_type', 'event_chr', 'event_coordinates',
                   'alt_region_coordinates', 'gene_name']

#reading in all PSI table (only has ORAI2 row)
AllPSI = pd.read_csv('ORAI2_allheader', sep='\t')
print('All-sample table shape: {}'.format(AllPSI.shape))

#getting list of normal IDs
# norm_list holds every column header of the normal table, which includes the
# annotation columns such as 'event_id'.
norm_list = list(normdf)

#dropping normal columns
# Only the sample columns of the normal table may be removed here. Dropping all
# of norm_list also removed the annotation columns, so the positional
# .iloc[:, 6:] that followed silently discarded six cancer-sample columns.
normal_sample_ids = [col for col in norm_list if col not in annotation_cols]
AllPSI = AllPSI.drop(normal_sample_ids, axis=1)
print('Shape after dropping normal samples: {}'.format(AllPSI.shape))

#getting list of all (except normal) ORAI2 PSI
# Annotation columns are removed by name rather than by position, then sample
# columns holding a missing value are dropped and the rest is flattened.
# NOTE(review): assumes every remaining non-annotation column is a sample
# column — confirm against the header of 'ORAI2_allheader'.
AllORAI2PSI = (
    AllPSI
    .drop(annotation_cols, axis=1, errors='ignore')
    .dropna(axis=1)
    .values.flatten().tolist()
)

In [None]:
# Collect the ORAI2 PSI values of the SF3B1 samples.

# The six non-sample columns that are read alongside the barcodes and later
# skipped with .iloc[:, 6:].
list1 = ['event_id', 'event_type', 'event_chr', 'event_coordinates',
         'alt_region_coordinates', 'gene_name']

# Barcodes excluded from the SF3B1 list because they are not found in the Ratsch data.
listtoremove = [
    'TCGA-AZ-4615', 'TCGA-B5-A11U', 'TCGA-B5-A11E', 'TCGA-E6-A1LX', 'TCGA-EY-A1GI',
    'TCGA-A5-A0GI', 'TCGA-12-0828', 'TCGA-B5-A11N', 'TCGA-FI-A2D5', 'TCGA-CN-A6V1',
    'TCGA-AA-A00N', 'TCGA-AA-3510', 'TCGA-BG-A222', 'TCGA-AX-A1CE', 'TCGA-AB-2828',
    'TCGA-A5-A0G1', 'TCGA-EE-A3AE', 'TCGA-AA-A010', 'TCGA-BS-A0UF', 'TCGA-AA-3672',
    'TCGA-AP-A1DK', 'TCGA-AP-A0LM', 'TCGA-AP-A0LD', 'TCGA-A5-A0G2', 'TCGA-D1-A103',
    'TCGA-AA-3977', 'TCGA-D1-A15X', 'TCGA-BS-A0UM', 'TCGA-AX-A06F', 'TCGA-EY-A1GD',
    'TCGA-BS-A0UV', 'TCGA-B5-A1MR', 'TCGA-D1-A16X', 'TCGA-AX-A05Z', 'TCGA-AX-A06L',
    'TCGA-AB-2912', 'TCGA-B7-5816', 'TCGA-AP-A051', 'TCGA-A5-A1OF', 'TCGA-AX-A0J1',
    'TCGA-CK-4951', 'TCGA-AA-3967', 'TCGA-D1-A17M', 'TCGA-AX-A2H8', 'TCGA-19-5956',
    'TCGA-B5-A1MX', 'TCGA-EY-A1H0', 'TCGA-AP-A1E0', 'TCGA-EO-A22R', 'TCGA-AP-A1DO',
    'TCGA-14-1821', 'TCGA-14-1794', 'TCGA-BG-A2AE', 'TCGA-BS-A0TC', 'TCGA-AA-3984',
    'TCGA-AP-A1DV', 'TCGA-AB-2929',
]

# SF3B1 TCGA barcodes come from a tab-separated table generated in R.
IDbarcode = pd.read_csv('SF3B1barcodes.txt', sep='\t')
SF3B1barcodes = IDbarcode.loc[:, 'submitter_id'].tolist()
# Set difference: duplicates collapse and the original order is not kept.
SF3B1barcodes2 = list(set(SF3B1barcodes).difference(listtoremove))
# Final column selection: annotation columns first, then the sample barcodes.
SF3B1barcodes3 = list1 + SF3B1barcodes2

# Read only the selected columns of the PSI table and report the elapsed time.
import datetime as dt
start = dt.datetime.now()
elapsed = (dt.datetime.now() - start).seconds
print('{} seconds: Starting...'.format(elapsed))
PSISF3B1 = pd.read_csv(
    'threeprimePSItrimmedheader',
    sep='\t',
    usecols=SF3B1barcodes3,
    low_memory=True,
)
elapsed = (dt.datetime.now() - start).seconds
print('{} seconds: completed'.format(elapsed))

# Keep the row of event alt_3prime_227154, skip the first six columns, drop
# sample columns holding a missing value, and flatten what is left to a list.
is_orai2 = PSISF3B1['event_id'] == 'alt_3prime_227154'
orai2_samples = PSISF3B1.loc[is_orai2].iloc[:, 6:]
SF3B1ORAI2PSI = orai2_samples.dropna(axis=1).values.flatten().tolist()

In [None]:
# Draw the ORAI2 PSI distributions of the four sample groups as a violin plot.
# Reference: https://www.tutorialspoint.com/matplotlib/matplotlib_violin_plot.htm

# One list of PSI values per group, in the same order as the tick labels.
xticklabels = ['Normal', 'All Cancers', 'SF3B1', 'K700E']
data_to_plot = [NormORAI2PSI, AllORAI2PSI, SF3B1ORAI2PSI, K700EORAI2PSI]

# A figure with a single set of axes.
fig, ax = plt.subplots()

# One tick per group at x = 1..4, labelled with the group names.
ax.set_xticks(list(range(1, len(xticklabels) + 1)))
ax.set_xticklabels(xticklabels)
ax.set(title='ORAI2', ylabel='Percent Spliced In')

# Statistical annotations are not drawn in this figure; for one way to add them see
# https://stackoverflow.com/questions/36578458/how-does-one-insert-statistical-annotations-stars-or-p-values-into-matplotlib/37518947

# Draw the violins with a median marker for each group.
bp = ax.violinplot(data_to_plot, showmedians=True)
plt.show()

In [None]:
#concatenate PSI values into a table for ggplot in R

def _psi_table(psi_values, type_label):
    """Return a two-column frame (PSI, Type) for one sample group.

    psi_values: iterable of PSI values for the group.
    type_label: group name stored in every row of the 'Type' column.

    The columns are created by name, so an empty psi_values yields an empty
    frame with both columns instead of failing on a column-count mismatch.
    """
    return pd.DataFrame({'PSI': list(psi_values)}).assign(Type=type_label)


# One table per group; the labels are the group names the R cell compares.
Normtable = _psi_table(NormORAI2PSI, 'Normal')
Alltable = _psi_table(AllORAI2PSI, 'All')
SF3B1table = _psi_table(SF3B1ORAI2PSI, 'SF3B1')
K700Etable = _psi_table(K700EORAI2PSI, 'K700E')

# Stack the groups; each table keeps its own 0..n-1 row index.
tables = [Normtable, Alltable, SF3B1table, K700Etable]
merged = pd.concat(tables)

# Tab-separated output (despite the .csv name). The row index is still written
# as the first, unnamed column, so the file layout is unchanged.
merged.to_csv('merged_PSI_for_R.csv', sep='\t')

In [None]:
#generating plot with stats annotations in R (a box plot as written; see the ggviolin note below)
# NOTE(review): the lines below are shell commands followed by an interactive R
# session, not Python; presumably they are run in a terminal — confirm.

# load the R environment module and start an interactive R session
module load R
R
# ggpubr supplies compare_means, ggboxplot and stat_compare_means used below
library(ggpubr)

# read the tab-separated table of PSI values and group labels
PSI <- read.table("merged_PSI_for_R.csv",header=TRUE,sep='\t')
# discard the X column (presumably the row index written by pandas — confirm)
PSI$X <- NULL

#adding stats to plot
#http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/76-add-p-values-and-significance-levels-to-ggplots/
# overall Kruskal-Wallis comparison of PSI across the Type groups
compare_means(PSI ~ Type,  data = PSI, method = "kruskal.test")
# the six pairwise combinations of the four groups
my_comparisons <- list( c("All", "Normal"), c("All", "SF3B1"), c("All", "K700E"), c("Normal", "SF3B1"),c("Normal", "K700E"),c("K700E", "SF3B1"))

# open a PNG graphics device; the plot below is drawn into this file
png('ORAI2PSICompare.png', width = 1000, height = 1000, res = 200) #res option affects font size

# box plot of PSI per Type, with pairwise significance labels for
# my_comparisons and a second comparison label placed at y = 1.7
ggboxplot(PSI, x = "Type", y = "PSI", #to plot violin plot, simply change ggboxplot to ggviolin
  color = "Type", palette = "jco") +
  stat_compare_means(label = "p.signif", comparisons = my_comparisons) +
  stat_compare_means(label.y = 1.7)

# close the graphics device
dev.off()

#