In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

In [5]:
df_SomaticMT = pd.read_csv('OmicsSomaticMutations.csv', low_memory=False)

In [3]:
DRC_table = pd.read_excel('DRCtableAUC_merge_DepmapID.xlsx')
DRC_table.head()

Unnamed: 0,Cell name,Depmap ID,"AUC, DEG-35"
0,143B,ACH-001001,0.738025
1,22RV1,ACH-000956,0.543824
2,2313287,ACH-000948,0.362258
3,253J,ACH-000011,0.811752
4,253JBV,ACH-000026,0.713113


In [6]:
df_SomaticMT.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,DP,RefCount,AltCount,GT,PS,...,PharmgkbId,DidaID,DidaName,GwasDisease,GwasPmID,GtexGene,ProveanPrediction,Rescue,ModelID,EntrezGeneID
0,chr1,818203,G,A,0.24,27,21,6,0/1,,...,,,,,,,,False,ACH-000062,400728.0
1,chr1,924657,C,G,0.437,17,9,8,0/1,,...,,,,,,,,False,ACH-000693,148398.0
2,chr1,924750,C,T,0.625,19,7,12,0/1,,...,,,,,,,,False,ACH-000930,148398.0
3,chr1,924909,G,A,0.285,52,37,15,0/1,,...,,,,,,,,False,ACH-001691,148398.0
4,chr1,930198,C,T,0.366,42,27,15,0/1,,...,,,,,,,Neutral,False,ACH-000956,148398.0


In [7]:
df_SomaticMT = df_SomaticMT.rename(columns={'ModelID': 'Depmap ID'})

In [8]:
df_SomaticMT = df_SomaticMT.merge(DRC_table, how='inner')
df_SomaticMT.head()

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,DP,RefCount,AltCount,GT,PS,...,DidaName,GwasDisease,GwasPmID,GtexGene,ProveanPrediction,Rescue,Depmap ID,EntrezGeneID,Cell name,"AUC, DEG-35"
0,chr1,818203,G,A,0.24,27,21,6,0/1,,...,,,,,,False,ACH-000062,400728.0,RERFLCMS,0.834513
1,chr1,24469099,A,G,0.978,44,0,44,1|1,,...,,,,,Neutral,False,ACH-000062,57185.0,RERFLCMS,0.834513
2,chr1,34899671,G,C,0.155,30,26,4,0/1,,...,,,,,Neutral,False,ACH-000062,58512.0,RERFLCMS,0.834513
3,chr1,35390086,C,T,0.15,41,35,6,0/1,,...,,,,,,False,ACH-000062,9202.0,RERFLCMS,0.834513
4,chr1,48776614,G,T,0.799,43,8,35,0/1,,...,,,,,Neutral,False,ACH-000062,79656.0,RERFLCMS,0.834513


In [9]:
df_SomaticMT.to_csv('df_SomaticMT.csv')

In [10]:
MT_gene_avg_auc = df_SomaticMT.groupby('HugoSymbol').agg({
    'AUC, DEG-35': 'mean',
    'Cell name': lambda x: ', '.join(map(str, x)),
    'HgncName': 'first'
}).reset_index()

MT_gene_avg_auc.head()

Unnamed: 0,HugoSymbol,"AUC, DEG-35",Cell name,HgncName
0,A1BG,0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein
1,A1CF,0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor
2,A2M,0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin
3,A2ML1,0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1
4,A3GALT2,0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2


In [11]:
MT_gene_avg_auc.to_csv('MT-Gene_AvgAUC.csv')

In [30]:
# Step 1: Create a new DataFrame to store the results
WT_gene_avg_auc = pd.DataFrame(columns=['HugoSymbol', 'Mean AUC'])

# Step 2: Iterate over each row in MT_gene_avg_auc
for index, row in MT_gene_avg_auc.iterrows():
    # Split the cell names and convert to a set
    cell_names = set(row['Cell name'].split(', '))
    
    # Subtract the cell names from the total list of cell names
    new_cell_names = set(DRC_table['Cell name']).difference(cell_names)
    
    # Filter DRC to include only the rows with cell names from the new list of cell names
    new_DRC = DRC_table[DRC_table['Cell name'].isin(new_cell_names)]
    
    # Calculate the mean AUC for the remaining cell names
    mean_auc = new_DRC['AUC, DEG-35'].mean()
    
    # Convert all elements in new_cell_names to strings before joining
    new_cell_names = [str(cell) for cell in new_cell_names]
    
    # Append the results to new_gene_avg_auc
    #WT_gene_avg_auc = WT_gene_avg_auc.append({'HugoSymbol': row['HugoSymbol'], 'Mean AUC': mean_auc, 'WT Cell names': ', '.join(new_cell_names)}, ignore_index=True)
    new_row = {'HugoSymbol': row['HugoSymbol'], 'Mean AUC': mean_auc, 'WT Cell names': ', '.join(new_cell_names)}

    # Convert the new row into a DataFrame
    new_row_df = pd.DataFrame([new_row])

    # Concatenate the new row DataFrame with WT_gene_avg_auc
    WT_gene_avg_auc = pd.concat([WT_gene_avg_auc, new_row_df], ignore_index=True)



In [31]:
WT_gene_avg_auc.head()

Unnamed: 0,HugoSymbol,Mean AUC,WT Cell names
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983..."
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983..."
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983..."
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN..."
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983..."


In [32]:
WT_gene_avg_auc.to_excel('WT_gene_avg_AUC_final.xlsx')

In [33]:
# Merge the two DataFrames on 'HugoSymbol'
merged_df = pd.merge(WT_gene_avg_auc, MT_gene_avg_auc, on='HugoSymbol', suffixes=('_WT', '_MT'))

# Calculate the difference of the mean AUCs
merged_df['AUC_difference_MT-WT'] = merged_df['AUC, DEG-35'] - merged_df['Mean AUC']

difference_df = merged_df[['HugoSymbol', 'AUC_difference_MT-WT']]

In [34]:
difference_df.head()

Unnamed: 0,HugoSymbol,AUC_difference_MT-WT
0,A1BG,-0.021709
1,A1CF,-0.051189
2,A2M,-0.078303
3,A2ML1,-0.034273
4,A3GALT2,0.026214


In [None]:
difference_df.to_excel('AUC_difference_Mt-Wt.xlsx')

In [35]:
def get_auc(cell_names):
    return DRC_table[DRC_table['Cell name'].isin(cell_names)]['AUC, DEG-35'].tolist()

WT_gene_avg_auc2 = WT_gene_avg_auc
WT_gene_avg_auc2['AUC values'] = WT_gene_avg_auc2['WT Cell names'].apply(lambda x: get_auc(x.split(', ')))

MT_gene_avg_auc2 = MT_gene_avg_auc
MT_gene_avg_auc['AUC values'] = MT_gene_avg_auc['Cell name'].apply(lambda x: get_auc(x.split(', ')))


In [36]:
WT_gene_avg_auc2.head()

Unnamed: 0,HugoSymbol,Mean AUC,WT Cell names,AUC values
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175..."
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311..."
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175..."
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311..."
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175..."


In [37]:
MT_gene_avg_auc2.head()

Unnamed: 0,HugoSymbol,"AUC, DEG-35",Cell name,HgncName,AUC values
0,A1BG,0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588..."
1,A1CF,0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131..."
2,A2M,0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416..."
3,A2ML1,0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187..."
4,A3GALT2,0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666..."


In [40]:
print(len(WT_gene_avg_auc))
print(len(MT_gene_avg_auc))

18612
18612


In [41]:
All_cell_MTvsWT = WT_gene_avg_auc2.merge(MT_gene_avg_auc2, how = 'inner', on = 'HugoSymbol', suffixes=('_WT','_MT'))

In [42]:
All_cell_MTvsWT.head()

Unnamed: 0,HugoSymbol,Mean AUC,WT Cell names,AUC values_WT,"AUC, DEG-35",Cell name,HgncName,AUC values_MT
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588..."
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131..."
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416..."
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187..."
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666..."


In [43]:
All_cell_MTvsWT = All_cell_MTvsWT.rename(columns={
    'Mean AUC':'Mean AUC_WT', 
    'WT Cell names':'Cell names_WT',
    'AUC, DEG-35':'Mean AUC_MT',
    'Cell name':'Cell names_MT'})

In [44]:
All_cell_MTvsWT.head()

Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588..."
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131..."
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416..."
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187..."
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666..."


In [45]:
All_cell_MTvsWT = All_cell_MTvsWT.merge(difference_df, how = 'left', on = 'HugoSymbol')

In [46]:
All_cell_MTvsWT.head()

Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588...",-0.021709
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131...",-0.051189
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416...",-0.078303
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187...",-0.034273
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666...",0.026214


In [47]:
All_cell_MTvsWT.to_excel('All_cell_MTvsWT.xlsx')

In [64]:
from scipy.stats import levene

# Step 1: Iterate over each row in MT_gene_avg_auc
F_test = pd.DataFrame(columns=['stat', 'p_value'])

for index, row in All_cell_MTvsWT.iterrows():
    
    # Extract AUC values from the row
    auc1 = row['AUC values_WT']
    auc2 = row['AUC values_MT']
    
    #onvert the lists of arrays into arrays of integers
    auc_array1 = np.array(auc1, dtype=int)
    auc_array2 = np.array(auc2, dtype=int)
    
    # Perform Levene's test for equal variances
    stat, p_value = levene(auc_array1, auc_array2, nan_policy='propagate')
    
    # F-test results
    new_row = {'stat': stat, 'p_value': p_value}

    # Convert the new row into a DataFrame
    new_row_df = pd.DataFrame([new_row])

    # Concatenate the new row DataFrame with WT_gene_avg_auc
    F_test = pd.concat([F_test, new_row_df], ignore_index=True)
    

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [57]:
auc1.head()

0    [0.738025461647376, 0.543823949372426, 0.81175...
1    [0.738025461647376, 0.811752287495537, 0.71311...
2    [0.738025461647376, 0.543823949372426, 0.81175...
3    [0.738025461647376, 0.811752287495537, 0.71311...
4    [0.738025461647376, 0.543823949372426, 0.81175...
Name: AUC values_WT, dtype: object

In [65]:
F_test.head()

Unnamed: 0,stat,p_value
0,0.608839,0.435554
1,0.971495,0.324732
2,0.010121,0.919901
3,0.14172,0.706719
4,0.743537,0.388898


In [68]:
All_cell_MTvsWT = All_cell_MTvsWT.join(F_test, how = "left")

In [69]:
All_cell_MTvsWT.head()

Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT,Variance Test,stat,p_value
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588...",-0.021709,True,0.608839,0.435554
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131...",-0.051189,True,0.971495,0.324732
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416...",-0.078303,True,0.010121,0.919901
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187...",-0.034273,True,0.14172,0.706719
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666...",0.026214,True,0.743537,0.388898


In [70]:
All_cell_MTvsWT.to_excel('All_cell_MTvsWT_FTest.xlsx')

In [71]:
All_cell_MTvsWT = All_cell_MTvsWT.rename(columns={
    'stat':'F-Test_stat', 
    'p_value':'F-Test_p-value',
    })
All_cell_MTvsWT = All_cell_MTvsWT.drop(columns = ['Variance Test'])

In [72]:
All_cell_MTvsWT.head()

Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT,F-Test_stat,F-Test_p-value
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588...",-0.021709,0.608839,0.435554
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131...",-0.051189,0.971495,0.324732
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416...",-0.078303,0.010121,0.919901
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187...",-0.034273,0.14172,0.706719
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666...",0.026214,0.743537,0.388898


In [74]:
All_cell_MTvsWT.to_excel('All_cell_MTvsWT_FTest.xlsx')

In [77]:
def Ftest(i):
    if i > 0.05:
        return True
    return False

# Apply the function to create a new column
All_cell_MTvsWT['F-test'] = All_cell_MTvsWT['F-Test_p-value'].apply(Ftest)

In [78]:
All_cell_MTvsWT.head()

Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT,F-Test_stat,F-Test_p-value,F-test
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588...",-0.021709,0.608839,0.435554,True
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131...",-0.051189,0.971495,0.324732,True
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416...",-0.078303,0.010121,0.919901,True
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187...",-0.034273,0.14172,0.706719,True
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666...",0.026214,0.743537,0.388898,True


In [1]:
All_cell_MTvsWT.to_excel('All_cell_MTvsWT_FTest.xlsx')

NameError: name 'All_cell_MTvsWT' is not defined

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind

All_cell_MTvsWT = pd.read_excel('All_cell_MTvsWT_FTest.xlsx')

In [11]:
All_cell_MTvsWT = All_cell_MTvsWT.dropna()

# Define a function to check if a value is a list containing only one element
def has_single_element(lst):
    return isinstance(lst, list) and len(lst) == 1

# Remove rows where 'Cell names_WT' column contains only one element
df_cleaned = All_cell_MTvsWT[All_cell_MTvsWT['Cell names_WT'].apply(has_single_element)]
df_cleaned = All_cell_MTvsWT[All_cell_MTvsWT['Cell names_MT'].apply(has_single_element)]

print(len(All_cell_MTvsWT))

18023


In [29]:

# Create an empty list to store the rows to be concatenated
rows_to_concat = []

# Iterate over rows in the DataFrame
for index, row in All_cell_MTvsWT.iterrows():
    num_elem1 = len(row['Cell names_WT'])
    num_elem2 = len(row['Cell names_MT'])
    
    if num_elem1 == 1 or num_elem2 == 2:
        rows_to_concat.append(row)
    #elif num_elem1 == 1 or num_elem2 == 1:
        #if has_single_element(row['Cell names_WT']) and has_single_element(row['Cell names_MT']):
            #rows_to_concat.append(row)

# Concatenate the rows into a DataFrame
#df_cleaned = pd.concat(rows_to_concat, axis=1).T

print(len(df_cleaned))

18023


In [30]:
print(len(rows_to_concat))

0


In [32]:

df_cleaned.to_excel('df_cleaned.xlsx')

In [33]:
df_cleaned.head()

Unnamed: 0.1,Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT,F-Test_stat,F-Test_p-value,F-test
0,0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588...",-0.021709,0.608839,0.435554,True
1,1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131...",-0.051189,0.971495,0.324732,True
2,2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416...",-0.078303,0.010121,0.919901,True
3,3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187...",-0.034273,0.14172,0.706719,True
4,4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666...",0.026214,0.743537,0.388898,True


In [34]:
from scipy import stats

# Step 1: Iterate over each row in MT_gene_avg_auc
T_test = pd.DataFrame(columns=['stat', 'p_value', 'DoF'])

for index, row in df_cleaned.iterrows():
    
    # Extract AUC values from the row
    auc1 = [float(value) for value in row['AUC values_WT'].strip('[]').split(',') if value.strip()]
    auc2 = [float(value) for value in row['AUC values_MT'].strip('[]').split(',') if value.strip()]
    var = row['F-test']
    #onvert the lists of arrays into arrays of integers
    #auc_array1 = np.array(auc1, dtype=float)
    #auc_array2 = np.array(auc2, dtype=float)
    
    # Perform t-test
    if var:  # If F-test indicates equal variances
        stat, p_value = stats.ttest_ind(auc1, auc2, equal_var=True, nan_policy='omit')
    else:
        stat, p_value = stats.ttest_ind(auc1, auc2, equal_var=False, nan_policy='omit')
    
    # Calculate degrees of freedom
    DoF = len(auc1) + len(auc2) - 2
    
    # F-test results
    new_row = {'stat': stat, 'p_value': p_value, 'DoF': DoF}

    # Convert the new row into a DataFrame
    new_row_df = pd.DataFrame([new_row])

    # Concatenate the new row DataFrame with WT_gene_avg_auc
    T_test = pd.concat([T_test, new_row_df], ignore_index=True)
    

  res = hypotest_fun_out(*samples, **kwds)


In [35]:
T_test.head()

Unnamed: 0,stat,p_value,DoF
0,0.509134,0.610858,562
1,1.595908,0.111072,562
2,1.787487,0.074398,562
3,1.233368,0.217954,562
4,-0.512934,0.608199,562


In [86]:
All_cell_MTvsWT = All_cell_MTvsWT.join(T_test, how = "left")
All_cell_MTvsWT = All_cell_MTvsWT.rename(columns={
    'stat':'T-Test_stat', 
    'p_value':'T-Test_p-value',
    })

In [87]:
All_cell_MTvsWT.head()


Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT,F-Test_stat,F-Test_p-value,F-test,T-Test_stat,T-Test_p-value,DoF
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588...",-0.021709,0.608839,0.435554,True,0.780281,0.435554,562
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131...",-0.051189,0.971495,0.324732,True,0.985644,0.324732,562
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416...",-0.078303,0.010121,0.919901,True,0.100604,0.919901,562
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187...",-0.034273,0.14172,0.706719,True,0.376457,0.706719,562
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666...",0.026214,0.743537,0.388898,True,0.862286,0.388898,562


In [88]:
def Ttest(i):
    if i < 0.05:
        return True
    return False

# Apply the function to create a new column
All_cell_MTvsWT['T-test'] = All_cell_MTvsWT['T-Test_p-value'].apply(Ttest)

In [89]:
All_cell_MTvsWT.head()

Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT,F-Test_stat,F-Test_p-value,F-test,T-Test_stat,T-Test_p-value,DoF,T-test
0,A1BG,0.747705,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.725997,"NCIH1435, MFE319, WM2664, HT115, KYSE450, HGC2...",alpha-1-B glycoprotein,"[0.690339204747059, 0.914688944108179, 0.62588...",-0.021709,0.608839,0.435554,True,0.780281,0.435554,562,False
1,A1CF,0.749348,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.811752287495537, 0.71311...",0.698158,"22RV1, AN3CA, HT115, NCIH650, NCIH1092, KATOII...",APOBEC1 complementation factor,"[0.543823949372426, 0.705741491202442, 0.69131...",-0.051189,0.971495,0.324732,True,0.985644,0.324732,562,False
2,A2M,0.749883,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.67158,"LS180, NCIH1435, MFE319, AN3CA, RKO, GP2D, GP2...",alpha-2-macroglobulin,"[0.803290432659885, 0.834731912055259, 0.80416...",-0.078303,0.010121,0.919901,True,0.100604,0.919901,562,False
3,A2ML1,0.749294,"OVTOKO, NCIH28, G402, LAMA84, RD, WM983B, NCIN...","[0.738025461647376, 0.811752287495537, 0.71311...",0.715021,"22RV1, HEC1A, NCIH1435, WM2664, WM2664, VMCUB1...",alpha-2-macroglobulin like 1,"[0.543823949372426, 0.77116001889797, 0.849187...",-0.034273,0.14172,0.706719,True,0.376457,0.706719,562,False
4,A3GALT2,0.746512,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.772726,"SNU407, LS180, MFE296, MFE319, SNUC2A, KYSE450...",alpha 1%3B3-galactosyltransferase 2,"[0.809228736277344, 0.73658900825747, 0.845666...",0.026214,0.743537,0.388898,True,0.862286,0.388898,562,False


In [90]:
All_cell_MTvsWT.to_excel('All_cell_MTvsWT_TTest.xlsx')

In [36]:
All_cell_MTvsWT_Ttest = pd.read_excel('All_cell_MTvsWT_TTest.xlsx')

In [37]:
# Calculate the difference of the mean AUCs
All_cell_MTvsWT_Ttest['AUC_ratio_MT/WT'] = All_cell_MTvsWT_Ttest['Mean AUC_MT'] / All_cell_MTvsWT_Ttest['Mean AUC_WT']

All_cell_MTvsWT_Ttest.head()

Unnamed: 0.1,Unnamed: 0,HugoSymbol,Mean AUC_WT,Cell names_WT,AUC values_WT,Mean AUC_MT,Cell names_MT,HgncName,AUC values_MT,AUC_difference_MT-WT,F-Test_stat,F-Test_p-value,F-test,T-Test_stat,T-Test_p-value,DoF,T-test,AUC_ratio_MT/WT
0,12267.0,PPP1R11,0.746274,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",1.0,"SCLC21H, BXPC3",protein phosphatase 1 regulatory inhibitor sub...,"[1.0, 1.0]",0.253726,0.077359,0.781012,1.0,-7.16527,2.449428e-12,562.0,1.0,1.33999
1,17473.0,VCX2,0.746722,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",0.681129,"2313287, KASUMI2","variable charge, X-linked 2",[1.0],-0.065593,0.040521,0.840538,1.0,-4.950122,9.818968e-07,562.0,1.0,0.912158
2,1577.0,BPIFA4P,0.746722,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",1.0,OVISE,"BPI fold containing family A member 4, pseudogene",[1.0],0.253278,0.040521,0.840538,1.0,-4.950122,9.818968e-07,562.0,1.0,1.339186
3,4544.0,EHHADH-AS1,0.746722,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",1.0,OVISE,EHHADH antisense RNA 1,[1.0],0.253278,0.040521,0.840538,1.0,-4.950122,9.818968e-07,562.0,1.0,1.339186
4,4649.0,EMBP1,0.746722,"OVTOKO, NCIH28, SNB75, G402, LAMA84, RD, WM983...","[0.738025461647376, 0.543823949372426, 0.81175...",1.0,NCIH1437,embigin pseudogene 1,[1.0],0.253278,0.040521,0.840538,1.0,-4.950122,9.818968e-07,562.0,1.0,1.339186


In [39]:
All_cell_MTvsWT_Ttest = All_cell_MTvsWT_Ttest.dropna()

All_cell_MTvsWT_Ttest.to_excel('All_cell_MTvsWT_TTest_2.xlsx')