In [2]:
import pandas as pd
import numpy as np
import pubchempy as pcp
from sklearn.decomposition import PCA
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import AllChem

# Silence pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

In [3]:
# Build the path to the filtered expression table inside the input directory.
data_root = Path("./input_data")
exp_file = data_root / "exp_filtered.csv"

# Echo the resolved relative path so the reader can see which file is used.
print(exp_file)


input_data/exp_filtered.csv


In [4]:
# Sanity check: report whether the expression file exists as a regular file.
exp_file_present = exp_file.is_file()
print(exp_file_present)

True


In [6]:
# Read the expression CSV into a DataFrame using pandas' default parsing options.
df_exp = pd.read_csv(filepath_or_buffer=exp_file)

In [7]:
# Show the loaded expression table (rendered because it is the cell's last expression).
df_exp

Unnamed: 0,COSMIC.ID,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,H3C2,H3C3,AC098582.1,DUS4L.BCAP29,C8orf44.SGK3,ELOA3B,NPBWR1,ELOA3D,ELOA3,CDR1
0,684072,4.327687,0.070389,5.979339,2.906891,4.904484,0.263034,2.235727,0.422233,4.433627,...,1.416840,2.000000,0.042644,1.959770,0.432959,0.070389,0.042644,0.378512,0.070389,0.000000
1,687448,3.266037,0.000000,6.096979,2.521051,3.040892,0.000000,0.831877,6.576069,4.657068,...,0.333424,0.879706,0.839960,2.384050,0.000000,0.028569,0.163499,0.000000,0.028569,0.000000
2,687562,4.374344,0.000000,6.963821,2.292782,4.001802,0.495695,2.729009,6.396947,5.355792,...,2.056584,0.000000,0.333424,2.792855,0.584963,0.000000,0.238787,0.070389,0.000000,0.028569
3,687568,3.477677,0.000000,6.762615,2.107688,4.371559,0.028569,1.321928,5.440288,4.072963,...,1.910733,1.035624,0.367371,1.673556,0.097611,0.000000,0.042644,0.124328,0.000000,0.389567
4,687590,3.244887,0.000000,7.124535,1.933573,3.493135,0.014355,3.168321,6.090642,4.505891,...,1.695994,1.613532,0.594549,4.015248,0.286881,0.000000,0.028569,0.000000,0.000000,0.411426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,1330948,0.176323,0.000000,6.348197,2.280956,3.636915,1.831877,5.785551,5.497293,6.055065,...,1.937344,0.432959,0.454176,2.920293,0.189034,0.042644,0.014355,0.042644,0.042644,0.000000
658,1503364,4.585563,0.000000,7.524973,2.211012,2.786596,0.000000,0.310340,5.228819,4.791293,...,1.144046,0.782409,0.263034,1.220330,0.111031,0.000000,0.000000,0.000000,0.000000,0.000000
659,1659818,0.137504,0.000000,6.757423,3.176323,5.111449,0.097611,0.189034,3.688180,4.213347,...,0.650765,1.422233,0.604071,2.223423,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
660,1659819,0.000000,0.000000,7.059939,2.761285,4.144862,0.443607,0.000000,0.056584,4.491853,...,1.443607,1.906891,1.070389,1.448901,0.641546,0.000000,0.000000,0.000000,0.000000,0.000000


In [8]:
# Per-column standard deviation of every expression column.
# The columns are selected by dropping the COSMIC.ID identifier column by name
# rather than by a hard-coded positional slice (1:19194), so the result stays
# correct if the number of gene columns in the input file changes. For the
# current file both selections cover the same 19193 columns.
std_2_to_19194 = df_exp.drop(columns="COSMIC.ID").std()

print("2열부터 19194열까지의 열의 표준편차:")
print(std_2_to_19194)

2열부터 19194열까지의 열의 표준편차:
TSPAN6      1.636366
TNMD        0.226464
DPM1        0.610333
SCYL3       0.567319
C1orf112    0.690382
              ...   
ELOA3B      0.065843
NPBWR1      0.659847
ELOA3D      0.069546
ELOA3       0.097073
CDR1        0.144252
Length: 19193, dtype: float64


In [10]:
# Quartiles (25%, 50%, 75%) of the per-gene standard deviations.
q1, median, q3 = (
    std_2_to_19194.quantile(q=level) for level in (0.25, 0.5, 0.75)
)

# Print each quartile with its label, in ascending order.
for label, value in (
    ("표준편차의 1사분위수:", q1),
    ("표준편차의 중앙값:", median),
    ("표준편차의 3사분위수:", q3),
):
    print(label, value)

표준편차의 1사분위수: 0.5760327332808185
표준편차의 중앙값: 0.748986884325046
표준편차의 3사분위수: 1.1204030393247908


In [21]:
# Keep the genes whose standard deviation is at or above the median standard deviation.
median_std = std_2_to_19194.median()
high_variance_mask = std_2_to_19194 >= median_std
selected_columns = std_2_to_19194[high_variance_mask].index

# Build a DataFrame that contains only the selected columns.
filtered_data = df_exp.loc[:, selected_columns]

# Print the result.
print(filtered_data)


       TSPAN6       FGR       CFH     FUCA2      GCLC     STPG1    NIPAL3  \
0    4.327687  0.263034  2.235727  0.422233  4.433627  1.516015  2.759156   
1    3.266037  0.000000  0.831877  6.576069  4.657068  2.545968  4.564988   
2    4.374344  0.495695  2.729009  6.396947  5.355792  2.790772  4.251719   
3    3.477677  0.028569  1.321928  5.440288  4.072963  3.137504  4.774524   
4    3.244887  0.014355  3.168321  6.090642  4.505891  2.469886  4.402586   
..        ...       ...       ...       ...       ...       ...       ...   
657  0.176323  1.831877  5.785551  5.497293  6.055065  2.684819  3.663345   
658  4.585563  0.000000  0.310340  5.228819  4.791293  2.837943  2.972693   
659  0.137504  0.097611  0.189034  3.688180  4.213347  1.981853  4.593354   
660  0.000000  0.443607  0.000000  0.056584  4.491853  2.419539  2.684819   
661  6.345361  0.042644  0.014355  5.785289  3.060047  4.023255  2.629939   

        ENPP4    SEMA3F      CFTR  ...   UPK3BL2  AC093512.2  ARHGAP11B  \


In [22]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Step 2: Standardize each selected gene column (fit the scaler, then transform in one call).
scaler = StandardScaler()
data_standardized = scaler.fit_transform(filtered_data)



In [23]:
# Step 3: Perform PCA
# Reduce the standardized expression matrix to its first `n_components`
# principal components.
# With a few hundred samples and thousands of gene columns, scikit-learn's
# default svd_solver="auto" may choose the randomized SVD solver, whose result
# depends on the random state. Fixing the seed makes the components (and the
# CSV written from them) reproducible across kernel restarts.
n_components = 10
pca = PCA(n_components=n_components, random_state=0)
principal_components = pca.fit_transform(data_standardized)

In [24]:
# Step 4: Wrap the principal components in a DataFrame that shares the row index
# of the filtered expression data; columns are named pc1 .. pc<n_components>.
pc_column_names = [f"pc{k}" for k in range(1, n_components + 1)]
df_pc = pd.DataFrame(
    principal_components,
    index=filtered_data.index.values,
    columns=pc_column_names,
)

In [25]:
# Show the principal-component table (rendered because it is the cell's last expression).
df_pc

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10
0,17.667029,-54.878207,-5.095870,14.363268,-1.678923,1.608322,-20.991596,4.623455,-6.385375,-12.202126
1,-21.374770,-14.521220,32.819719,-5.484588,41.166722,22.654674,21.239078,-22.909192,-16.406344,-11.243928
2,-17.887814,-23.484925,20.606804,-17.477269,-6.445107,15.027438,16.785896,6.253796,5.716616,2.485089
3,-14.696967,-22.876734,27.401705,-0.932784,-5.385297,-16.390636,-4.688088,-4.212353,-3.041296,1.802740
4,-25.772204,-16.017773,47.168636,-7.821304,1.524498,-2.900304,19.768564,4.410283,4.512983,3.179018
...,...,...,...,...,...,...,...,...,...,...
657,49.672065,-0.368873,8.910457,-12.571055,6.360107,12.673995,-18.342769,22.517865,-12.560013,16.351369
658,-6.485265,53.617747,-8.076410,28.584301,-3.137586,2.048587,20.693363,10.368548,3.062866,6.809916
659,53.666893,10.585074,6.061448,-12.827196,4.508623,-8.137419,16.212227,-19.139953,8.020412,20.580321
660,83.536857,14.283863,8.638587,-0.191666,-19.368635,-14.783832,12.247785,-23.645589,21.923298,-7.873945


In [26]:
# Put the COSMIC.ID column in front of the principal components.
# The column-wise concat aligns the two objects on their row index.
cosmic_id = df_exp.loc[:, 'COSMIC.ID']
df_exp_features = pd.concat(objs=[cosmic_id, df_pc], axis="columns")
df_exp_features

Unnamed: 0,COSMIC.ID,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10
0,684072,17.667029,-54.878207,-5.095870,14.363268,-1.678923,1.608322,-20.991596,4.623455,-6.385375,-12.202126
1,687448,-21.374770,-14.521220,32.819719,-5.484588,41.166722,22.654674,21.239078,-22.909192,-16.406344,-11.243928
2,687562,-17.887814,-23.484925,20.606804,-17.477269,-6.445107,15.027438,16.785896,6.253796,5.716616,2.485089
3,687568,-14.696967,-22.876734,27.401705,-0.932784,-5.385297,-16.390636,-4.688088,-4.212353,-3.041296,1.802740
4,687590,-25.772204,-16.017773,47.168636,-7.821304,1.524498,-2.900304,19.768564,4.410283,4.512983,3.179018
...,...,...,...,...,...,...,...,...,...,...,...
657,1330948,49.672065,-0.368873,8.910457,-12.571055,6.360107,12.673995,-18.342769,22.517865,-12.560013,16.351369
658,1503364,-6.485265,53.617747,-8.076410,28.584301,-3.137586,2.048587,20.693363,10.368548,3.062866,6.809916
659,1659818,53.666893,10.585074,6.061448,-12.827196,4.508623,-8.137419,16.212227,-19.139953,8.020412,20.580321
660,1659819,83.536857,14.283863,8.638587,-0.191666,-19.368635,-14.783832,12.247785,-23.645589,21.923298,-7.873945


In [27]:
# Write the median-filtered PCA features to CSV without the row index.
df_exp_features.to_csv(path_or_buf='gene_exp_median_Oct13.csv', index=False)

## 3사분위수(Q3) 기준으로 진행

In [12]:
# Keep the genes whose standard deviation is at or above the third quartile (75th percentile).
q3_std = std_2_to_19194.quantile(0.75)
top_quartile_mask = std_2_to_19194 >= q3_std
selected_columns = std_2_to_19194[top_quartile_mask].index

# Build a DataFrame that contains only the selected columns.
filtered_data2 = df_exp.loc[:, selected_columns]

# Print the result.
print(filtered_data2)

       TSPAN6       FGR       CFH     FUCA2      GCLC     ENPP4    SEMA3F  \
0    4.327687  0.263034  2.235727  0.422233  4.433627  1.189034  3.381283   
1    3.266037  0.000000  0.831877  6.576069  4.657068  0.163499  1.536053   
2    4.374344  0.495695  2.729009  6.396947  5.355792  3.510962  2.017922   
3    3.477677  0.028569  1.321928  5.440288  4.072963  0.042644  0.565597   
4    3.244887  0.014355  3.168321  6.090642  4.505891  2.521051  0.176323   
..        ...       ...       ...       ...       ...       ...       ...   
657  0.176323  1.831877  5.785551  5.497293  6.055065  2.849999  0.555816   
658  4.585563  0.000000  0.310340  5.228819  4.791293  2.659925  0.704872   
659  0.137504  0.097611  0.189034  3.688180  4.213347  2.153805  0.163499   
660  0.000000  0.443607  0.000000  0.056584  4.491853  3.129283  0.226509   
661  6.345361  0.042644  0.014355  5.785289  3.060047  1.859970  0.604071   

         CFTR      CD99      AOC1  ...      NEFL      CCL3    ZNF229  \
0  

In [13]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


# Step 2: Standardize each Q3-selected gene column (fit the scaler, then transform in one call).
scaler2 = StandardScaler()
data_standardized2 = scaler2.fit_transform(filtered_data2)

In [16]:
# Step 3: Perform PCA
# Reduce the standardized Q3-filtered matrix to its first `n_components`
# principal components.
# With a few hundred samples and thousands of gene columns, scikit-learn's
# default svd_solver="auto" may choose the randomized SVD solver, whose result
# depends on the random state. Fixing the seed makes the components (and the
# CSV written from them) reproducible across kernel restarts.
# NOTE: this rebinds `pca`, replacing any model fitted earlier under that name.
n_components = 10
pca = PCA(n_components=n_components, random_state=0)
principal_components2 = pca.fit_transform(data_standardized2)

In [17]:
# Step 4: Wrap the principal components in a DataFrame that shares the row index
# of the Q3-filtered expression data; columns are named pc1 .. pc<n_components>.
pc_column_names2 = [f"pc{k}" for k in range(1, n_components + 1)]
df_pc2 = pd.DataFrame(
    principal_components2,
    index=filtered_data2.index.values,
    columns=pc_column_names2,
)

In [18]:
# Show the Q3-based principal-component table (rendered because it is the cell's last expression).
df_pc2

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10
0,10.372431,31.834972,21.147828,-2.265808,7.874614,6.549472,-9.800130,-6.651733,9.947012,-5.723910
1,-12.589461,25.545640,-15.816012,26.860682,-10.477455,22.429361,23.228674,-4.647014,-6.407121,-8.378884
2,-13.950616,22.014392,-12.198522,-6.946419,-20.190097,0.439651,-0.471627,5.021278,3.293213,6.825336
3,-12.556973,25.185864,-14.387661,-2.665712,2.356831,-7.263419,2.959326,0.722293,10.649601,2.990866
4,-16.632710,29.480782,-28.995224,1.452993,-8.424296,-11.953152,5.852012,-0.776629,6.141423,16.365050
...,...,...,...,...,...,...,...,...,...,...
657,41.075768,-0.072905,-6.578128,2.876002,-4.220911,11.875701,-20.575257,-11.931971,-7.107876,5.757616
658,-2.517458,-26.856082,-2.615557,3.594902,5.832020,-11.472854,4.784765,-4.385503,-5.378886,4.971636
659,45.719007,-7.025276,-6.094818,0.057548,-8.080976,-4.488255,12.964249,9.906298,-11.713461,9.307773
660,69.819871,-7.223465,-8.339740,-9.118115,-0.025863,-7.833246,14.318985,20.069165,6.245146,-10.793403


In [19]:
# Put the COSMIC.ID column in front of the Q3-based principal components.
# The column-wise concat aligns the two objects on their row index.
cosmic_id = df_exp.loc[:, 'COSMIC.ID']
df_exp_features2 = pd.concat(objs=[cosmic_id, df_pc2], axis="columns")
df_exp_features2

Unnamed: 0,COSMIC.ID,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10
0,684072,10.372431,31.834972,21.147828,-2.265808,7.874614,6.549472,-9.800130,-6.651733,9.947012,-5.723910
1,687448,-12.589461,25.545640,-15.816012,26.860682,-10.477455,22.429361,23.228674,-4.647014,-6.407121,-8.378884
2,687562,-13.950616,22.014392,-12.198522,-6.946419,-20.190097,0.439651,-0.471627,5.021278,3.293213,6.825336
3,687568,-12.556973,25.185864,-14.387661,-2.665712,2.356831,-7.263419,2.959326,0.722293,10.649601,2.990866
4,687590,-16.632710,29.480782,-28.995224,1.452993,-8.424296,-11.953152,5.852012,-0.776629,6.141423,16.365050
...,...,...,...,...,...,...,...,...,...,...,...
657,1330948,41.075768,-0.072905,-6.578128,2.876002,-4.220911,11.875701,-20.575257,-11.931971,-7.107876,5.757616
658,1503364,-2.517458,-26.856082,-2.615557,3.594902,5.832020,-11.472854,4.784765,-4.385503,-5.378886,4.971636
659,1659818,45.719007,-7.025276,-6.094818,0.057548,-8.080976,-4.488255,12.964249,9.906298,-11.713461,9.307773
660,1659819,69.819871,-7.223465,-8.339740,-9.118115,-0.025863,-7.833246,14.318985,20.069165,6.245146,-10.793403


In [28]:
# Write the Q3-filtered PCA features to CSV without the row index.
df_exp_features2.to_csv(path_or_buf='gene_exp_q3_Oct13.csv', index=False)