In [2]:
import pandas as pd
import numpy as np
import pubchempy as pcp
from sklearn.decomposition import PCA
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import AllChem

# Session-wide pandas setting: turn off the chained-assignment check so that
# pandas neither warns nor raises on chained assignments in the cells below.
pd.set_option('mode.chained_assignment',  None) # suppress the chained-assignment warning

In [3]:
# Directory holding the input tables, and the expression CSV inside it.
data_root = Path("./input_data")
exp_file = data_root / "exp_filtered.csv"

# Echo the resolved relative path so the reader can see which file is loaded.
print(exp_file)


input_data/exp_filtered.csv


In [4]:
# Sanity check before loading: prints True when exp_file exists and is a regular file.
print(exp_file.is_file())

True


In [5]:
# Read the CSV at exp_file into a DataFrame (default parsing options).
df_exp = pd.read_csv(exp_file)

In [6]:
# Display the loaded frame: the first column is COSMIC.ID, the remaining
# columns are named after genes (TSPAN6, TNMD, ...).
df_exp

Unnamed: 0,COSMIC.ID,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,...,H3C2,H3C3,AC098582.1,DUS4L.BCAP29,C8orf44.SGK3,ELOA3B,NPBWR1,ELOA3D,ELOA3,CDR1
0,684072,4.327687,0.070389,5.979339,2.906891,4.904484,0.263034,2.235727,0.422233,4.433627,...,1.416840,2.000000,0.042644,1.959770,0.432959,0.070389,0.042644,0.378512,0.070389,0.000000
1,687448,3.266037,0.000000,6.096979,2.521051,3.040892,0.000000,0.831877,6.576069,4.657068,...,0.333424,0.879706,0.839960,2.384050,0.000000,0.028569,0.163499,0.000000,0.028569,0.000000
2,687562,4.374344,0.000000,6.963821,2.292782,4.001802,0.495695,2.729009,6.396947,5.355792,...,2.056584,0.000000,0.333424,2.792855,0.584963,0.000000,0.238787,0.070389,0.000000,0.028569
3,687568,3.477677,0.000000,6.762615,2.107688,4.371559,0.028569,1.321928,5.440288,4.072963,...,1.910733,1.035624,0.367371,1.673556,0.097611,0.000000,0.042644,0.124328,0.000000,0.389567
4,687590,3.244887,0.000000,7.124535,1.933573,3.493135,0.014355,3.168321,6.090642,4.505891,...,1.695994,1.613532,0.594549,4.015248,0.286881,0.000000,0.028569,0.000000,0.000000,0.411426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
657,1330948,0.176323,0.000000,6.348197,2.280956,3.636915,1.831877,5.785551,5.497293,6.055065,...,1.937344,0.432959,0.454176,2.920293,0.189034,0.042644,0.014355,0.042644,0.042644,0.000000
658,1503364,4.585563,0.000000,7.524973,2.211012,2.786596,0.000000,0.310340,5.228819,4.791293,...,1.144046,0.782409,0.263034,1.220330,0.111031,0.000000,0.000000,0.000000,0.000000,0.000000
659,1659818,0.137504,0.000000,6.757423,3.176323,5.111449,0.097611,0.189034,3.688180,4.213347,...,0.650765,1.422233,0.604071,2.223423,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
660,1659819,0.000000,0.000000,7.059939,2.761285,4.144862,0.443607,0.000000,0.056584,4.491853,...,1.443607,1.906891,1.070389,1.448901,0.641546,0.000000,0.000000,0.000000,0.000000,0.000000


In [7]:
# Standard deviation of every column after the first one (COSMIC.ID), i.e. one
# value per gene column. The slice is open-ended rather than capped at a
# hard-coded column count, so the cell keeps working if the input gains or
# loses columns. The variable name is kept because later cells read it.
std_2_to_19194 = df_exp.iloc[:, 1:].std()

# The upper bound in the message is derived from the frame instead of being
# typed in by hand; for the current input it still prints 19194.
print(f"2열부터 {df_exp.shape[1]}열까지의 열의 표준편차:")
print(std_2_to_19194)

2열부터 19194열까지의 열의 표준편차:
TSPAN6      1.636366
TNMD        0.226464
DPM1        0.610333
SCYL3       0.567319
C1orf112    0.690382
              ...   
ELOA3B      0.065843
NPBWR1      0.659847
ELOA3D      0.069546
ELOA3       0.097073
CDR1        0.144252
Length: 19193, dtype: float64


In [8]:
# Quartiles (25 %, 50 %, 75 %) of the per-column standard deviations,
# obtained with one vectorised quantile call and unpacked into three scalars.
q1, median, q3 = std_2_to_19194.quantile([0.25, 0.5, 0.75]).to_numpy()

# Report the three cut points.
print("표준편차의 1사분위수:", q1)
print("표준편차의 중앙값:", median)
print("표준편차의 3사분위수:", q3)

표준편차의 1사분위수: 0.5760327332808185
표준편차의 중앙값: 0.748986884325046
표준편차의 3사분위수: 1.1204030393247908


In [9]:
# Labels of the columns whose standard deviation is at least the median
# standard deviation (the more variable half of the columns).
selected_columns = std_2_to_19194[std_2_to_19194.ge(std_2_to_19194.median())].index

# Data frame restricted to the selected columns
filtered_data = df_exp.loc[:, selected_columns]

# Show the result
print(filtered_data)


       TSPAN6       FGR       CFH     FUCA2      GCLC     STPG1    NIPAL3  \
0    4.327687  0.263034  2.235727  0.422233  4.433627  1.516015  2.759156   
1    3.266037  0.000000  0.831877  6.576069  4.657068  2.545968  4.564988   
2    4.374344  0.495695  2.729009  6.396947  5.355792  2.790772  4.251719   
3    3.477677  0.028569  1.321928  5.440288  4.072963  3.137504  4.774524   
4    3.244887  0.014355  3.168321  6.090642  4.505891  2.469886  4.402586   
..        ...       ...       ...       ...       ...       ...       ...   
657  0.176323  1.831877  5.785551  5.497293  6.055065  2.684819  3.663345   
658  4.585563  0.000000  0.310340  5.228819  4.791293  2.837943  2.972693   
659  0.137504  0.097611  0.189034  3.688180  4.213347  1.981853  4.593354   
660  0.000000  0.443607  0.000000  0.056584  4.491853  2.419539  2.684819   
661  6.345361  0.042644  0.014355  5.785289  3.060047  4.023255  2.629939   

        ENPP4    SEMA3F      CFTR  ...   UPK3BL2  AC093512.2  ARHGAP11B  \


In [10]:
from sklearn.manifold import TSNE

# Input matrix: the columns kept by the median standard-deviation filter
# (continuous values, one row per COSMIC.ID).
X = filtered_data

# 2-D t-SNE model. t-SNE is stochastic, so random_state is pinned; without it
# every run produces a different embedding and the exported files cannot be
# regenerated.
tsne = TSNE(n_components=2, random_state=42)

# Fit the model and project every row into the 2-D embedding space.
# The result keeps the name `mut_embedded` because the next cell reads it.
mut_embedded = tsne.fit_transform(X)

In [11]:
# t-SNE output: 2-D array shaped (n_samples, n_components)
X_embedded = mut_embedded

# Wrap the two embedding axes in a DataFrame with named columns
df_tSNE = pd.DataFrame(X_embedded, columns=['Dimension 1', 'Dimension 2'])

# Inspect the frame: first five rows, then the (truncated) full frame
print(df_tSNE.head(), df_tSNE, sep="\n")

   Dimension 1  Dimension 2
0   -14.199987    17.375065
1     9.457047    29.251619
2     8.317383    14.582255
3     6.108467    20.335836
4     3.780310    17.295738
     Dimension 1  Dimension 2
0     -14.199987    17.375065
1       9.457047    29.251619
2       8.317383    14.582255
3       6.108467    20.335836
4       3.780310    17.295738
..           ...          ...
657    35.353970    -3.838055
658   -11.111899   -13.941449
659    43.389538   -10.428765
660    47.286507    -4.652557
661   -14.589652    10.777872

[662 rows x 2 columns]


In [12]:
# Place the COSMIC.ID column beside the two t-SNE coordinates; concat aligns
# the pieces on their row index.
cosmic_id = df_exp.loc[:, 'COSMIC.ID']
df_exp_features = pd.concat((cosmic_id, df_tSNE), axis="columns")
df_exp_features

Unnamed: 0,COSMIC.ID,Dimension 1,Dimension 2
0,684072,-14.199987,17.375065
1,687448,9.457047,29.251619
2,687562,8.317383,14.582255
3,687568,6.108467,20.335836
4,687590,3.780310,17.295738
...,...,...,...
657,1330948,35.353970,-3.838055
658,1503364,-11.111899,-13.941449
659,1659818,43.389538,-10.428765
660,1659819,47.286507,-4.652557


In [13]:
# Export COSMIC.ID plus the two t-SNE coordinates as CSV in the working
# directory; index=False leaves the row index out of the file.
df_exp_features.to_csv('gene_exp_median_tSNE_Oct18.csv', index=False)

In [14]:
# Write the feature table as tab-separated text (header line + one line per row).
# A plain `for row in df_exp_features` loop iterates the column labels, not the
# rows, and joining each label character by character produced a file with no
# data in it. to_csv with a tab separator writes the actual rows; index=False
# keeps the row index out, matching the CSV export.
df_exp_features.to_csv("gene_exp_MEDIAN_tSNE.txt", sep="\t", index=False)

In [22]:
# Load the exported CSV as plain text
with open("/home/jiseo/week4/tSNE/gene_exp_median_tSNE_Oct18.csv") as src:
    csv_contents = src.read()

# Store the same text, unchanged, under a .txt name
with open("gene_exp_median_tSNE.txt", "w") as dst:
    dst.write(csv_contents)



In [6]:
# Load the text export
with open("/home/jiseo/week4/data/oct_19_txt/tSNE/exp_median.txt") as src:
    txt_contents = src.read()

# Turn every comma into a tab character and keep the converted text
txt_contents_with_tabs = "\t".join(txt_contents.split(","))

# Persist the tab-delimited text
with open("tsne_median_이거.txt", "w") as dst:
    dst.write(txt_contents_with_tabs)

## 3사분위수(Q3)로 진행

In [15]:
# Labels of the columns whose standard deviation is at least the 75th
# percentile of all standard deviations (the most variable quarter).
# Note: this rebinds `selected_columns` from the median-based selection.
selected_columns = std_2_to_19194[std_2_to_19194.ge(std_2_to_19194.quantile(0.75))].index

# Data frame restricted to the selected columns
filtered_data2 = df_exp.loc[:, selected_columns]

# Show the result
print(filtered_data2)

       TSPAN6       FGR       CFH     FUCA2      GCLC     ENPP4    SEMA3F  \
0    4.327687  0.263034  2.235727  0.422233  4.433627  1.189034  3.381283   
1    3.266037  0.000000  0.831877  6.576069  4.657068  0.163499  1.536053   
2    4.374344  0.495695  2.729009  6.396947  5.355792  3.510962  2.017922   
3    3.477677  0.028569  1.321928  5.440288  4.072963  0.042644  0.565597   
4    3.244887  0.014355  3.168321  6.090642  4.505891  2.521051  0.176323   
..        ...       ...       ...       ...       ...       ...       ...   
657  0.176323  1.831877  5.785551  5.497293  6.055065  2.849999  0.555816   
658  4.585563  0.000000  0.310340  5.228819  4.791293  2.659925  0.704872   
659  0.137504  0.097611  0.189034  3.688180  4.213347  2.153805  0.163499   
660  0.000000  0.443607  0.000000  0.056584  4.491853  3.129283  0.226509   
661  6.345361  0.042644  0.014355  5.785289  3.060047  1.859970  0.604071   

         CFTR      CD99      AOC1  ...      NEFL      CCL3    ZNF229  \
0  

In [16]:
from sklearn.manifold import TSNE

# Input matrix: the columns kept by the third-quartile standard-deviation
# filter (continuous values, one row per COSMIC.ID).
X = filtered_data2

# 2-D t-SNE model. t-SNE is stochastic, so random_state is pinned; without it
# every run produces a different embedding and the exported files cannot be
# regenerated.
tsne2 = TSNE(n_components=2, random_state=42)

# Fit the model and project every row into the 2-D embedding space.
# The result keeps the name `mut_embedded` because the next cell reads it.
mut_embedded = tsne2.fit_transform(X)

In [17]:
# t-SNE output: 2-D array shaped (n_samples, n_components)
X_embedded = mut_embedded

# Wrap the two embedding axes in a DataFrame with named columns
df_tSNE2 = pd.DataFrame(X_embedded, columns=['Dimension 1', 'Dimension 2'])

# Inspect the frame: first five rows, then the (truncated) full frame
print(df_tSNE2.head(), df_tSNE2, sep="\n")

   Dimension 1  Dimension 2
0     5.305734    16.433502
1    -7.580318    34.335579
2   -13.813935    20.623636
3    -8.432288    23.311266
4   -10.059759    18.943550
     Dimension 1  Dimension 2
0       5.305734    16.433502
1      -7.580318    34.335579
2     -13.813935    20.623636
3      -8.432288    23.311266
4     -10.059759    18.943550
..           ...          ...
657    36.931168     0.727288
658    -5.220496   -13.641965
659    41.367359    -8.591474
660    46.530201    -6.640215
661     6.228158    12.029013

[662 rows x 2 columns]


In [18]:
# Place the COSMIC.ID column beside the two t-SNE coordinates of the
# third-quartile run; concat aligns the pieces on their row index.
cosmic_id = df_exp.loc[:, 'COSMIC.ID']
df_exp_features2 = pd.concat((cosmic_id, df_tSNE2), axis="columns")
df_exp_features2

Unnamed: 0,COSMIC.ID,Dimension 1,Dimension 2
0,684072,5.305734,16.433502
1,687448,-7.580318,34.335579
2,687562,-13.813935,20.623636
3,687568,-8.432288,23.311266
4,687590,-10.059759,18.943550
...,...,...,...
657,1330948,36.931168,0.727288
658,1503364,-5.220496,-13.641965
659,1659818,41.367359,-8.591474
660,1659819,46.530201,-6.640215


In [19]:
# Export COSMIC.ID plus the two t-SNE coordinates of the third-quartile run as
# CSV in the working directory; index=False leaves the row index out of the file.
df_exp_features2.to_csv('gene_exp_q3_tSNE_Oct18.csv', index=False)

In [20]:
# Write the feature table as tab-separated text (header line + one line per row).
# A plain `for row in df_exp_features2` loop iterates the column labels, not the
# rows, and joining each label character by character produced a file with no
# data in it. to_csv with a tab separator writes the actual rows; index=False
# keeps the row index out, matching the CSV export.
df_exp_features2.to_csv("gene_exp_q3_tSNE.txt", sep="\t", index=False)

In [21]:
# Load the exported CSV as plain text
with open("/home/jiseo/week4/tSNE/gene_exp_q3_tSNE_Oct18.csv") as src:
    csv_contents = src.read()

# Store the same text, unchanged, under a .txt name
with open("gene_exp_q3_tSNE.txt", "w") as dst:
    dst.write(csv_contents)


In [3]:
# Load the text export
with open("/home/jiseo/week4/data/oct_19_txt/tSNE/exp_q3.txt") as src:
    txt_contents = src.read()

# Drop every comma from the text and keep the result.
# NOTE(review): if the input is comma-separated, removing the delimiter outright
# leaves neighbouring fields fused together -- confirm this output is wanted.
txt_contents_without_commas = "".join(txt_contents.split(","))

# Persist the comma-free text
with open("tsne_q3.txt", "w") as dst:
    dst.write(txt_contents_without_commas)


In [5]:
# Load the text export
with open("/home/jiseo/week4/data/oct_19_txt/tSNE/exp_q3.txt") as src:
    txt_contents = src.read()

# Turn every comma into a tab character and keep the converted text
txt_contents_with_tabs = "\t".join(txt_contents.split(","))

# Persist the tab-delimited text
with open("tsne_q3_이거.txt", "w") as dst:
    dst.write(txt_contents_with_tabs)