<a href="https://colab.research.google.com/github/JinLeeGG/Survival-Prediction-Model-for-AML-using-Gene-Expression-Data-from-TCGA/blob/main/Machine_Learning_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Building a Machine Learning Model that predicts patient's status based on RNA Expression data




In [68]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [69]:
merged_df = pd.read_csv('/content/drive/MyDrive/Acute Myeloid Leukemia (TCGA, PanCancer Atlas)/Processed_Data/merged_df.csv')
merged_df.dropna(subset=['Observation Period'], inplace=True)
merged_df

Unnamed: 0,bcr_patient_barcode,Status,Observation Period,A1BG-AS|503538,A1BG|1,A1CF|29974,A2LD1|87769,A2ML1|144568,A2M|2,A4GALT|53947,...,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009,psiTPTE22|387590,tAKR|389932
0,TCGA-AB-2803,1,792.0,792.14,1139.18,0.00,194.50,24.36,982.14,24.98,...,555.04,67.00,795.76,3093.76,1114.18,9613.40,5332.46,2452.22,33.00,7.78
1,TCGA-AB-2805,1,576.0,429.64,403.44,0.00,227.10,33.66,193.26,5.00,...,1360.22,41.88,912.38,5481.82,3834.64,18642.30,12197.30,3494.92,19.22,26.34
2,TCGA-AB-2806,1,944.0,891.18,1004.70,0.00,179.84,45.82,129.92,55.72,...,1623.44,231.76,2251.04,6184.50,1696.52,18565.60,12208.00,6213.06,332.12,1.00
3,TCGA-AB-2807,1,180.0,1095.44,1121.68,1.00,111.06,11.08,884.28,272.40,...,1575.48,283.66,1559.34,2978.60,1990.44,7733.44,9364.42,2986.18,51.88,22.28
4,TCGA-AB-2808,0,2861.0,570.74,531.26,0.00,123.08,21.64,757.42,537.62,...,2168.70,106.86,1111.84,3922.22,2723.36,10197.40,8040.82,3697.18,47.70,5.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174,TCGA-AB-3007,0,1581.0,1561.60,1503.12,0.00,321.06,18.34,201.94,776.14,...,1480.26,288.32,1284.20,6217.06,1926.22,6308.06,12576.40,3929.76,30.68,24.38
175,TCGA-AB-3008,1,822.0,1052.62,824.12,0.00,113.20,73.24,2400.86,520.90,...,3793.34,349.88,1349.90,5590.80,2401.34,16703.00,12810.40,3548.82,155.78,0.00
176,TCGA-AB-3009,1,576.0,489.64,514.78,0.00,506.60,229.24,648.56,38.02,...,1133.66,212.18,1078.90,5370.92,2515.52,23951.40,12152.40,3942.60,1493.54,1.00
177,TCGA-AB-3011,0,1885.0,899.50,736.42,0.00,93.70,20.36,162.48,60.20,...,1464.88,87.22,699.92,5071.14,1535.64,9142.90,9567.60,3060.22,47.78,1.04


In [70]:
# Define the target variable
y = merged_df['Status']

# Define features by dropping the non-gene and target columns
X = merged_df.drop(columns=['bcr_patient_barcode', 'Status', 'Observation Period'])

# Display the shapes to verify
print("Features shape (X):", X.shape)
print("Target shape (y):", y.shape)

Features shape (X): (167, 20319)
Target shape (y): (167,)


In [71]:
from sklearn.model_selection import train_test_split

# Split the data, keeping the proportion of classes the same (stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2025, stratify=y
)

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize and train the model
log_reg = LogisticRegression(random_state=2025, max_iter=1000)
log_reg.fit(X_train, y_train)

# Make predictions
y_pred_lr = log_reg.predict(X_test)

# Evaluate the model
print("--- Logistic Regression Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.2f}")
print(classification_report(y_test, y_pred_lr))

--- Logistic Regression Results ---
Accuracy: 0.56
              precision    recall  f1-score   support

           0       0.41      0.58      0.48        12
           1       0.71      0.55      0.62        22

    accuracy                           0.56        34
   macro avg       0.56      0.56      0.55        34
weighted avg       0.60      0.56      0.57        34



In [73]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the model
rand_forest = RandomForestClassifier(random_state=2025, n_estimators=100)
rand_forest.fit(X_train, y_train)

# Make predictions
y_pred_rf = rand_forest.predict(X_test)

# Evaluate the model
print("--- Random Forest Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print(classification_report(y_test, y_pred_rf))

--- Random Forest Results ---
Accuracy: 0.65
              precision    recall  f1-score   support

           0       0.50      0.33      0.40        12
           1       0.69      0.82      0.75        22

    accuracy                           0.65        34
   macro avg       0.60      0.58      0.57        34
weighted avg       0.62      0.65      0.63        34



In [74]:
!pip install lifelines



In [75]:
# 필요한 라이브러리를 불러옵니다.
from lifelines.statistics import logrank_test
import pandas as pd
from tqdm import tqdm # 루프 진행 상황을 시각적으로 보여주는 라이브러리

# --- 데이터 준비 ---
# 'merged_df'는 임상 정보와 유전자 발현량이 합쳐진 데이터프레임입니다.

# 1. 특성(X) 데이터 준비: merged_df에서 유전자 발현량에 해당하지 않는 컬럼들을 제외합니다.
X_full = merged_df.drop(columns=['bcr_patient_barcode', 'Status', 'Observation Period'])

# 2. 생존 정보(시간, 이벤트)가 담긴 데이터프레임을 지정합니다.
df_survival_final = merged_df

# 결과를 저장할 리스트를 초기화합니다.
results = []

# tqdm을 사용하여 전체 유전자에 대한 반복문의 진행 상황을 확인합니다.
for gene in tqdm(X_full.columns, desc="모든 유전자 분석 중 (Log-rank)"):
    try:
        # 현재 유전자의 발현량 데이터
        gene_expression = X_full[gene]

        # 중앙값(median)을 기준으로 고발현/저발현 그룹을 나눕니다.
        median_expression = gene_expression.median()

        # 만약 중앙값이 0이면 (대부분의 값이 0인 경우), 분석에서 제외하고 다음 유전자로 넘어갑니다.
        if median_expression == 0:
            continue

        # 그룹 필터 생성
        high_group_filter = gene_expression >= median_expression
        low_group_filter = gene_expression < median_expression

        # 로그-순위 검정 수행 (사용자 데이터의 컬럼명으로 수정)
        result = logrank_test(
            durations_A=df_survival_final.loc[high_group_filter, 'Observation Period'],
            durations_B=df_survival_final.loc[low_group_filter, 'Observation Period'],
            event_observed_A=df_survival_final.loc[high_group_filter, 'Status'],
            event_observed_B=df_survival_final.loc[low_group_filter, 'Status']
        )

        # 결과(유전자 이름, p-value)를 리스트에 저장합니다.
        results.append({'gene': gene, 'p_value': result.p_value})

    except Exception as e:
        # 분석 중 에러가 발생하더라도 전체 과정이 멈추지 않도록 처리합니다.
        # print(f"Error analyzing {gene}: {e}") # 에러 원인을 확인하고 싶을 때 주석 해제
        continue

# 결과를 데이터프레임으로 변환합니다.
results_df = pd.DataFrame(results)

# p-value가 0.05 미만인 유의미한 결과만 필터링하고, p-value가 낮은 순으로 정렬합니다.
significant_genes_logrank = results_df[results_df['p_value'] < 0.05].sort_values(by='p_value')

# 최종 결과 출력
print("\n--- 생존율과 유의미한 관계가 있는 유전자 목록 (상위 20개) ---")
print(significant_genes_logrank.head(20))


모든 유전자 분석 중 (Log-rank): 100%|██████████| 20319/20319 [07:35<00:00, 44.65it/s]



--- 생존율과 유의미한 관계가 있는 유전자 목록 (상위 20개) ---
                   gene       p_value
10432       PARP3|10039  1.985124e-08
5479          FIBP|9158  2.350450e-07
10931      PLA2G4A|5321  1.101993e-06
15238     TOMM40L|84134  1.322534e-06
8331       LPCAT3|10162  1.405197e-06
15357      TREML2|79865  2.198326e-06
8630       MAP4K1|11184  2.424977e-06
2755          CCND3|896  2.529626e-06
1061      ATP13A2|23400  2.575356e-06
12172     RHOBTB2|23221  2.832890e-06
9349      MYBPHL|343263  2.916382e-06
3265         CLCN5|1184  3.481599e-06
4004        DCTN2|10540  3.768060e-06
14449       SYTL4|94121  3.930102e-06
1283        BCKDK|10295  7.396359e-06
4201       DIRC3|729582  8.489778e-06
12489      RPS6KA1|6195  8.825879e-06
12212       RINL|126432  9.030160e-06
10587        PDE3B|5140  9.804775e-06
1502   C10orf128|170371  1.325526e-05


In [76]:
print(len(significant_genes_logrank))

2825


In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# \--- 1. 데이터 준비 ---



selected_genes = significant_genes_logrank['gene'].tolist()

# 선택된 120개 유전자로 특성(X) 데이터셋을 구성합니다.

X = merged_df[selected_genes]

# 예측할 대상(y)인 생존 상태('Status')를 지정합니다.

y = merged_df['Status']

# \--- 2. 훈련 및 테스트 데이터 분리 ---

# 데이터를 훈련용(80%)과 테스트용(20%)으로 분리합니다.

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2025, stratify=y
)

# \--- 3. 모델 훈련 및 평가 ---

# 랜덤 포레스트 모델을 초기화하고 훈련합니다.

rf_model = RandomForestClassifier(n_estimators=100, random_state=2025, class_weight='balanced')
rf_model.fit(X_train, y_train)

# 테스트 데이터로 예측을 수행합니다.

y_pred = rf_model.predict(X_test)

# 최종 모델 성능을 출력합니다.

print("--- 🌳 상위 120개 유전자 기반 랜덤 포레스트 모델 성능 ---")
print(f"정확도(Accuracy): {accuracy_score(y_test, y_pred):.3f}")
print(classification_report(y_test, y_pred))

--- 🌳 상위 120개 유전자 기반 랜덤 포레스트 모델 성능 ---
정확도(Accuracy): 0.676
              precision    recall  f1-score   support

           0       0.55      0.50      0.52        12
           1       0.74      0.77      0.76        22

    accuracy                           0.68        34
   macro avg       0.64      0.64      0.64        34
weighted avg       0.67      0.68      0.67        34

