# TCGA DATA ANALYIS AND MODEL BUILDING FOR PREDICTION

# import libraies

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, precision_recall_curve, roc_curve
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import seaborn as sns

# Load data

In [62]:
data = pd.read_csv('tcga_data.csv')
clinical = pd.read_csv('tcga_clinicall.csv')

# Preprocess data

In [None]:
labels = clinical[['PATIENT_ID', 'CANCER_STATUS']]
labels.dropna()
labels['PATIENT_ID'].dropna()
labels['PATIENT_ID'].drop_duplicates()

# Check Overlap

In [64]:
print("Overlap with labels:", len(set(data['PATIENT_ID']).intersection(labels['PATIENT_ID'])))

Overlap with labels: 89


# Save label file

In [65]:
labels.to_csv('tcga_labels.csv', index=False)

# Import new files

In [66]:
data = pd.read_csv('tcga_data.csv')
label = pd.read_csv('tcga_labels.csv')

# Check imported datas

In [None]:
data.head()
label.head()

# filter for overlapping PATIENT IDS

In [68]:
overlapping_ids= set(data['PATIENT_ID']).intersection(set(label['PATIENT_ID']))
tcga_data = data[data['PATIENT_ID'].isin(overlapping_ids)]
label = label[label['PATIENT_ID'].isin(overlapping_ids)]
print(f"Overlapping patients: {len(overlapping_ids)}")

Overlapping patients: 89


# Feature Engineering - Aggregate per patient

In [None]:
# Example: Aggregate 'seg_mean' (add more columns if available, e.g., mean_num_probes = df.groupby('patient_id')['num_probes'].mean())
tcga_data['amplified'] = (tcga_data['seg.mean'] > 0.2).astype(int)
aggregated = tcga_data.groupby('PATIENT_ID').agg(
    mean_seg_mean=('seg.mean', 'mean'),
    std_seg_mean=('seg.mean','std'),
    min_seg_mean=('seg.mean', 'min'),
    max_seg_mean=('seg.mean', 'max'),
    count_segments=('seg.mean', 'count')
).reset_index()

# pivot to create one columns per chromosome

In [None]:
chrom_features = chrom_agg.pivot(index='PATIENT_ID', columns= 'chrom', values=['mean_seg_mean', 'std_seg_mean','count_segments'])

# Handle NaN in std (if patient has only 1 segment)

In [84]:
aggregated['std_seg_mean']=aggregated['std_seg_mean'].fillna(0)

# Step 4: Merge with labels

In [85]:
data = pd.merge(aggregated, labels, on='PATIENT_ID')



# label encoder

In [86]:
le = LabelEncoder()
data['cancer_status_encoded'] = le.fit_transform(data['CANCER_STATUS'])

# Save merged data

In [87]:
data.to_csv('merge_data.csv', index=False)

# define X and y variables

In [88]:
X = data.drop(['PATIENT_ID', 'CANCER_STATUS','cancer_status_encoded'], axis=1)
y= data['cancer_status_encoded']

# Step 5: Train-Test Split (stratify to maintain class balance)

In [89]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train Random Forest Classifier

In [90]:
model= RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions and Evaluation

In [91]:
y_pred =model.predict(X_test)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.83
Precision: 0.80
Recall: 0.89
F1-Score: 0.84

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.78      0.82         9
           1       0.80      0.89      0.84         9

    accuracy                           0.83        18
   macro avg       0.84      0.83      0.83        18
weighted avg       0.84      0.83      0.83        18



# Add cross validation

In [94]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
cv = StratifiedKFold(n_splits=5,shuffle=True, random_state=42)
scores = cross_val_score(model, X,  y, cv=5)
print(f"Cross Validation: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")


Cross Validation: 0.6856 (+/- 0.1102)


# Confusion Matrix

In [47]:

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Cancer', 'Cancer'], yticklabels=['Non-Cancer', 'Cancer'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig('Confusion_matrix.png')
plt.close()

# AUC SCORE

In [49]:
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_pred_proba)
print(f"Test ROC-AUC: {auc:.3f}")

Test ROC-AUC: 0.864


# Feature Importances

In [55]:
importances = model.feature_importances_
features = X.columns
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(16, 10))
plt.title("Feature Importances")
plt.bar(range(X.shape[1]), importances[indices], align="center")
plt.xticks(range(X.shape[1]), [features[i] for i in indices], rotation=45)
plt.savefig('feature_importance.png')
plt.close()

#