# Importing Libraries and Data


In [None]:
# Data Processing & Modelling Libraries
import ast

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split, GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

In [None]:
# Group-aware train / validation / test split, exported to CSV
# Read the processed dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/processed_data.csv')

# Target, predictors, and the grouping key (gene_id) that the splitter keeps together
y = data['label']
X = data.drop(columns=['label'])
groups = data['gene_id']

# First cut: 70% train / 30% temporary hold-out.
# n_splits=1, so the splitter yields exactly one (train, temp) index pair.
gss = GroupShuffleSplit(n_splits=1, train_size=0.7, test_size=0.3, random_state=42)
train_idx, temp_idx = next(gss.split(X, y, groups))
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_temp, y_temp = X.iloc[temp_idx], y.iloc[temp_idx]
groups_temp = groups.iloc[temp_idx]

# Second cut: divide the hold-out evenly into validation and test, again by group
gss_val_test = GroupShuffleSplit(n_splits=1, train_size=0.5, test_size=0.5, random_state=42)
val_idx, test_idx = next(gss_val_test.split(X_temp, y_temp, groups_temp))
X_val, y_val = X_temp.iloc[val_idx], y_temp.iloc[val_idx]
X_test, y_test = X_temp.iloc[test_idx], y_temp.iloc[test_idx]

# Re-attach the label as the last column of each partition for export
train_data, val_data, test_data = (
    pd.concat([features, target], axis=1)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Write each partition to Drive without the index column
for frame, csv_path in (
    (train_data, '/content/drive/MyDrive/Colab Notebooks/data/train_data.csv'),
    (val_data, '/content/drive/MyDrive/Colab Notebooks/data/val_data.csv'),
    (test_data, '/content/drive/MyDrive/Colab Notebooks/data/test_data.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Preview the first rows of the full processed dataset loaded above
data.head()

Unnamed: 0,ENST_ID,Position,Key,gene_id,transcript_id,transcript_position,label,mean_Value_1,mean_Value_2,mean_Value_3,...,std_Value_2,std_Value_3,std_Value_4,std_Value_5,std_Value_6,std_Value_7,std_Value_8,std_Value_9,transcript_id_encoded,Key_encoded
0,ENST00000514332,579,CAAACAA,ENSG00000169045,ENST00000514332,579,0,0.007097,2.38975,104.571429,...,0.612728,1.859888,0.003986,0.901436,2.426028,0.0028,0.917633,2.698873,4374,72
1,ENST00000374902,2482,CTGACAG,ENSG00000136891,ENST00000374902,2482,0,0.00756,3.764643,106.964286,...,1.592051,2.471005,0.002169,3.209042,4.33469,0.004028,0.875664,2.878208,2968,134
2,ENST00000217026,1738,CAGACCC,ENSG00000101057,ENST00000217026,1738,0,0.009036,8.26,108.836538,...,1.273614,5.080393,0.005056,1.79761,2.087137,0.002622,1.798742,2.469029,151,89
3,ENST00000394803,1321,AGAACAG,ENSG00000109332,ENST00000394803,1321,0,0.008855,7.054,128.133333,...,2.591058,3.547143,0.00367,1.778136,3.732446,0.002679,1.131974,2.297042,3364,26
4,ENST00000249014,598,GGGACAC,ENSG00000128283,ENST00000249014,598,0,0.009415,4.048154,117.128205,...,1.761297,3.155618,0.008608,3.041126,4.770816,0.004297,1.206101,2.725645,474,181


In [None]:
# Preview the training partition; the label was concatenated back on as the last column
train_data.head()

Unnamed: 0,ENST_ID,Position,Key,gene_id,transcript_id,transcript_position,mean_Value_1,mean_Value_2,mean_Value_3,mean_Value_4,...,std_Value_3,std_Value_4,std_Value_5,std_Value_6,std_Value_7,std_Value_8,std_Value_9,transcript_id_encoded,Key_encoded,label
0,ENST00000514332,579,CAAACAA,ENSG00000169045,ENST00000514332,579,0.007097,2.38975,104.571429,0.006643,...,1.859888,0.003986,0.901436,2.426028,0.0028,0.917633,2.698873,4374,72,0
2,ENST00000217026,1738,CAGACCC,ENSG00000101057,ENST00000217026,1738,0.009036,8.26,108.836538,0.01148,...,5.080393,0.005056,1.79761,2.087137,0.002622,1.798742,2.469029,151,89,0
3,ENST00000394803,1321,AGAACAG,ENSG00000109332,ENST00000394803,1321,0.008855,7.054,128.133333,0.008535,...,3.547143,0.00367,1.778136,3.732446,0.002679,1.131974,2.297042,3364,26,0
4,ENST00000249014,598,GGGACAC,ENSG00000128283,ENST00000249014,598,0.009415,4.048154,117.128205,0.00971,...,3.155618,0.008608,3.041126,4.770816,0.004297,1.206101,2.725645,474,181,0
6,ENST00000618966,839,CTAACAA,ENSG00000275216,ENST00000618966,839,0.008583,1.840789,90.74,0.008351,...,2.239583,0.005836,0.505871,1.91223,0.004654,0.758229,1.876533,5192,120,0


# Declare feature vector and target variable

In [None]:
# Separate each partition into model inputs (X) and target (y).
# The label plus the raw identifier / string columns are excluded from the inputs.
non_feature_cols = ['label', 'gene_id', 'transcript_id', 'Key', 'ENST_ID']

X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['label']
X_val, y_val = val_data.drop(columns=non_feature_cols), val_data['label']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['label']

In [None]:
# Sanity check: inspect the feature columns that remain after dropping
# the label and identifier columns
X_train.head()

Unnamed: 0,Position,transcript_position,mean_Value_1,mean_Value_2,mean_Value_3,mean_Value_4,mean_Value_5,mean_Value_6,mean_Value_7,mean_Value_8,...,std_Value_2,std_Value_3,std_Value_4,std_Value_5,std_Value_6,std_Value_7,std_Value_8,std_Value_9,transcript_id_encoded,Key_encoded
0,579,579,0.007097,2.38975,104.571429,0.006643,2.526429,98.371429,0.005408,2.419286,...,0.612728,1.859888,0.003986,0.901436,2.426028,0.0028,0.917633,2.698873,4374,72
2,1738,1738,0.009036,8.26,108.836538,0.01148,5.427885,128.096154,0.005846,3.924615,...,1.273614,5.080393,0.005056,1.79761,2.087137,0.002622,1.798742,2.469029,151,89
3,1321,1321,0.008855,7.054,128.133333,0.008535,5.120667,95.853333,0.006006,3.055,...,2.591058,3.547143,0.00367,1.778136,3.732446,0.002679,1.131974,2.297042,3364,26
4,598,598,0.009415,4.048154,117.128205,0.00971,7.149872,117.666667,0.007828,3.196282,...,1.761297,3.155618,0.008608,3.041126,4.770816,0.004297,1.206101,2.725645,474,181
6,839,839,0.008583,1.840789,90.74,0.008351,1.756944,92.806667,0.008282,2.141222,...,0.586278,2.239583,0.005836,0.505871,1.91223,0.004654,0.758229,1.876533,5192,120


In [None]:
# Sanity check: inspect the first training labels
y_train.head()

Unnamed: 0,label
0,0
2,0
3,0
4,0
6,0


# Training the Model

In [None]:
# Baseline Random Forest with default hyperparameters.
# random_state is fixed (42, matching the splitters and SMOTE in this notebook) so
# that bootstrap sampling and per-split feature sub-sampling -- and therefore the
# scores printed below -- are reproducible after Restart & Run All.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Evaluate the baseline forest on the validation split
val_predictions = rf.predict(X_val)            # hard class labels, used for accuracy
val_class_scores = rf.predict_proba(X_val)     # one probability column per class
val_probabilities = val_class_scores[:, 1]     # second class column (presumably label 1) for ROC AUC
val_accuracy = accuracy_score(y_val, val_predictions)
val_roc_auc = roc_auc_score(y_val, val_probabilities)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.89
Validation Accuracy Score: 0.96


In [None]:
# Evaluate the baseline forest on the test split
test_class_scores = rf.predict_proba(X_test)   # one probability column per class
test_probabilities = test_class_scores[:, 1]   # second class column feeds ROC AUC
test_predictions = rf.predict(X_test)          # hard class labels feed accuracy
test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.89
Test Accuracy: 0.96


# Training RF while Addressing Class Imbalance




In [None]:
# Address the class imbalance of the data set to see whether accuracy and
# ROC AUC improve over the baseline.
# class_weight='balanced' makes sklearn weight each class inversely to its
# frequency in y: n_samples / (n_classes * np.bincount(y)).
# random_state is fixed so the comparison against the other models is not
# confounded by run-to-run randomness of the forest.
rf2 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf2.fit(X_train, y_train)

In [None]:
# Evaluate the class-weighted forest (rf2) on the validation split
val_predictions = rf2.predict(X_val)
val_probabilities = rf2.predict_proba(X_val)[:, 1]  # score of the second class column

val_accuracy = accuracy_score(y_val, val_predictions)
val_roc_auc = roc_auc_score(y_val, val_probabilities)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.89
Validation Accuracy Score: 0.96


In [None]:
# Evaluate the class-weighted forest (rf2) on the test split
test_probabilities = rf2.predict_proba(X_test)[:, 1]  # score of the second class column
test_predictions = rf2.predict(X_test)

test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.90
Test Accuracy: 0.96


# Training RF while Addressing Class Imbalance (SMOTE)

In [None]:
# Combine SMOTE oversampling of the minority class with class weights.
# Only the training split is resampled; validation and test data stay untouched.
smote = SMOTE(random_state=42)
X_train2, y_train2 = smote.fit_resample(X_train, y_train)

# NOTE(review): after SMOTE the classes are presumably equal in size, in which
# case class_weight='balanced' adds little on top of the resampling -- confirm.
# random_state is fixed so the forest, like SMOTE above, is reproducible.
rf3 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf3.fit(X_train2, y_train2)

In [None]:
# Evaluate the SMOTE-trained forest (rf3) on the untouched validation split
val_class_scores = rf3.predict_proba(X_val)
val_probabilities = val_class_scores[:, 1]   # second class column feeds ROC AUC
val_predictions = rf3.predict(X_val)         # hard labels feed accuracy

val_roc_auc, val_accuracy = (
    roc_auc_score(y_val, val_probabilities),
    accuracy_score(y_val, val_predictions),
)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.89
Validation Accuracy Score: 0.95


In [None]:
# Evaluate the SMOTE-trained forest (rf3) on the untouched test split
test_class_scores = rf3.predict_proba(X_test)
test_probabilities = test_class_scores[:, 1]   # second class column feeds ROC AUC
test_predictions = rf3.predict(X_test)         # hard labels feed accuracy

test_accuracy, test_roc_auc = (
    accuracy_score(y_test, test_predictions),
    roc_auc_score(y_test, test_probabilities),
)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.90
Test Accuracy: 0.95


# Training RF while Addressing Class Imbalance (Scaler)


In [None]:
# Address class imbalance with scaling added before SMOTE.
# The scaler is fitted on the training split only and then applied unchanged to
# validation and test, so no statistics leak from the held-out data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Reuses the `smote` instance (random_state=42) created in the SMOTE section above;
# that cell must have been run first.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# random_state is fixed so the scores reported for this model are reproducible.
rf4 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf4.fit(X_train_resampled, y_train_resampled)

In [None]:
# Evaluate the scaled + SMOTE forest (rf4) on the scaled validation features
val_class_scores = rf4.predict_proba(X_val_scaled)
val_predictions = rf4.predict(X_val_scaled)
val_probabilities = val_class_scores[:, 1]   # second class column feeds ROC AUC

val_roc_auc = roc_auc_score(y_val, val_probabilities)
val_accuracy = accuracy_score(y_val, val_predictions)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.90
Validation Accuracy Score: 0.95


In [None]:
# Evaluate the scaled + SMOTE forest (rf4) on the scaled test features
test_class_scores = rf4.predict_proba(X_test_scaled)
test_predictions = rf4.predict(X_test_scaled)
test_probabilities = test_class_scores[:, 1]   # second class column feeds ROC AUC

test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.91
Test Accuracy: 0.95


# Feature Selection

In [None]:
# Univariate feature selection: keep the k=10 features that score highest under
# f_classif against the label. The selector is fitted on the training split only,
# so validation and test data do not influence which features are kept.
# SelectKBest and f_classif come from the imports cell at the top of the notebook.
model = SelectKBest(f_classif, k=10)
model.fit(X_train, y_train)

# get_support() is a boolean mask over X_train's columns; index with it to get names
Selected_feature_names = X_train.columns[model.get_support()]
Selected_feature_names

Index(['mean_Value_3', 'mean_Value_6', 'median_Value_3', 'min_Value_3',
       'min_Value_5', 'min_Value_6', 'max_Value_3', 'max_Value_6',
       'std_Value_5', 'std_Value_6'],
      dtype='object')

In [None]:
# Keep only the 10 features chosen by SelectKBest (k=10) in every partition
X_train2 = X_train.filter(list(Selected_feature_names))
X_val2 = X_val.filter(list(Selected_feature_names))
X_test2 = X_test.filter(list(Selected_feature_names))

# Scaling X: fit on the training split only, then apply to validation and test.
# NOTE: this rebinds scaler, X_train_scaled, X_val_scaled, X_test_scaled and
# X_train_resampled from the previous (rf4) section; re-run that section before
# re-evaluating rf4, otherwise its cells will see these 10-column arrays.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train2)
X_val_scaled = scaler.transform(X_val2)
X_test_scaled = scaler.transform(X_test2)

# SMOTE resample of the training split (reuses the `smote` instance defined earlier)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# Training RF model; random_state is fixed so the reported score is reproducible
rf5 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf5.fit(X_train_resampled, y_train_resampled)

In [None]:
# Evaluate the feature-selected forest (rf5) on the scaled, feature-selected test set
test_class_scores = rf5.predict_proba(X_test_scaled)
test_probabilities = test_class_scores[:, 1]   # second class column feeds ROC AUC
test_predictions = rf5.predict(X_test_scaled)  # hard labels feed accuracy

test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.85
Test Accuracy: 0.95


# Conclusion


The 4th model (class weights, SMOTE, and scaling, trained with the newly added features) produced the highest ROC AUC score: 0.91 on the test set and 0.90 on the validation set.

Feature selection with SelectKBest did not achieve a higher score than the model without feature selection (test ROC AUC of 0.85 versus 0.91).

For Comparison:

Old:
Validation ROC AUC Score: 0.87
Test ROC AUC: 0.88

New:
Validation ROC AUC Score: 0.90
Test ROC AUC: 0.91