# Importing Libraries and Data


In [None]:
# Data Processing & Modelling Libraries
import pandas as pd
import numpy as np
import ast

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Tree Visualisation
#from sklearn.tree import export_graphviz
#from IPython.display import Image
#import graphviz

In [None]:
# Read the processed dataset and the train / test / validation CSVs.
# All four files sit in the same Google Drive folder (Colab mount point);
# change DATA_DIR once here to run the notebook from another location.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data'

# NOTE(review): `df` is not referenced by any later cell visible here — confirm it is still needed.
df = pd.read_csv(f'{DATA_DIR}/processed_data.csv')
train_data = pd.read_csv(f'{DATA_DIR}/train_data.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test_data.csv')
val_data = pd.read_csv(f'{DATA_DIR}/val_data.csv')

In [None]:
# Preview the first rows of the training split to check which columns were loaded
train_data.head()

Unnamed: 0,gene_id,transcript_id,transcript_position,Key,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,transcript_id_encoded,Key_encoded,gene_id_encoded,label
0,ENSG00000169045,ENST00000514332,579,CAAACAA,0.007097,2.38975,104.571429,0.006643,2.526429,98.371429,0.005408,2.419286,89.453571,4374,72,2833,0
1,ENSG00000101057,ENST00000217026,1738,CAGACCC,0.009036,8.26,108.836538,0.01148,5.427885,128.096154,0.005846,3.924615,79.675,151,89,632,0
2,ENSG00000109332,ENST00000394803,1321,AGAACAG,0.008855,7.054,128.133333,0.008535,5.120667,95.853333,0.006006,3.055,88.96,3364,26,953,0
3,ENSG00000128283,ENST00000249014,598,GGGACAC,0.009415,4.048154,117.128205,0.00971,7.149872,117.666667,0.007828,3.196282,82.721795,474,181,1534,0
4,ENSG00000275216,ENST00000618966,839,CTAACAA,0.008583,1.840789,90.74,0.008351,1.756944,92.806667,0.008282,2.141222,87.446667,5192,120,3792,0


# Declare feature vector and target variable

In [None]:
# Separate every split into a feature matrix (X) and a target vector (y).
# The target and the raw string identifiers are removed from the features;
# the *_encoded versions of those identifiers stay in X.
non_feature_cols = ['label', 'gene_id', 'transcript_id', 'Key']

X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['label']
X_val, y_val = val_data.drop(columns=non_feature_cols), val_data['label']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['label']

In [None]:
# Checking feature columns: 'label', 'gene_id', 'transcript_id' and 'Key'
# were dropped above and should no longer appear here
X_train.head()

Unnamed: 0,transcript_position,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,transcript_id_encoded,Key_encoded,gene_id_encoded
0,579,0.007097,2.38975,104.571429,0.006643,2.526429,98.371429,0.005408,2.419286,89.453571,4374,72,2833
1,1738,0.009036,8.26,108.836538,0.01148,5.427885,128.096154,0.005846,3.924615,79.675,151,89,632
2,1321,0.008855,7.054,128.133333,0.008535,5.120667,95.853333,0.006006,3.055,88.96,3364,26,953
3,598,0.009415,4.048154,117.128205,0.00971,7.149872,117.666667,0.007828,3.196282,82.721795,474,181,1534
4,839,0.008583,1.840789,90.74,0.008351,1.756944,92.806667,0.008282,2.141222,87.446667,5192,120,3792


In [None]:
# Checking label column: y_train is the 'label' column taken from train_data
y_train.head()

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


# Training the Model

In [None]:
# Baseline Random Forest with default hyperparameters.
# random_state is fixed (same seed as the SMOTE step later in the notebook) so the
# randomness used while building the trees, and therefore the reported scores,
# is reproducible after a kernel restart.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Score the baseline forest (rf) on the validation split
val_predictions = rf.predict(X_val)                # hard class labels -> accuracy
val_probabilities = rf.predict_proba(X_val)[:, 1]  # second probability column (presumably label 1) -> ROC AUC

val_accuracy = accuracy_score(y_val, val_predictions)
val_roc_auc = roc_auc_score(y_val, val_probabilities)

print(
    f'Validation ROC AUC Score: {val_roc_auc:.2f}\n'
    f'Validation Accuracy Score: {val_accuracy:.2f}'
)

Validation ROC AUC Score: 0.86
Validation Accuracy Score: 0.96


In [None]:
# Score the baseline forest (rf) on the held-out test split
test_probabilities = rf.predict_proba(X_test)[:, 1]  # second probability column (presumably label 1) -> ROC AUC
test_predictions = rf.predict(X_test)                # hard class labels -> accuracy

test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(
    f"Test ROC AUC: {test_roc_auc:.2f}\n"
    f"Test Accuracy: {test_accuracy:.2f}"
)

Test ROC AUC: 0.87
Test Accuracy: 0.96


# Training RF while Addressing Class Imbalance




In [None]:
# Addressing the class imbalance of the data set,
# to see whether Accuracy & ROC AUC improve over the baseline.
# Uses sklearn's class_weight parameter: the "balanced" mode uses the values of y
# to set weights inversely proportional to class frequencies in the input data,
# i.e. n_samples / (n_classes * np.bincount(y)).
# random_state is fixed so the comparison with the other models is reproducible.
rf2 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf2.fit(X_train, y_train)

In [None]:
# Score the class-weighted forest (rf2) on the validation split
val_predictions = rf2.predict(X_val)                # hard class labels -> accuracy
val_probabilities = rf2.predict_proba(X_val)[:, 1]  # second probability column (presumably label 1) -> ROC AUC

val_accuracy = accuracy_score(y_val, val_predictions)
val_roc_auc = roc_auc_score(y_val, val_probabilities)

print(
    f'Validation ROC AUC Score: {val_roc_auc:.2f}\n'
    f'Validation Accuracy Score: {val_accuracy:.2f}'
)

Validation ROC AUC Score: 0.86
Validation Accuracy Score: 0.96


In [None]:
# Score the class-weighted forest (rf2) on the held-out test split
test_probabilities = rf2.predict_proba(X_test)[:, 1]  # second probability column (presumably label 1) -> ROC AUC
test_predictions = rf2.predict(X_test)                # hard class labels -> accuracy

test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(
    f"Test ROC AUC: {test_roc_auc:.2f}\n"
    f"Test Accuracy: {test_accuracy:.2f}"
)

Test ROC AUC: 0.88
Test Accuracy: 0.96


# Training RF while Addressing Class Imbalance (SMOTE)

In [None]:
# Using SMOTE to address class imbalance + class_weights.
# SMOTE oversamples the minority class; it is fitted on the training split only,
# so the validation and test splits keep their original class distribution.
# NOTE(review): SMOTE creates synthetic rows by interpolating between neighbours,
# which also interpolates the integer-encoded ID columns (transcript_id_encoded,
# Key_encoded, gene_id_encoded) — confirm this is intended, or consider SMOTENC.
# NOTE(review): after resampling the classes should already be balanced, so
# class_weight='balanced' presumably has little additional effect — confirm.
smote = SMOTE(random_state=42)
X_train2, y_train2 = smote.fit_resample(X_train, y_train)

# random_state is fixed (same seed as SMOTE) so the forest itself is reproducible too.
rf3 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf3.fit(X_train2, y_train2)

In [None]:
# Score the SMOTE-trained forest (rf3) on the untouched validation split
val_predictions = rf3.predict(X_val)                # hard class labels -> accuracy
val_probabilities = rf3.predict_proba(X_val)[:, 1]  # second probability column (presumably label 1) -> ROC AUC

val_accuracy = accuracy_score(y_val, val_predictions)
val_roc_auc = roc_auc_score(y_val, val_probabilities)

print(
    f'Validation ROC AUC Score: {val_roc_auc:.2f}\n'
    f'Validation Accuracy Score: {val_accuracy:.2f}'
)

Validation ROC AUC Score: 0.86
Validation Accuracy Score: 0.94


In [None]:
# Score the SMOTE-trained forest (rf3) on the held-out test split
test_probabilities = rf3.predict_proba(X_test)[:, 1]  # second probability column (presumably label 1) -> ROC AUC
test_predictions = rf3.predict(X_test)                # hard class labels -> accuracy

test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(
    f"Test ROC AUC: {test_roc_auc:.2f}\n"
    f"Test Accuracy: {test_accuracy:.2f}"
)

Test ROC AUC: 0.87
Test Accuracy: 0.94


# Training RF while Addressing Class Imbalance (Scaler)


In [None]:
# Addressing class imbalance, adding feature scaling before SMOTE.
# The scaler is fitted on the training split only and then applied unchanged to
# the validation and test splits, so no statistics from those splits are used.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# `smote` is the SMOTE(random_state=42) instance created in the SMOTE section above;
# here it is re-fitted on the scaled training features.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# random_state is fixed so the comparison with the other models is reproducible.
rf4 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf4.fit(X_train_resampled, y_train_resampled)

In [None]:
# Score the scaled + SMOTE forest (rf4) on the scaled validation features
val_probabilities = rf4.predict_proba(X_val_scaled)[:, 1]  # second probability column (presumably label 1) -> ROC AUC
val_predictions = rf4.predict(X_val_scaled)                # hard class labels -> accuracy

val_roc_auc = roc_auc_score(y_val, val_probabilities)
val_accuracy = accuracy_score(y_val, val_predictions)

print(
    f'Validation ROC AUC Score: {val_roc_auc:.2f}\n'
    f'Validation Accuracy Score: {val_accuracy:.2f}'
)

Validation ROC AUC Score: 0.87
Validation Accuracy Score: 0.94


In [None]:
# Score the scaled + SMOTE forest (rf4) on the scaled test features
test_probabilities = rf4.predict_proba(X_test_scaled)[:, 1]  # second probability column (presumably label 1) -> ROC AUC
test_predictions = rf4.predict(X_test_scaled)                # hard class labels -> accuracy

test_roc_auc = roc_auc_score(y_test, test_probabilities)
test_accuracy = accuracy_score(y_test, test_predictions)

print(
    f"Test ROC AUC: {test_roc_auc:.2f}\n"
    f"Test Accuracy: {test_accuracy:.2f}"
)

Test ROC AUC: 0.88
Test Accuracy: 0.94


# Conclusion


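The 4th model (class_weight, SMOTE, and scaling) produced the highest validation ROC AUC score (0.87) and, together with the 2nd model (class_weight only), the highest test ROC AUC score (0.88). Note that both SMOTE-based models (3 and 4) had a lower accuracy (0.94) than the models trained without oversampling (0.96).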