# Importing Libraries and Data


In [1]:
# Data Processing & Modelling Libraries
import pandas as pd
import numpy as np
import ast

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [2]:
# Read the processed dataset and the train / test / validation CSVs.
# Every file sits in the same Google Drive folder, so the folder is named once
# here; point DATA_DIR somewhere else to run the notebook outside this Drive.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/data'

# NOTE(review): `df` is not referenced by the cells shown in this notebook —
# confirm whether the full processed file still needs to be loaded.
df = pd.read_csv(f'{DATA_DIR}/processed_data.csv')
train_data = pd.read_csv(f'{DATA_DIR}/train_data.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test_data.csv')
val_data = pd.read_csv(f'{DATA_DIR}/val_data.csv')

# Declare feature vector and target variable

In [3]:
# Build X (features) and y (target) for each split.
# The target column and the raw identifier columns are removed from X;
# y is the 'label' column of the same frame.
# NOTE(review): the *_encoded identifier columns shown by X_train.head() stay
# in X as features — confirm that this is intended.
NON_FEATURE_COLUMNS = ['label', 'ENST_ID', 'Key', 'gene_id']

X_train, y_train = train_data.drop(columns=NON_FEATURE_COLUMNS), train_data['label']
X_val, y_val = val_data.drop(columns=NON_FEATURE_COLUMNS), val_data['label']
X_test, y_test = test_data.drop(columns=NON_FEATURE_COLUMNS), test_data['label']

In [4]:
# Preview the first five rows of the training feature matrix
X_train.head(n=5)

Unnamed: 0,Position,Value_1,Value_2,Value_3,Value_4,Value_5,Value_6,Value_7,Value_8,Value_9,ENST_ID_encoded,Key_encoded,gene_id_encoded
0,1312,0.0113,2.9252,105.84,0.006688,3.9472,99.8,0.006549,3.2068,88.612,3286,147,941
1,1133,0.008892,2.330544,103.184937,0.007565,2.887322,97.920502,0.0064,2.380251,87.730126,2341,216,2659
2,849,0.007178,3.808339,90.662609,0.006693,3.910683,97.275652,0.008601,3.462987,92.266087,3447,202,1019
3,2371,0.01085,3.717955,106.727273,0.008118,3.794091,105.302273,0.007403,3.753864,90.725,4564,154,2192
4,297,0.0103,3.695294,118.588235,0.007454,9.200196,117.960784,0.00661,3.95098,78.62549,360,187,1264


In [5]:
# Preview the first five training labels
y_train.head(n=5)

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


# Training the Model

In [6]:
# Baseline random forest: default hyper-parameters, no class-imbalance handling.
# random_state is fixed so that re-running the notebook rebuilds the same
# forest; without it the bootstrap samples and per-split feature subsets are
# drawn afresh on every run, and the small score differences between the
# models in this notebook could not be told apart from run-to-run noise.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [7]:
# Evaluate the baseline forest on the validation split.
# Accuracy uses the hard class predictions; ROC AUC uses column 1 of
# predict_proba, i.e. the score for the second class in rf.classes_.
# NOTE(review): on imbalanced labels a high accuracy can come from predicting
# the majority class — read it together with ROC AUC.
val_predictions = rf.predict(X_val)
val_probabilities = rf.predict_proba(X_val)[:, 1]
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.89
Validation Accuracy Score: 0.96


In [8]:
# Evaluate the baseline forest on the test split.
# Scores for the second class in rf.classes_ feed ROC AUC; hard predictions
# feed accuracy.
test_probabilities = rf.predict_proba(X_test)[:, 1]
test_predictions = rf.predict(X_test)
test_roc_auc = roc_auc_score(y_true=y_test, y_score=test_probabilities)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.90
Test Accuracy: 0.96


# Training RF while Addressing Class Imbalance




In [9]:
# Second model: same forest, but with class weighting to counter the label
# imbalance, to see whether accuracy and ROC AUC improve over the baseline.
# class_weight='balanced' weights each class inversely to its frequency in y:
#     n_samples / (n_classes * np.bincount(y))
# random_state is fixed so the forest is reproducible and any change in the
# scores comes from the class weighting rather than from random variation
# between runs.
rf2 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf2.fit(X_train, y_train)

In [10]:
# Evaluate the class-weighted forest (rf2) on the validation split.
# Hard predictions feed accuracy; the scores for the second class in
# rf2.classes_ feed ROC AUC.
val_predictions = rf2.predict(X_val)
val_probabilities = rf2.predict_proba(X_val)[:, 1]
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.89
Validation Accuracy Score: 0.96


In [11]:
# Evaluate the class-weighted forest (rf2) on the test split.
test_probabilities = rf2.predict_proba(X_test)[:, 1]
test_predictions = rf2.predict(X_test)
test_roc_auc = roc_auc_score(y_true=y_test, y_score=test_probabilities)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.90
Test Accuracy: 0.96


# Training RF while Addressing Class Imbalance (SMOTE)

In [12]:
# Third model: oversample the minority class with SMOTE, then fit a
# class-weighted forest on the resampled training data.
# Only the training split is resampled; X_val / X_test are left untouched.
smote = SMOTE(random_state=42)
X_train2, y_train2 = smote.fit_resample(X_train, y_train)

# NOTE(review): with SMOTE's default sampling strategy the resampled classes
# should be equal in size, in which case class_weight='balanced' assigns every
# class the same weight and adds nothing on top of SMOTE — kept here to match
# the experiment as described; confirm whether it is still wanted.
# random_state is fixed so the forest is reproducible, like the resampling.
rf3 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf3.fit(X_train2, y_train2)

In [13]:
# Evaluate the SMOTE-trained forest (rf3) on the original validation split.
val_predictions = rf3.predict(X_val)
val_probabilities = rf3.predict_proba(X_val)[:, 1]
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.89
Validation Accuracy Score: 0.95


In [14]:
# Evaluate the SMOTE-trained forest (rf3) on the original test split.
test_probabilities = rf3.predict_proba(X_test)[:, 1]
test_predictions = rf3.predict(X_test)
test_roc_auc = roc_auc_score(y_true=y_test, y_score=test_probabilities)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.89
Test Accuracy: 0.95


# Training RF while Addressing Class Imbalance (SMOTE + Scaling)


In [15]:
# Fourth model: standardise the features, oversample with SMOTE, then fit a
# class-weighted forest.
# The scaler is fitted on the training split only and then applied to the
# validation / test splits, so no statistics from the evaluation data are used.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Use a dedicated SMOTE instance instead of the `smote` object created in an
# earlier cell, so this cell does not depend on that cell having been run.
# The seed is the same, so the resampling is configured identically.
smote_scaled = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote_scaled.fit_resample(X_train_scaled, y_train)

# random_state is fixed so the forest is reproducible and its scores can be
# compared with the other models.
rf4 = RandomForestClassifier(class_weight='balanced', random_state=42)
rf4.fit(X_train_resampled, y_train_resampled)

In [16]:
# Evaluate the scaled + SMOTE forest (rf4) on the validation split.
# rf4 was trained on standardised features, so it is given the scaled
# validation matrix rather than X_val.
val_probabilities = rf4.predict_proba(X_val_scaled)[:, 1]
val_predictions = rf4.predict(X_val_scaled)
val_roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)
val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)

print(f'Validation ROC AUC Score: {val_roc_auc:.2f}')
print(f'Validation Accuracy Score: {val_accuracy:.2f}')

Validation ROC AUC Score: 0.90
Validation Accuracy Score: 0.95


In [17]:
# Evaluate the scaled + SMOTE forest (rf4) on the test split.
# rf4 was trained on standardised features, so it is given the scaled test
# matrix rather than X_test.
test_probabilities = rf4.predict_proba(X_test_scaled)[:, 1]
test_predictions = rf4.predict(X_test_scaled)
test_roc_auc = roc_auc_score(y_true=y_test, y_score=test_probabilities)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)

print(f"Test ROC AUC: {test_roc_auc:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

Test ROC AUC: 0.91
Test Accuracy: 0.95


# Conclusion


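The fourth model (class weights, SMOTE, and scaling) produced the highest test ROC AUC, 0.91, compared with 0.89–0.90 for the other three models, at a slightly lower test accuracy (0.95 versus 0.96 for the first two models). The margin is small (0.01–0.02), so it should be confirmed across repeated runs before the difference is treated as meaningful.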