In [14]:
import pandas as pd
import numpy as np
import ast
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
# Load the three split files (train / validation / test) from the Data/ directory.
train_data = pd.read_csv('Data/train_data.csv')
val_data = pd.read_csv('Data/val_data.csv')
test_data = pd.read_csv('Data/test_data.csv')

# Full processed table; it is not referenced by the modelling cells visible below.
df = pd.read_csv('Data/processed_data.csv')

In [5]:
# Columns removed from every split before modelling: the target ('label') plus
# three further columns (presumably row/gene identifiers — confirm against the data).
non_feature_cols = ['label', 'ENST_ID', 'Key', 'gene_id']

# Each split yields a feature frame (everything except the columns above)
# and a target series taken from the 'label' column.
X_train, y_train = train_data.drop(columns=non_feature_cols), train_data['label']
X_val, y_val = val_data.drop(columns=non_feature_cols), val_data['label']
X_test, y_test = test_data.drop(columns=non_feature_cols), test_data['label']

In [8]:
# First model: logistic regression with no class weighting, fitted on the
# training split. fit() returns the estimator itself, so it can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model  # bare expression so the cell still displays the fitted estimator

In [9]:
# Validation set: evaluate the first (unweighted) model.
val_predictions = model.predict(X_val)  # hard class labels
# Second predict_proba column (presumably the positive class — confirm via model.classes_).
val_probabilities = model.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)

print(
    f'Validation ROC AUC Score: {val_roc_auc:.2f}',
    f'Validation Accuracy Score: {val_accuracy:.2f}',
    sep='\n',
)

Validation ROC AUC Score: 0.68
Validation Accuracy Score: 0.96


The ROC AUC is low despite the high accuracy, which might be due to class imbalance.

In [10]:
# Test set: evaluate the first (unweighted) model.
# Second predict_proba column (presumably the positive class — confirm via model.classes_).
test_probabilities = model.predict_proba(X_test)[:, 1]
test_predictions = model.predict(X_test)  # hard class labels

test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Test Accuracy: {test_accuracy}")

test_roc_auc = roc_auc_score(y_true=y_test, y_score=test_probabilities)
print(f"Test ROC AUC: {test_roc_auc}")

Test Accuracy: 0.9535456336178595
Test ROC AUC: 0.6760895846669293


In [11]:
# Second model: same logistic regression, but with class_weight='balanced'
# so the classes are re-weighted during fitting. Trained on the original
# (non-resampled) training split.
model2 = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)
model2  # bare expression so the cell still displays the fitted estimator

In [12]:
# Validation set: evaluate the class-weighted model (model2).
val_predictions = model2.predict(X_val)  # hard class labels
# Second predict_proba column (presumably the positive class — confirm via model2.classes_).
val_probabilities = model2.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)

print(
    f'Validation ROC AUC Score: {val_roc_auc:.2f}',
    f'Validation Accuracy Score: {val_accuracy:.2f}',
    sep='\n',
)

Validation ROC AUC Score: 0.70
Validation Accuracy Score: 0.61


In [13]:
# Test set: evaluate the class-weighted model (model2).
# Second predict_proba column (presumably the positive class — confirm via model2.classes_).
test_probabilities = model2.predict_proba(X_test)[:, 1]
test_predictions = model2.predict(X_test)  # hard class labels

test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Test Accuracy: {test_accuracy}")

test_roc_auc = roc_auc_score(y_true=y_test, y_score=test_probabilities)
print(f"Test ROC AUC: {test_roc_auc}")

Test Accuracy: 0.6167104399212081
Test ROC AUC: 0.6922735343657672


In [15]:
# Third model: resample the training split with SMOTE, then fit a class-weighted
# logistic regression on the resampled data. Only the training split is resampled;
# X_val / X_test are not touched here.
smote = SMOTE(random_state=42)  # fixed seed so the resampling is repeatable
X_train2, y_train2 = smote.fit_resample(X_train, y_train)

# NOTE(review): if SMOTE already equalises the class counts, class_weight='balanced'
# presumably has no further effect on this fit — confirm and consider dropping it.
model3 = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train2, y_train2)
model3  # bare expression so the cell still displays the fitted estimator

In [17]:
# Validation set: evaluate the model trained on SMOTE-resampled data (model3).
val_predictions = model3.predict(X_val)  # hard class labels
# Second predict_proba column (presumably the positive class — confirm via model3.classes_).
val_probabilities = model3.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_true=y_val, y_pred=val_predictions)
val_roc_auc = roc_auc_score(y_true=y_val, y_score=val_probabilities)

print(
    f'Validation ROC AUC Score: {val_roc_auc:.2f}',
    f'Validation Accuracy Score: {val_accuracy:.2f}',
    sep='\n',
)

Validation ROC AUC Score: 0.70
Validation Accuracy Score: 0.62


In [18]:
# Test set: evaluate the model trained on SMOTE-resampled data (model3).
# Second predict_proba column (presumably the positive class — confirm via model3.classes_).
test_probabilities = model3.predict_proba(X_test)[:, 1]
test_predictions = model3.predict(X_test)  # hard class labels

test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print(f"Test Accuracy: {test_accuracy}")

test_roc_auc = roc_auc_score(y_true=y_test, y_score=test_probabilities)
print(f"Test ROC AUC: {test_roc_auc}")

Test Accuracy: 0.6261216896476253
Test ROC AUC: 0.6930188949724859
