In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from xverse.ensemble import VotingSelector
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv("/Users/ronnitrana/ACSEF/glioma-grading/data/UCI/TCGA_InfoWithGrade.csv")
data_randomized = data.sample(frac=1, random_state=42)

X = data_randomized.drop('Grade', axis=1)
y = data_randomized['Grade']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
logreg_pipe = Pipeline([('scaler', StandardScaler()), ('logreg', LogisticRegression(random_state=42, max_iter=1000))])
svm_pipe = Pipeline([('scaler', StandardScaler()), ('svm', SVC(probability=True, random_state=42))])
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
# For RandomForest and AdaBoost, scaling is not necessary, but included for consistency
rf_pipe = Pipeline([('scaler', StandardScaler()), ('rf', RandomForestClassifier(random_state=42))])
ada_pipe = Pipeline([('scaler', StandardScaler()), ('ada', AdaBoostClassifier(random_state=42))])

In [8]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('svm', svm_pipe),
    ('knn', knn_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 1: {avg_score}')

Average score for ensemble 1: 0.8777888336097291


In [9]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('svm', svm_pipe),
    ('rf', rf_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 2: {avg_score}')

Average score for ensemble 2: 0.8807628524046436


In [10]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('svm', svm_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 3: {avg_score}')

Average score for ensemble 3: 0.8792703150912107


In [11]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('knn', knn_pipe),
    ('rf', rf_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 4: {avg_score}')

Average score for ensemble 4: 0.873289110005528


In [12]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('knn', knn_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 5: {avg_score}')

Average score for ensemble 5: 0.8718297401879491


In [13]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('rf', rf_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 6: {avg_score}')

Average score for ensemble 6: 0.8733112216694308


In [14]:
ensemble = VotingClassifier(estimators=[
    ('svm', svm_pipe),
    ('knn', knn_pipe),
    ('rf', rf_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 7: {avg_score}')

Average score for ensemble 7: 0.8718076285240464


In [15]:
ensemble = VotingClassifier(estimators=[
    ('svm', svm_pipe),
    ('knn', knn_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 8: {avg_score}')

Average score for ensemble 8: 0.870326147042565


In [16]:
ensemble = VotingClassifier(estimators=[
    ('svm', svm_pipe),
    ('rf', rf_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 9: {avg_score}')

Average score for ensemble 9: 0.8807517965726921


In [17]:
ensemble = VotingClassifier(estimators=[
    ('knn', knn_pipe),
    ('rf', rf_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 10: {avg_score}')

Average score for ensemble 10: 0.8569043670536208


In [18]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('svm', svm_pipe),
    ('knn', knn_pipe),
    ('rf', rf_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 11: {avg_score}')

Average score for ensemble 11: 0.8807517965726921


In [19]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('svm', svm_pipe),
    ('knn', knn_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 12: {avg_score}')

Average score for ensemble 12: 0.8777888336097291


In [20]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('svm', svm_pipe),
    ('rf', rf_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 13: {avg_score}')

Average score for ensemble 13: 0.8792813709231619


In [21]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('knn', knn_pipe),
    ('rf', rf_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 14: {avg_score}')

Average score for ensemble 14: 0.873289110005528


In [22]:
ensemble = VotingClassifier(estimators=[
    ('svm', logreg_pipe),
    ('knn', svm_pipe),
    ('rf', rf_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 15: {avg_score}')

Average score for ensemble 15: 0.8792813709231619


In [25]:
ensemble = VotingClassifier(estimators=[
    ('logreg', logreg_pipe),
    ('svm', svm_pipe),
    ('knn', knn_pipe),
    ('rf', rf_pipe),
    ('ada', ada_pipe)
], voting='soft')

scores = cross_val_score(ensemble, X_train, y_train, cv=5)
avg_score = np.mean(scores)
print(f'Average score for ensemble 16: {avg_score}')

Average score for ensemble 16: 0.882244333886125
