In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectKBest
from functools import partial
from MFMW import mfmw
from tri_stage import tri_stage

# Relative path of the directory that holds the SPECT data files.
DATA_PATH = '../../data/'
# Seed passed to every random_state argument below so runs are repeatable.
SEED = 10
# Number of features to keep; passed as k to both mfmw and SelectKBest.
K = 10

In [2]:
# Load the SPECT train/test splits; the files are read without a header row.
spect_train = pd.read_csv(DATA_PATH + 'SPECT.train', header=None)
spect_test = pd.read_csv(DATA_PATH + 'SPECT.test', header=None)

# Label the columns G1..Gn followed by 'class' (the last column is used as the
# target below). Both frames share one list, so a test file with a different
# column count fails loudly here instead of being silently mislabelled.
# NOTE(review): the public UCI SPECT Heart files document the diagnosis as the
# FIRST attribute - confirm the local files store the label last, otherwise the
# X / y split below is misaligned.
column_names = [f'G{i}' for i in range(1, spect_train.shape[1])] + ['class']
spect_train.columns = column_names
spect_test.columns = column_names

# Split each frame into features (all but the last column) and target (last column).
X_spect_train = spect_train.iloc[:, :-1]
y_spect_train = spect_train.iloc[:, -1]
X_spect_test = spect_test.iloc[:, :-1]
y_spect_test = spect_test.iloc[:, -1]

In [3]:
# Baseline: an XGBoost classifier trained on every SPECT feature.
# n_estimators is also read by the later cells that build classifiers.
n_estimators = 100
clf = XGBClassifier(n_estimators=n_estimators, random_state=SEED).fit(
    X_spect_train, y_spect_train
)
# Predict the held-out split; the bare last expression is the cell's displayed accuracy.
y_pred = clf.predict(X_spect_test)
accuracy_score(y_spect_test, y_pred)

0.6524064171122995

In [4]:
# Compare the two feature-scoring functions on SPECT, keeping K features each.
for fs, name in (
    (partial(mfmw, k=K), 'MFMW'),
    (tri_stage, 'Tri Stage'),
):
    # Score features on the training split only, then keep the K best.
    selector = SelectKBest(fs, k=K)
    selector.fit(X_spect_train, y_spect_train)
    new_X_spect_train = selector.transform(X_spect_train)
    new_X_spect_test = selector.transform(X_spect_test)

    # Train a fresh XGBoost classifier on the reduced feature set.
    clf = XGBClassifier(n_estimators=n_estimators, random_state=SEED)
    clf.fit(new_X_spect_train, y_spect_train)

    # Report test accuracy for this selector.
    y_pred = clf.predict(new_X_spect_test)
    print(f'{name} accuracy: {accuracy_score(y_spect_test, y_pred)}')

MFMW accuracy: 0.679144385026738
Phase 1:
Phase 2:
Phase 3:
Tri Stage accuracy: 0.7165775401069518


In [5]:
# K is rebound for the synthetic experiments; earlier cells that read K would
# pick up this value if they were re-run after this cell.
K = 180
# Synthetic binary problem: 3000 samples, 300 features (20 informative, 280 redundant).
X, y = make_classification(
    n_samples=3000,
    n_features=300,
    n_informative=20,
    n_redundant=280,
    n_classes=2,
    random_state=SEED,
)
# Hold out 30% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)
# Baseline: AdaBoost trained on all 300 features.
clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=SEED)
clf.fit(X_train, y_train)
print("Accuracy: {}".format(clf.score(X_test, y_test)))

Accuracy: 0.8811111111111111


In [6]:
# combine X_train and y_train into a dataframe
train_set = pd.DataFrame(np.concatenate((X_train, y_train.reshape(-1, 1)), axis=1))
# turn the last column into integers
train_set.iloc[:, -1] = train_set.iloc[:, -1].astype(int)

# combine X_test and y_test into a dataframe
test_set = pd.DataFrame(np.concatenate((X_test, y_test.reshape(-1, 1)), axis=1))
# turn the last column into integers
test_set.iloc[:, -1] = test_set.iloc[:, -1].astype(int)

# get X, y of train_set
X_train, y_train = train_set.iloc[:, :-1], train_set.iloc[:, -1]

In [7]:
# Repeat the selector comparison on the synthetic data, keeping K features each.
for fs, name in (
    (partial(mfmw, k=K), 'MFMW'),
    (tri_stage, 'Tri Stage'),
):
    # Score features on the training split, then reduce both splits to the K best.
    selector = SelectKBest(fs, k=K)
    selector.fit(X_train, y_train)
    best_X_train = selector.transform(X_train)
    best_X_test = selector.transform(X_test)

    # Train a fresh AdaBoost classifier on the selected features.
    clf = AdaBoostClassifier(n_estimators=n_estimators, random_state=SEED)
    clf.fit(best_X_train, y_train)

    # Report test accuracy for this selector.
    print(f"{name} accuracy: {clf.score(best_X_test, y_test)}")

MFMW accuracy: 0.8922222222222222
Phase 1:
Phase 2:
Phase 3:
Tri Stage accuracy: 0.89
