In [1]:
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import RobustScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from ens_selection import CES
from sklearn.metrics import make_scorer, f1_score
from xgboost import XGBClassifier
import sys
from utils import dummy_cv

import os

# Location of the ei-python checkout that provides the `ei` module.
# Set the EI_PYTHON_PATH environment variable to point at a different checkout;
# the historical location is kept as the default so existing setups still work.
path_to_ei = os.environ.get("EI_PYTHON_PATH", "/home/jamie/Projects/ei-python")

# Guarded so that re-running this cell does not keep appending the same entry.
if path_to_ei not in sys.path:
    sys.path.append(path_to_ei)
from ei import EnsembleIntegration

from sklearn.datasets import make_classification

# Synthetic binary classification problem: 1250 samples, 50 features, class
# weights 95% / 5%. random_state pins the generated data.
X, y = make_classification(n_samples=1250, n_features=50, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.95, 0.05],
                           flip_y=0.0, random_state=1)

# The two halves of the feature matrix are used as two separate modalities.
X_1 = X[:, 0:25]
X_2 = X[:, 25:]

# Split both modalities and the labels in a single call so that every output
# shares one row partition by construction. Previously each modality was split
# by its own call and alignment relied on the arguments being kept identical.
(X_1_train, X_1_test,
 X_2_train, X_2_test,
 y_1_train, y_1_test) = train_test_split(
    X_1, X_2, y, test_size=0.2, random_state=2, shuffle=True, stratify=y)

# Both modalities describe the same samples, so the labels are the same.
# Independent copies keep the y_2_* names usable without aliasing y_1_*.
y_2_train, y_2_test = y_1_train.copy(), y_1_test.copy()

In [2]:
# Number of samples labelled 0 in the full dataset.
int((y == 0).sum())

1188

In [3]:
# Number of samples labelled 1 in the full dataset.
int((y == 1).sum())

62

In [4]:
# Number of samples labelled 1 that landed in the held-out test split.
int((y_1_test == 1).sum())

12

In [5]:
# Training features for each modality, keyed by modality name.
data = dict(X_1=X_1_train, X_2=X_2_train)

In [18]:
# Base classifiers handed to EnsembleIntegration, keyed by a short display name.
base_predictors = {
    "AdaBoost": AdaBoostClassifier(),
    "DT": DecisionTreeClassifier(max_depth=3),
    "GradientBoosting": GradientBoostingClassifier(n_estimators=100),
    "KNN": KNeighborsClassifier(n_neighbors=40),
    "LR": LogisticRegression(C=0.5),
    "NB": GaussianNB(),
    "MLP": MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=200, alpha=1),
    "RF": RandomForestClassifier(),
    "SVM": SVC(probability=True),
    "XGB": XGBClassifier(),
}

# Run settings for EnsembleIntegration, gathered in one place so they are easy
# to scan and tweak. NOTE(review): k_outer / k_inner look like the fold counts
# of the nested cross validation mentioned in the training log - confirm
# against the ei package.
ei_settings = dict(
    k_outer=3,
    k_inner=3,
    n_samples=3,
    sampling_strategy="undersampling",
    sampling_aggregation="mean",
    n_jobs=-1,  # set as -1 to use all available CPUs
    random_state=38,
    parallel_backend="loky",
    project_name="cell-division",
    model_building=True,
    # calibration_model=CalibratedClassifierCV(),
)

EI = EnsembleIntegration(base_predictors=base_predictors, **ei_settings)

# Candidate meta-models passed to EI.train_meta, keyed by a short display name.
meta_models = {
    'AdaBoost': AdaBoostClassifier(),
    'DT': DecisionTreeClassifier(max_depth=3),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100),
    'KNN': KNeighborsClassifier(n_neighbors=40),
    'LR': LogisticRegression(C=0.5),
    'NB': GaussianNB(),
    # `(50)` is just the integer 50 in parentheses; the trailing comma makes it
    # a real one-element tuple, matching the tuple used for the base MLP.
    'MLP': MLPClassifier(hidden_layer_sizes=(50,), max_iter=200, alpha=1),
    'RF': RandomForestClassifier(),
    'SVM': SVC(probability=True, C=0.01),
    'XGB': XGBClassifier(),
}

In [19]:
# Train the base predictors once per modality. The same training labels are
# passed each time, and the dictionary key is used as the modality name.
for modality_name, X_modality in data.items():
    EI.train_base(X_modality, y_1_train, modality=modality_name)

# Kept as the last expression of the cell so its return value is displayed.
EI.train_meta(meta_models=meta_models)



##############################################################################################
######################################## X_1 modality ########################################
############################################################################################## 


Training base predictors and generating data for analysis...
Generating meta training data via nested cross validation...
Training base predictors on outer training sets...

Base predictor training is complete: see "base_summary" attribute for a summary of base predictor performance. Meta training data can be found in "meta_training_data" and "meta_test_data" attributes. Run "train_meta" method for analysis of ensemble algorithms.

Training base predictors and generating data for final ensemble...
Generating meta training data via nested cross validation...
Training base predictors on outer training sets...

Model building: meta training data for the final model has been generated and can be found in 

<ei.EnsembleIntegration at 0x7fc2d84add30>

In [23]:
# How many fitted base models the final ensemble holds for modality "X_1".
x_1_final_base_models = EI.final_models["base models"]["X_1"]
len(x_1_final_base_models)

30

In [47]:
# Held-out features, keyed with the same modality names used for training.
test_data = {
    "X_1": X_1_test,
    "X_2": X_2_test,
}

ensemble_method = "S.LR"

# Raw ensemble scores for the test samples. These are deliberately left
# untouched below so they stay available for other metrics or thresholds.
y_pred = EI.predict(X_dictionary=test_data, meta_model_key=ensemble_method)

# Decision threshold stored in the meta summary for this ensemble method.
threshold = EI.meta_summary["thresholds"][ensemble_method]["fmax (minority)"]

# Binarise into a NEW array with a single comparison. The previous version
# aliased y_pred and overwrote it in place with two sequential masked
# assignments, which destroyed the scores and mislabelled positives whenever
# threshold >= 1, because values just set to 1 were reset to 0.
# Assumes EI.predict returns a NumPy array (or pandas object) of scores - the
# original masked assignment needed the same.
y_pred_label = (y_pred > threshold).astype(int)

print(f1_score(y_1_test, y_pred_label))

0.5681818181818181


In [None]:
# Print the shape of the training portion of X for every split produced by the
# project's dummy_cv helper. The test indices are not needed here.
cv = dummy_cv(n_splits=1)
for train_ix, _ in cv.split(X=X, y=y):
    X_fold_train = X[train_ix, :]
    print(X_fold_train.shape)