In [11]:
#import libraries
from sklearn.feature_selection import SelectPercentile
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import feature_selection
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import ensemble
from sklearn import neural_network
from sklearn import svm
from sklearn import kernel_ridge
from sklearn.decomposition import KernelPCA
from sklearn.feature_selection import RFECV
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import KFold
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb
from sklearn.cluster import DBSCAN
from sklearn.utils import resample
import warnings  
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE
from sklearn import svm
from numpy import loadtxt
import xgboost
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, KFold
from bayes_opt import BayesianOptimization
from imblearn.over_sampling import SMOTE

In [12]:
#define functions for loading data and producing final CSV 

'''
eliminate highly correlated features
'''
def to_be_eliminated(df):
    # Create correlation matrix
    corr_matrix = df.corr().abs()
    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))
    # Find features with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    return to_drop

'''
loading training and test datasets
'''
def load_data():
    X_train = pd.read_csv("X_train.csv")
    X_test = pd.read_csv("X_test.csv")
    y_train = pd.read_csv("y_train.csv")
     
    #dropping id column
    X_train = X_train.drop('id', axis = 1)
    X_test = X_test.drop('id', axis = 1)
    y_train = y_train.drop('id', axis = 1)
   
    #reshuffling data 
    X_train['y'] = y_train
    X_train = X_train.sample(frac=1).reset_index(drop=True)
    y_train = X_train['y']
    X_train = X_train.drop('y', axis = 1)
    
    to_drop = to_be_eliminated(X_train)
    
    for i in range(len(to_drop)):
        X_train = X_train.drop(to_drop[i], axis = 1)
    
    for i in range(len(to_drop)):
        X_test = X_test.drop(to_drop[i], axis = 1)

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    return X_train, X_test, y_train


'''
produce submission file
'''
def produce_solution(y):
    with open('out.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', lineterminator="\n")
        writer.writerow(['id', 'y'])
        for i in range(y.shape[0]):
            writer.writerow([float(i), y[i]])


In [7]:
'''

Approach 1: 

model assessment via 10 fold CV 
class imbalance is taken care of by oversamplingfrom classes 0 and 2

Important Note
Always split into test and train sets BEFORE trying oversampling techniques!
Oversampling before splitting the data can allow the exact same observations 
to be present in both the test and train sets. This can allow our model to simply 
memorize specific data points and cause overfitting and poor generalization to the test data.
'''
'''
#X and y are training x and y data 
#X_test_original corresponds to X_test.csv as given in the task 

X, X_test_original, y = load_data() 
y = y.ravel()
scores = np.array([])

kf = KFold(n_splits=5)
BMAC_scores = np.array([])

for train_index, test_index in kf.split(X):
    #define X_train and y_train as data in training folds (model is fitted here)
    #similarly, X_test, y_test as data in test fold (model is evaluated here)
    X_train, X_test = X[train_index], X[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    y_train, y_test = y[train_index], y[test_index]
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    
    y_train.columns = ['y']
    y_test.columns = ['y']

    #oversampling to offset class imbalance
    X_concat = pd.concat([X_train, y_train], axis=1)
    
    # separate minority and majority classes
    class_0 = X_concat[X_concat.y==0]
    class_1 = X_concat[X_concat.y==1]
    class_2 = X_concat[X_concat.y==2]

    #upsample minority -- classes 0 and 2
    class_0_upsampled = resample(class_0,
                          replace=True, # sample with replacement
                          n_samples=len(class_1), # match number in majority class
                          random_state=27) 
    class_2_upsampled = resample(class_2,
                          replace=True, # sample with replacement
                          n_samples=len(class_1), # match number in majority class
                          random_state=32)

    upsampled = pd.concat([class_1, class_0_upsampled, class_2_upsampled])
   
    y_train = upsampled.y
    X_train = upsampled.drop('y', axis=1)
    
    #1. Zero Mean, Unit Variance
    print("Standardize data")
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
#################################################################
#begin fitting model to training folds -- X_train 

    #2. Outlier detection
    print("Outlier Detection")
    isf = IsolationForest(n_estimators=100, contamination=0.30)
    outliers = isf.fit_predict(X_train)

    #DBScan = DBSCAN(eps = .5, metric='euclidean', min_samples = 30, n_jobs = -1)    
    #outliers = DBScan.fit_predict(X_train)

    unique, counts = np.unique(outliers, return_counts=True)
    count_dict = dict(zip(unique, counts))
    X_train = X_train[outliers == 1]
    y_train = y_train[outliers == 1]
    
    
    #3. Feature selection 
    print("Feature Selection")
    select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
    select.fit(X_train, y_train)
    X_train = select.transform(X_train)
    
    print("Fitting the model")
    clf = xgb.XGBClassifier(random_state=42, learning_rate=0.6, n_estimators=100, max_depth=10)
    #clf = RandomForestClassifier(n_estimators=300, max_depth=10)
    clf.fit(X_train, y_train)
    
#end model fitting on X_train
############################################################
        
    #prediction 
    print("Predicting")
    #selecting features based on training results
    X_test = select.transform(X_test)
    pred = clf.predict(X_test)
    
    #scoring
    score = balanced_accuracy_score(y_test, pred)
    print(score)
    scores = np.append(scores,score)
    
##########################################################    

truth = np.mean(scores)
std = np.std(scores)
print("mean expected error: ", truth, "std: ", std)

'''

'\n#X and y are training x and y data \n#X_test_original corresponds to X_test.csv as given in the task \n\nX, X_test_original, y = load_data() \ny = y.ravel()\nscores = np.array([])\n\nkf = KFold(n_splits=5)\nBMAC_scores = np.array([])\n\nfor train_index, test_index in kf.split(X):\n    #define X_train and y_train as data in training folds (model is fitted here)\n    #similarly, X_test, y_test as data in test fold (model is evaluated here)\n    X_train, X_test = X[train_index], X[test_index]\n    X_train = pd.DataFrame(X_train)\n    X_test = pd.DataFrame(X_test)\n\n    y_train, y_test = y[train_index], y[test_index]\n    y_train = pd.DataFrame(y_train)\n    y_test = pd.DataFrame(y_test)\n    \n    y_train.columns = [\'y\']\n    y_test.columns = [\'y\']\n\n    #oversampling to offset class imbalance\n    X_concat = pd.concat([X_train, y_train], axis=1)\n    \n    # separate minority and majority classes\n    class_0 = X_concat[X_concat.y==0]\n    class_1 = X_concat[X_concat.y==1]\n    

In [None]:
'''
BEST APPROACH SO FAR
    
Approach 2: 

model assessment via 5 fold CV 
class imbalance is taken care of by undersampling from class 1 
'''

#X and y are training x and y data 
#X_test_original corresponds to X_test.csv as given in the task 

X, X_test_original, y = load_data() 
y = y.ravel()
scores = np.array([])

kf = KFold(n_splits=5)
BMAC_scores = np.array([])

for train_index, test_index in kf.split(X):
    #define X_train and y_train as data in training folds (model is fitted here)
    #similarly, X_test, y_test as data in test fold (model is evaluated here)
    X_train, X_test = X[train_index], X[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    y_train, y_test = y[train_index], y[test_index]
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    
    y_train.columns = ['y']
    y_test.columns = ['y']
    
    #1. Zero Mean, Unit Variance
    print("Standardize data")
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train))
    X_test = scaler.transform(X_test)
    
    #2. Feature selection 
    print("Feature Selection")
    select = SelectFromModel(RandomForestClassifier(n_estimators=200, random_state=42))
    select.fit(X_train, y_train)
    X_train = pd.DataFrame(select.transform(X_train))
   
    '''
    #3. Outlier detection
    print("Outlier Detection")
    isf = IsolationForest(n_estimators=300, contamination=0.38)
    outliers = isf.fit_predict(X_train)
    unique, counts = np.unique(outliers, return_counts=True)
    count_dict = dict(zip(unique, counts))
    X_train = X_train[outliers == 1]
    y_train = y_train[outliers == 1]
    #DBScan = DBSCAN(eps = .5, metric=”euclidean”,min_samples = 30, n_jobs = -1)    
    #outliers = DBScan.fit_predict(X_train)
    '''
    
    '''
    #4. Undersampling from class 1 to offset class imbalance
    print('Undersampling')
    X_concat = pd.concat([X_train, y_train], axis=1)
    # separate minority and majority classes
    class_0 = X_concat[X_concat.y==0]
    class_1 = X_concat[X_concat.y==1]
    class_2 = X_concat[X_concat.y==2]

    #undersample majority class (1)
    class_1_under = resample(class_1,
                          replace=False, # sample with replacement
                          n_samples=len(class_0), # match number in minority classes
                          random_state=27) 
    undersampled = pd.concat([class_1_under, class_0, class_2])
    y_train = undersampled.y
    X_train = undersampled.drop('y', axis=1)
    '''
    #5. fitting model
    print("Fitting the model")
    #clf = xgb.XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=300, max_depth=10)
    #clf = RandomForestClassifier(n_estimators=300, max_depth=10)
    class_weight = y_train.shape[0] / (3 * np.bincount((y_train.iloc[:,0]).astype(int)))
    class_weights0 = { 
    0 : class_weight[0],
    1 : class_weight[1],
    2 : class_weight[2]
    }
    
    ########## BO
    def classifier(c0_weight=class_weight[0], c1_weight=class_weight[1], c2_weight=class_weight[2], 
                   xtrain=X_train, ytrain=y_train, xtest=X_test, ytest=y_test):
        class_weights1 = { 
        0 : c0_weight,
        1 : c1_weight,
        2 : c2_weight
        }
        clf = svm.SVC(class_weight=class_weights1)

        clf.fit(xtrain, ytrain)

        #6. prediction 
        #print("Predicting")
        #selecting features based on training results
        #_test_selected = pd.DataFrame(select.transform(xtest))  #note: transform was previosuly fitted on training folds
        pred = clf.predict(xtrain)

        #scoring
        score = balanced_accuracy_score(ytrain, pred)
        #print(score)
        #scores = np.append(scores,score)
        return score


    # specify parameters and distributions to sample from
    param_dist = {"c0_weight": (0, 4), "c1_weight": (0, 3), "c2_weight": (0, 4)}

    optimizer = BayesianOptimization(
        f=classifier,
        pbounds=param_dist,
        verbose=2,
        random_state=5,
    )

    probe_params = {"c0_weight": class_weight[0], "c1_weight": class_weight[1], "c2_weight": class_weight[2]}
    optimizer.probe(
        params=probe_params,
        lazy=True
    )

    optimizer.maximize(
        init_points=3,
        n_iter=80,
    )

    print(optimizer.max)
    

    ########## BO
    
    
    class_weights_test = { 
    0 : optimizer.max['params']['c0_weight'],
    1 : optimizer.max['params']['c1_weight'],
    2 : optimizer.max['params']['c2_weight']
    }
    clf2 = svm.SVC(class_weight=class_weights_test)
    clf2.fit(X_train, y_train)
    print("Predicting")
    #selecting features based on training results
    X_test_selected = pd.DataFrame(select.transform(X_test))  #note: transform was previosuly fitted on training folds
    pred2 = clf2.predict(X_test_selected)
    #scoring
    score2 = balanced_accuracy_score(y_test, pred2)
    print('Test score:', score2)    
    scores = np.append(scores,score2)
    
    
##########################################################

truth = np.mean(scores)
std = np.std(scores)
print("mean expected error: ", truth, "std: ", std)

Standardize data
Feature Selection
Fitting the model
|   iter    |  target   | c0_weight | c1_weight | c2_weight |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.8346  [0m | [0m 2.634   [0m | [0m 0.4472  [0m | [0m 2.602   [0m |
| [0m 2       [0m | [0m 0.6256  [0m | [0m 1.776   [0m | [0m 4.354   [0m | [0m 1.654   [0m |
| [95m 3       [0m | [95m 0.9062  [0m | [95m 7.349   [0m | [95m 2.442   [0m | [95m 4.894   [0m |
| [0m 4       [0m | [0m 0.8347  [0m | [0m 6.127   [0m | [0m 2.592   [0m | [0m 2.374   [0m |
| [0m 5       [0m | [0m 0.6345  [0m | [0m 8.0     [0m | [0m 0.0     [0m | [0m 8.0     [0m |
| [0m 6       [0m | [0m 0.6374  [0m | [0m 0.0     [0m | [0m 5.0     [0m | [0m 8.0     [0m |
| [95m 7       [0m | [95m 0.9194  [0m | [95m 8.0     [0m | [95m 5.0     [0m | [95m 8.0     [0m |
| [0m 8       [0m | [0m 0.3333  [0m | [0m 8.0     [0m | [0m 0.0     [0m | [0m 0.0     [0m

In [None]:
'''

'''
X_train, X_test, y_train = load_data() 
from sklearn.ensemble import VotingClassifier

class_weights0 = { 
0 : 2.67223382045929, 
1 : 0.44382801664355065, 
2 : 2.6834381551362685
}

class_weights1 = { 
0 : 2.6611226611226613, 
1 : 0.4435204435204435, 
2 : 2.7061310782241015
}

class_weights2 = { 
0 : 2.7176220806794054, 
1 : 0.44521739130434784, 
2 : 2.591093117408907
}

scaler = preprocessing.StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train))
X_test = pd.DataFrame(scaler.transform(X_test))
    

select = SelectFromModel(RandomForestClassifier(n_estimators=300, random_state=42))
select.fit(X_train, y_train)
X_train = pd.DataFrame(select.transform(X_train))
X_test = pd.DataFrame(select.transform(X_test))

clf0 = svm.SVC(class_weight=class_weights0)
clf1 = svm.SVC(class_weight=class_weights1)
clf2 = svm.SVC(class_weight=class_weights2)
eclf = VotingClassifier(estimators=[('clf0', clf0), ('clf1', clf1), ('clf2', clf2)], voting='hard')

eclf.fit(X_train, y_train)
pred = eclf.predict(pd.DataFrame(X_test))

produce_solution(pred)

In [87]:
'''
BAD
BAD
BAD

Approach 3: 

model assessment via 5 fold CV 
class imbalance is taken care by performing the following 2 steps
step 1: undersample from class 1 
step 2: oversample from class 0 and 2
this is a hybrid between approach 1 and 2 
we choose the resampling in order to end up with balanced sampled 
each of which contains exactly half of the initial datapoints in class 1
'''
'''
#X and y are training x and y data 
#X_test_original corresponds to X_test.csv as given in the task 

X, X_test_original, y = load_data() 
y = y.ravel()
scores = np.array([])

kf = KFold(n_splits=5)
BMAC_scores = np.array([])

for train_index, test_index in kf.split(X):
    #define X_train and y_train as data in training folds (model is fitted here)
    #similarly, X_test, y_test as data in test fold (model is evaluated here)
    X_train, X_test = X[train_index], X[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    y_train, y_test = y[train_index], y[test_index]
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    
    y_train.columns = ['y']
    y_test.columns = ['y']
    
    #1. Zero Mean, Unit Variance
    print("Standardize data")
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train))
    X_test = scaler.transform(X_test)
    
    #2. Feature selection 
    print("Feature Selection")
    select = SelectFromModel(RandomForestClassifier(n_estimators=300, random_state=42))
    select.fit(X_train, y_train)
    X_train = pd.DataFrame(select.transform(X_train))
    
    #3. Outlier detection
    print("Outlier Detection")
    isf = IsolationForest(n_estimators=300, contamination=0.38)
    outliers = isf.fit_predict(X_train)
    unique, counts = np.unique(outliers, return_counts=True)
    count_dict = dict(zip(unique, counts))
    X_train = X_train[outliers == 1]
    y_train = y_train[outliers == 1]
    #DBScan = DBSCAN(eps = .5, metric=”euclidean”,min_samples = 30, n_jobs = -1)    
    #outliers = DBScan.fit_predict(X_train)
    
    #4. Undersampling from class 1 to offset class imbalance
    print('Undersampling & Oversampling')
    X_concat = pd.concat([X_train, y_train], axis=1)
    # separate minority and majority classes
    class_0 = X_concat[X_concat.y==0]
    class_1 = X_concat[X_concat.y==1]
    class_2 = X_concat[X_concat.y==2]

    #undersample majority class (1)
    class_1_under = resample(class_1,
                          replace=False, # sample without replacement
                          n_samples=min(1000,len(class_1)), 
                          random_state=27) 
    class_0_over = resample(class_0,
                          replace=True, # sample with replacement
                          n_samples= min(1000,len(class_1)), 
                          random_state=27)
    class_2_over = resample(class_2,
                          replace=True, # sample with replacement
                          n_samples=min(1000,len(class_1)), 
                          random_state=27)    
    
    undersampled = pd.concat([class_1_under, class_0_over, class_2_over])
    y_train = undersampled.y
    X_train = undersampled.drop('y', axis=1)
      
    #5. fitting model
    print("Fitting the model")
    clf = xgb.XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=300, max_depth=10)
    #clf = RandomForestClassifier(n_estimators=300, max_depth=10)
    clf.fit(X_train, y_train)
        
    #6. prediction 
    print("Predicting")
    #selecting features based on training results
    X_test = pd.DataFrame(select.transform(X_test))  #note: transform was previosuly fitted on training folds
    pred = clf.predict(X_test)
    
    #scoring
    score = balanced_accuracy_score(y_test, pred)
    print(score)
    scores = np.append(scores,score)

##########################################################

truth = np.mean(scores)
std = np.std(scores)
print("mean expected error: ", truth, "std: ", std)

'''

Standardize data
Feature Selection
Outlier Detection
Undersampling & Oversampling
Fitting the model
Predicting


ValueError: feature_names mismatch: ['0  ', '1  ', '2  ', '3  ', '4  ', '5  ', '6  ', '7  ', '8  ', '9  ', '10 ', '11 ', '12 ', '13 ', '14 ', '15 ', '16 ', '17 ', '18 ', '19 ', '20 ', '21 ', '22 ', '23 ', '24 ', '25 ', '26 ', '27 ', '28 ', '29 ', '30 ', '31 ', '32 ', '33 ', '34 ', '35 ', '36 ', '37 ', '38 ', '39 ', '40 ', '41 ', '42 ', '43 ', '44 ', '45 ', '46 ', '47 ', '48 ', '49 ', '50 ', '51 ', '52 ', '53 ', '54 ', '55 ', '56 ', '57 ', '58 ', '59 ', '60 ', '61 ', '62 ', '63 ', '64 ', '65 ', '66 ', '67 ', '68 ', '69 ', '70 ', '71 ', '72 ', '73 ', '74 ', '75 ', '76 ', '77 ', '78 ', '79 ', '80 ', '81 ', '82 ', '83 ', '84 ', '85 ', '86 ', '87 ', '88 ', '89 ', '90 ', '91 ', '92 ', '93 ', '94 ', '95 ', '96 ', '97 ', '98 ', '99 ', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '200', '201', '202', '203', '204', '205'] ['0  ', '1  ', '2  ', '3  ', '4  ', '5  ', '6  ', '7  ', '8  ', '9  ', '10 ', '11 ', '12 ', '13 ', '14 ', '15 ', '16 ', '17 ', '18 ', '19 ', '20 ', '21 ', '22 ', '23 ', '24 ', '25 ', '26 ', '27 ', '28 ', '29 ', '30 ', '31 ', '32 ', '33 ', '34 ', '35 ', '36 ', '37 ', '38 ', '39 ', '40 ', '41 ', '42 ', '43 ', '44 ', '45 ', '46 ', '47 ', '48 ', '49 ', '50 ', '51 ', '52 ', '53 ', '54 ', '55 ', '56 ', '57 ', '58 ', '59 ', '60 ', '61 ', '62 ', '63 ', '64 ', '65 ', '66 ', '67 ', '68 ', '69 ', '70 ', '71 ', '72 ', '73 ', '74 ', '75 ', '76 ', '77 ', '78 ', '79 ', '80 ', '81 ', '82 ', '83 ', '84 ', '85 ', '86 ', '87 ', '88 ', '89 ', '90 ', '91 ', '92 ', '93 ', '94 ', '95 ', '96 ', '97 ', '98 ', '99 ', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '158', '159', '160', '161', '162', '163', '164', '165', '166', '167', '168', '169', '170', '171', '172', '173', '174', '175', '176', '177', '178', '179', '180', '181', '182', '183', '184', '185', '186', '187', '188', '189', '190', '191', '192', '193', '194', '195', '196', '197', '198', '199', '200']
expected 204, 203, 205, 201, 202 in input data

In [49]:
'''
Approach 4: 

model assessment via 5 fold CV 
class imbalance is taken care of by undersampling from class 1 
'''

#X and y are training x and y data 
#X_test_original corresponds to X_test.csv as given in the task 

X, X_test_original, y = load_data() 
y = y.ravel()
scores = np.array([])

kf = KFold(n_splits=5)
BMAC_scores = np.array([])

for train_index, test_index in kf.split(X):
    #define X_train and y_train as data in training folds (model is fitted here)
    #similarly, X_test, y_test as data in test fold (model is evaluated here)
    X_train, X_test = X[train_index], X[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    y_train, y_test = y[train_index], y[test_index]
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    
    y_train.columns = ['y']
    y_test.columns = ['y']
    
    smote = SMOTE('minority')
    X_train, y_train = smote.fit_sample(X_train, y_train)

    '''
    #undersampling from class 1 to offset class imbalance
    X_concat = pd.concat([X_train, y_train], axis=1)
    
    # separate minority and majority classes
    class_0 = X_concat[X_concat.y==0]
    class_1 = X_concat[X_concat.y==1]
    class_2 = X_concat[X_concat.y==2]

    #upsample minority -- classes 0 and 2
    class_1_under = resample(class_1,
                          replace=False, # sample with replacement
                          n_samples=len(class_0), # match number in minority classes
                          random_state=27) 

    undersampled = pd.concat([class_1_under, class_0, class_2])
   
    y_train = undersampled.y
    X_train = undersampled.drop('y', axis=1)
    '''
    
    
    #1. Zero Mean, Unit Variance
    print("Standardize data")
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
#################################################################
#begin fitting model to training folds -- X_train 

    #2. Feature selection 
    print("Feature Selection")
    select = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
    select.fit(X_train, y_train)
    X_train = select.transform(X_train)

    #3. Outlier detection
    print("Outlier Detection")
    isf = IsolationForest(n_estimators=100, contamination=0.35)
    outliers = isf.fit_predict(X_train)
    
    #DBScan = DBSCAN(eps = .5, metric=”euclidean”,min_samples = 30, n_jobs = -1)    
    #outliers = DBScan.fit_predict(X_train)

    unique, counts = np.unique(outliers, return_counts=True)
    count_dict = dict(zip(unique, counts))
    X_train = X_train[outliers == 1]
    y_train = y_train[outliers == 1]
    
    print("Fitting the model")
    clf = xgb.XGBClassifier(random_state=42, learning_rate=0.5, n_estimators=100, max_depth=10)
    #clf = RandomForestClassifier(n_estimators=300, max_depth=10)
    clf.fit(X_train, y_train)
    
#end model fitting on X_train
############################################################
        
    #prediction 
    print("Predicting")
    #selecting features based on training results
    X_test = select.transform(X_test)
    pred = clf.predict(X_test)
    
    #scoring
    score = balanced_accuracy_score(y_test, pred)
    print(score)
    scores = np.append(scores,score)

##########################################################

truth = np.mean(scores)
std = np.std(scores)
print("mean expected error: ", truth, "std: ", std)

Standardize data
Feature Selection
Outlier Detection
Fitting the model
Predicting
0.5357672053022566
Standardize data
Feature Selection


KeyboardInterrupt: 