In [4]:
from sklearn.model_selection import train_test_split
from DE import DiffentialEvolutionTuner
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from features_engineer import load_data, normalize_data, pca
from defines import *
import pickle
import learner_params as lp
import numpy as np
from timeit import default_timer as timer


# Untuned estimator for each supported learner key. The seeded models take
# their random_state from the SEED_* constants (presumably provided by the
# `from defines import *` line -- confirm).
learner_objs = dict(
    cart=tree.DecisionTreeClassifier(random_state=SEED_CART),
    rf=RandomForestClassifier(random_state=SEED_RF),
    nb=GaussianNB(),
    svm=SVC(random_state=SEED_SVM),
    knn=KNeighborsClassifier(),
)

def pickle_operating(fname, item, flag):
    """Load an object from, or save an object to, ``<fname>.pickle``.

    The resolved file name is printed before the file is opened.

    Args:
        fname: path of the pickle file without the ``.pickle`` suffix.
        item: object to serialize; ignored when loading.
        flag: ``1`` loads and returns the stored object; any other value
            writes ``item`` using the highest available pickle protocol.

    Returns:
        The unpickled object when ``flag == 1``, otherwise ``None``.
    """
    path = '%s.pickle' % fname
    print(path)
    if flag != 1:
        # Save mode: nothing is returned to the caller.
        with open(path, 'wb') as sink:
            pickle.dump(item, sink, protocol=pickle.HIGHEST_PROTOCOL)
        return None
    # Load mode. NOTE: unpickling can execute arbitrary code, so only read
    # files that come from a trusted source.
    with open(path, 'rb') as source:
        return pickle.load(source)

In [5]:
#train_path = './sentences.csv'
#X_train, y_train = load_data(train_path, False)
# Load the cached feature matrix and labels from their pickle files
# (flag=1 selects load mode, so `item` is unused).
X_t = pickle_operating(fname='X_pca_1000', item=None, flag=1)
y_train = pickle_operating(fname='y_train', item=None, flag=1)

X_pca_1000.pickle
y_train.pickle


In [7]:
# Hold out 33% of the data as the test set, then split the remaining
# 'merged' portion in half into the train and tune sets.
X, y = {}, {}
X['merged'], X['test'], y['merged'], y['test'] = train_test_split(
    X_t, y_train, test_size=0.33, random_state=42)
X['train'], X['tune'], y['train'], y['tune'] = train_test_split(
    X['merged'], y['merged'], test_size=0.5, random_state=42)

In [9]:
import features_engineer as fe
# Balance the training split with the project's own SMOTE helper
# (features_engineer.SMOTE); the result is stored under the 'train_smote' keys.
balance_klass = fe.SMOTE()
# NOTE(review): the meaning of this {0: 2, 1: 3} mapping is defined by
# fe.SMOTE.execute, which is not visible here -- confirm before changing it.
l = {0: 2, 1: 3}
X['train_smote'], y['train_smote'] = balance_klass.execute(l, samples=X['train'], labels=y['train'])

('Mean Length: ', 811)


In [None]:
#X['train'], y['train'] = reduce_data(y['train'], X['train'])

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split with imbalanced-learn's SMOTE. This writes
# the same X['train_smote'] / y['train_smote'] keys as the fe.SMOTE cell.
sm = SMOTE(random_state=SEED_SMOTE)
# `fit_sample` was renamed to `fit_resample` in imbalanced-learn 0.4 and the
# old alias was later removed; prefer the new name, fall back on old installs.
resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X['train_smote'], y['train_smote'] = resample(X['train'], y['train'])

In [10]:
from collections import Counter
# Per-class sample counts of the oversampled training labels.
smote_class_counts = Counter(y['train_smote'])
print('Original dataset shape {}'.format(smote_class_counts))

Original dataset shape Counter({'p': 811, 'o': 811, 'n': 811})


In [11]:
# Stack the oversampled training rows on top of the tune rows, and
# concatenate the labels in the same order.
X['merged_smote'] = np.vstack([X['train_smote'], X['tune']])
y['merged_smote'] = list(y['train_smote']) + list(y['tune'])

In [12]:
# Select the learner to tune: its parameter grid and its untuned estimator.
learner_name = 'svm'
learner_params = lp.param_grid[learner_name]
learner = learner_objs[learner_name]

In [14]:
# Run the differential-evolution tuner for the selected learner and print the
# wall-clock duration. The tuner receives the SMOTE-balanced training split,
# the tune split, the merged (train + tune) split and the held-out test split.
#X,Y = preprocess(dataset=dataset, do_smote = True)
start = timer()
# `paramgrid` is never defined in this notebook (it only worked through
# leftover kernel state); the grid chosen for the learner is `learner_params`.
# NOTE(review): X_merged/Y_merged receive the un-oversampled 'merged' split,
# while 'merged_smote' is built earlier but not used in the visible cells --
# confirm which one is intended.
de_tuner = DiffentialEvolutionTuner(learner=learner, learner_params=learner_params,
                                    X_train=X['train_smote'], Y_train=y['train_smote'],
                                    X_tune=X['tune'], Y_tune=y['tune'],
                                    X_merged=X['merged'], Y_merged=y['merged'],
                                    X_test=X['test'], Y_test=y['test'],
                                    np=10, goal="f1", life=3, cr=0.5, f=0.75)

score, best_params, tune_score, untuned_test_score, learner_fit = de_tuner.tune_and_evaluate(1)
duration = timer() - start
print(duration)

Time taken to compute untuned score: 19.0482060909 
Executing Iteration #1 of DE

Running generation 1

New member added : {'kernel': 'rbf', 'C': 46, 'coef0': 0.81632653061224492} | Score = 0.828195643239
New member added : {'kernel': 'rbf', 'C': 20, 'coef0': 0.15510204081632656} | Score = 0.829017673654
New member added : {'kernel': 'rbf', 'C': 38, 'coef0': 0.58673469387755106} | Score = 0.827373612824
Best member of population is {'kernel': 'poly', 'C': 14, 'coef0': 0.11836734693877551} | Score = 0.838471023428 with score = 0.838471

Running generation 2

New member added : {'kernel': 'rbf', 'C': 1, 'coef0': 0.9816326530612246} | Score = 0.848746403617
New member added : {'kernel': 'rbf', 'C': 24, 'coef0': 0.57755102040816331} | Score = 0.827373612824
Best member of population is {'kernel': 'rbf', 'C': 1, 'coef0': 0.9816326530612246} | Score = 0.848746403617 with score = 0.848746

Running generation 3

New member added : {'kernel': 'rbf', 'C': 1, 'coef0': 0.36632653061224496} | Score

In [14]:
#micro
from sklearn.model_selection import cross_validate
def tuned_learner(model):
    """Return a new, unfitted classifier with hard-coded hyper-parameters.

    Args:
        model: learner key, one of 'svm', 'rf', 'knn', 'cart' or 'nb'.

    Returns:
        A scikit-learn estimator instance configured for ``model``.

    Raises:
        NameError: if ``model`` is not one of the supported keys.
    """
    if model == 'svm':
        clf = SVC(kernel = 'rbf', C = 4, random_state = 1, coef0= 0.306) 
    elif model == 'rf':
        clf = RandomForestClassifier(max_leaf_nodes= 42, min_samples_leaf= 5, n_estimators= 50, max_features= 90, random_state= 1, min_samples_split= 18)
    elif model == 'knn':
        clf = KNeighborsClassifier(n_neighbors= 8, weights= 'distance')
    elif model == 'cart':
        clf = tree.DecisionTreeClassifier(max_features= 0.53163265306122454, min_samples_split= 12, random_state= 79, max_depth= 35, min_samples_leaf= 16)
    elif model == 'nb':
        # The error message always advertised 'nb'; no tuned settings are
        # recorded for it here, so the default GaussianNB is returned.
        clf = GaussianNB()
    else:
        # NameError is kept (rather than ValueError) so existing callers that
        # catch it keep working; the message now lists every supported key.
        raise NameError('Unknown machine learning model. Please use one of: svm, rf, knn, cart, nb')
    return clf

In [16]:
# Fit the random forest returned by tuned_learner on the SMOTE-balanced
# training split, then persist it to 'rf.pickle' (flag=0 selects save mode).
clf = tuned_learner('rf')
clf.fit(X['train_smote'], y['train_smote'])
pickle_operating(fname='rf', item=clf, flag=0)

rf.pickle


In [19]:
# 5-fold cross-validation of a fresh tuned random forest on the full loaded
# data (X_t, y_train), collecting micro precision/recall/F1 and macro F1.
clf = tuned_learner('rf')
cv_metrics = ['precision_micro', 'recall_micro', 'f1_micro', 'f1_macro']
scores = cross_validate(clf, X_t, y_train, scoring=cv_metrics, cv=5,
                        return_train_score=False)

In [20]:
# Function-call form works on both Python 2 and 3 and matches the other
# print(...) calls in this notebook.
print(scores)

{'score_time': array([ 0.08157611,  0.06754017,  0.07142806,  0.07508707,  0.06344986]), 'fit_time': array([ 28.33412099,  26.621454  ,  27.83389783,  29.41496801,  26.31560493]), 'test_f1_micro': array([ 0.87207703,  0.87826685,  0.87405368,  0.87327824,  0.87250172]), 'test_f1_macro': array([ 0.40883672,  0.46758123,  0.42798562,  0.40989172,  0.39916821]), 'test_recall_micro': array([ 0.87207703,  0.87826685,  0.87405368,  0.87327824,  0.87250172]), 'test_precision_micro': array([ 0.87207703,  0.87826685,  0.87405368,  0.87327824,  0.87250172])}


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier

# Base estimators for the ensemble, with hard-coded hyper-parameters.
clf1 = tree.DecisionTreeClassifier(
    max_features=0.16428571428571431, max_depth=44,
    min_samples_split=11, min_samples_leaf=10, random_state=79)
clf2 = KNeighborsClassifier()
clf3 = RandomForestClassifier(
    n_estimators=50, max_features=90, max_leaf_nodes=42,
    min_samples_split=18, min_samples_leaf=5, random_state=1)
# probability=True so the SVC exposes class probabilities for soft voting.
clf4 = SVC(kernel='poly', C=1, coef0=1.0, probability=True)

# Soft-voting ensemble; the weights follow the estimator order
# (cart=1, knn=2, rf=3, svm=4).
base_estimators = [('cart', clf1), ('knn', clf2), ('rf', clf3), ('svm', clf4)]
eclf3 = VotingClassifier(estimators=base_estimators, voting='soft',
                         weights=[1, 2, 3, 4], flatten_transform=True)

In [None]:
# 5-fold cross-validation of the voting ensemble on the full loaded data.
# This rebinds `scores`, replacing the result of the cross_validate cell.
scores = cross_val_score(estimator=eclf3, X=X_t, y=y_train, cv=5)

In [None]:
# classification_report was used below without ever being imported, which
# raised NameError; import it here so the cell runs on a fresh kernel.
from sklearn.metrics import classification_report

# Hold out 30% of the loaded data, fit the voting ensemble on the rest, print
# the fit time, then print a per-class report for the held-out part.
split = 0.3
size = int(len(y_train) * split)
# NOTE(review): this rebinds the notebook-level `y_train` to the training
# subset, so this cell (and earlier cells that pair X_t with y_train) cannot
# be re-run afterwards without reloading the labels. The binding is kept so
# any later cell relying on it still works.
# random_state makes the split reproducible, like the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(X_t, y_train, test_size=size, random_state=42)
start = timer()
eclf3.fit(X_train, y_train)
duration = timer() - start
print(duration)
# print('') emits a blank line on both Python 2 and 3; a bare print() shows
# "()" under a Python 2 kernel.
print('')
print('')
print("Scores on test set: %s" % classification_report(y_test, eclf3.predict(X_test)))
print('')