<a href="https://colab.research.google.com/github/MiraGles/Machine-learning-traininng/blob/main/Evolutionary_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
from evolutionary_search import EvolutionaryAlgorithmSearchCV, maximize
import sklearn.datasets
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
import unittest
import random



def func(x, y, m=1.0, z=False):
    """Scaled Gaussian bump in ``x`` and ``y`` with an optional unit offset.

    Evaluates ``m * (exp(-(x**2 + y**2)) + float(z))``. ``z`` goes through
    ``float``, so ``True`` contributes 1.0 and ``False`` contributes 0.0.
    """
    squared_radius = x ** 2 + y ** 2
    bump = np.exp(-squared_radius)
    return m * (bump + float(z))


def readme():
    """Fit the small evolutionary hyper-parameter search on the digits data.

    Loads the scikit-learn digits dataset, seeds Python's ``random`` module
    with 1, fits an ``EvolutionaryAlgorithmSearchCV`` around ``SVC()`` over a
    log-spaced ``C``/``gamma`` grid with the RBF kernel, and returns the
    fitted search object.
    """
    digits = sklearn.datasets.load_digits()
    features, labels = digits["data"], digits["target"]

    # 25 log-spaced candidates between 1e-9 and 1e9 for each of C and gamma.
    search_space = {
        "kernel": ["rbf"],
        "C": np.logspace(-9, 9, num=25, base=10),
        "gamma": np.logspace(-9, 9, num=25, base=10),
    }

    random.seed(1)

    # Settings of the evolutionary loop itself, kept apart from the CV setup.
    evolution_settings = dict(
        population_size=10,
        gene_mutation_prob=0.10,
        gene_crossover_prob=0.5,
        tournament_size=3,
        generations_number=5,
    )
    search = EvolutionaryAlgorithmSearchCV(
        estimator=SVC(),
        params=search_space,
        scoring="accuracy",
        cv=StratifiedKFold(n_splits=4),
        verbose=1,
        **evolution_settings
    )

    search.fit(features, labels)
    return search

class TestEvolutionarySearch(unittest.TestCase):
    def test_cv(self):
        def try_with_params(**kwargs):
            cv = readme()
            cv_results_ = cv.cv_results_
            print("CV Results:\n{}".format(cv_results_))
            self.assertIsNotNone(cv_results_, msg="cv_results is None.")
            self.assertNotEqual(cv_results_, {}, msg="cv_results is empty.")
            self.assertAlmostEqual(
                cv.best_score_,
                1.0,
                delta=0.05,
                msg="Did not find the best score. Returned: {}".format(cv.best_score_),
            )

        try_with_params()


    def test_optimize(self):
        """ Simple hill climbing optimization with some twists. """

        param_grid = {"x": [-1.0, 0.0, 1.0], "y": [-1.0, 0.0, 1.0], "z": [True, False]}
        args = {"m": 1.0}

        best_params, best_score, score_results, _, _ = maximize(
            func, param_grid, args, verbose=True
        )
        print("Score Results:\n{}".format(score_results))

        self.assertEqual(best_params, {"x": 0.0, "y": 0.0, "z": True})
        self.assertEqual(best_score, 2.0)




In [3]:
!pip install sklearn-genetic

Collecting sklearn-genetic
  Downloading sklearn_genetic-0.5.1-py3-none-any.whl (11 kB)
Collecting deap>=1.0.2
  Downloading deap-1.3.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (160 kB)
[K     |████████████████████████████████| 160 kB 5.0 MB/s 
Installing collected packages: deap, sklearn-genetic
Successfully installed deap-1.3.1 sklearn-genetic-0.5.1


In [4]:
!pip install sklearn-deap

Collecting sklearn-deap
  Downloading sklearn_deap-0.3.0-py3-none-any.whl (11 kB)
Installing collected packages: sklearn-deap
Successfully installed sklearn-deap-0.3.0


In [10]:
# Stand-alone run of the evolutionary search on the digits data, using a
# population of 50 and 4 parallel jobs.
import random

import numpy as np
import sklearn.datasets
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

from evolutionary_search import EvolutionaryAlgorithmSearchCV

data = sklearn.datasets.load_digits()
X, y = data["data"], data["target"]

# 25 log-spaced candidates between 1e-9 and 1e9 for each of C and gamma.
paramgrid = {
    "kernel": ["rbf"],
    "C": np.logspace(-9, 9, num=25, base=10),
    "gamma": np.logspace(-9, 9, num=25, base=10),
}

random.seed(1)

cv = EvolutionaryAlgorithmSearchCV(
    estimator=SVC(),
    params=paramgrid,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=4),
    verbose=1,
    population_size=50,
    gene_mutation_prob=0.10,
    gene_crossover_prob=0.5,
    tournament_size=3,
    generations_number=5,
    n_jobs=4,
)
cv.fit(X, y)

Types [1, 2, 2] and maxint [0, 24, 24] detected
--- Evolve in 625 possible combinations ---
gen	nevals	avg    	min    	max     	std     
0  	50    	0.24453	0.10128	0.968837	0.311932
1  	36    	0.44739	0.10128	0.971619	0.403942
2  	23    	0.769705	0.101836	0.971619	0.337089
3  	31    	0.948158	0.148024	0.971619	0.114484
4  	30    	0.934869	0.10128 	0.971619	0.165466
5  	19    	0.968948	0.968837	0.971619	0.00054524
Best individual is: {'kernel': 'rbf', 'C': 5.623413251903491, 'gamma': 0.00017782794100389227}
with fitness: 0.9716193656093489


In [16]:
!pip install Preprocessing

Collecting Preprocessing
  Downloading preprocessing-0.1.13-py3-none-any.whl (349 kB)
[K     |████████████████████████████████| 349 kB 4.9 MB/s 
[?25hCollecting nltk==3.2.4
  Downloading nltk-3.2.4.tar.gz (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 8.0 MB/s 
[?25hCollecting sphinx-rtd-theme==0.2.4
  Downloading sphinx_rtd_theme-0.2.4-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 22.7 MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.2.4-py3-none-any.whl size=1367722 sha256=99a192fbafb7019f452e56e78146ad61b53765fdfd582d7c1a03aabd759cbdd6
  Stored in directory: /root/.cache/pip/wheels/90/5e/9e/4cb46185f2a16c60e6fc524372ba7fef89ce3347734c8798b6
Successfully built nltk
Installing collected packages: sphinx-rtd-theme, nltk, Preprocessing
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
  

In [18]:
!pip install fifa_preprocessing


Collecting fifa_preprocessing
  Downloading fifa_preprocessing-1.1.2-py3-none-any.whl (8.1 kB)
Installing collected packages: fifa-preprocessing
Successfully installed fifa-preprocessing-1.1.2


In [21]:
# Some bits of code I found that I may need eventually

# KDE, https://jakevdp.github.io/blog/2013/12/01/kernel-density-estimation/

from sklearn.neighbors import KernelDensity
from scipy.stats import gaussian_kde
from statsmodels.nonparametric.kernel_density import KDEMultivariate

def kde_scipy(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel density estimate of ``x`` evaluated on ``x_grid`` via SciPy (use with small data).

    SciPy weights its bandwidth by the covariance of the input data, so the
    requested ``bandwidth`` is divided by the sample standard deviation
    (``ddof=1``) to keep results comparable with the other KDE helpers.
    Extra keyword arguments are forwarded to ``gaussian_kde``.
    """
    scaled_bandwidth = bandwidth / x.std(ddof=1)
    estimator = gaussian_kde(x, bw_method=scaled_bandwidth, **kwargs)
    return estimator.evaluate(x_grid)
    
def kde_statsmodels_m(x, x_grid, bandwidth=0.2, **kwargs):
    """Multivariate kernel density estimate via Statsmodels (use with heterogeneous data).

    Builds a ``KDEMultivariate`` on ``x`` with ``var_type='c'`` and a bandwidth
    array shaped like ``x`` whose entries all equal ``bandwidth``, then returns
    its ``pdf`` evaluated at ``x_grid``. Extra keyword arguments are forwarded
    to ``KDEMultivariate``.
    """
    bandwidth_array = bandwidth * np.ones_like(x)
    model = KDEMultivariate(x, bw=bandwidth_array, var_type='c', **kwargs)
    return model.pdf(x_grid)
    
def kde_sklearn(x, x_grid, bandwidth=0.2, **kwargs):
    """Kernel density estimate of ``x`` evaluated on ``x_grid`` via scikit-learn (general use).

    ``x`` and ``x_grid`` each get a trailing axis added before being handed to
    ``KernelDensity``. Extra keyword arguments are forwarded to the
    ``KernelDensity`` constructor.
    """
    estimator = KernelDensity(bandwidth=bandwidth, **kwargs)
    samples_column = x[:, np.newaxis]
    estimator.fit(samples_column)
    # score_samples() yields log-densities; exponentiate to return densities.
    grid_column = x_grid[:, np.newaxis]
    return np.exp(estimator.score_samples(grid_column))
    
    
# ROC curves plotting, Yhat blog

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Synthetic binary classification problem: 10,000 samples with 10 features,
# 5 of them informative.
X, y = make_classification(
    n_samples=10000,
    n_features=10,
    n_classes=2,
    n_informative=5,
)

# The first 9,000 rows train the model; the remaining 1,000 are held out.
Xtrain, Xtest = X[:9000], X[9000:]
ytrain, ytest = y[:9000], y[9000:]

clf = LogisticRegression()
clf.fit(Xtrain, ytrain)

from sklearn import metrics
import pandas as pd
from ggplot import *

# Scores for the ROC curve come from the second column of predict_proba.
preds = clf.predict_proba(Xtest)[:, 1]
fpr, tpr, _ = metrics.roc_curve(ytest, preds)

df = pd.DataFrame({"fpr": fpr, "tpr": tpr})

# ROC curve with a dashed reference line.
(
    ggplot(df, aes(x='fpr', y='tpr'))
    + geom_line()
    + geom_abline(linetype='dashed')
)

# Same curve with the area underneath shaded and the AUC in the title.
auc = metrics.auc(fpr, tpr)
(
    ggplot(df, aes(x='fpr', ymin=0, ymax='tpr'))
    + geom_area(alpha=0.2)
    + geom_line(aes(y='tpr'))
    + ggtitle("ROC Curve w/ AUC=%s" % str(auc))
)


# GridSearch in sklearn, from CS109
# evolutionary algorithm to replace gridsearch from https://github.com/rsteca/sklearn-deap
# Evolutionary not tested, may return wrong values
# sklearn.grid_search and sklearn.cross_validation were removed from scikit-learn;
# GridSearchCV and train_test_split now live in sklearn.model_selection, which
# this notebook already imports from elsewhere.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix

from evolutionary_search import EvolutionaryAlgorithmSearchCV

def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None, evo=None, population_size=5):
    """Tune ``clf`` over ``parameters`` with cross-validation and return the best estimator.

    Parameters
    ----------
    clf : estimator to tune; passed as the first argument of the search object.
    parameters : dict mapping parameter names to candidate values.
    X, y : data passed to the search object's ``fit``.
    n_jobs : number of parallel jobs, forwarded to whichever search is used.
    n_folds : cross-validation setting, forwarded as ``cv`` to whichever search is used.
    score_func : forwarded as ``scoring``; any falsy value is passed on as ``None``.
    evo : when truthy, use ``EvolutionaryAlgorithmSearchCV``; otherwise ``GridSearchCV``.
    population_size : population size, used by the evolutionary search only.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted search object.
    """
    scoring = score_func if score_func else None

    if evo:
        gs = EvolutionaryAlgorithmSearchCV(
            clf,  # was the undefined name `pipeline`
            params=parameters,  # keyword is `params`, as used elsewhere in this notebook
            scoring=scoring,
            cv=n_folds,
            # Verbose output was only requested when no scorer was supplied; keep that.
            verbose=scoring is None,
            n_jobs=n_jobs,
            population_size=population_size,
        )
    else:
        gs = GridSearchCV(clf, param_grid=parameters, scoring=scoring, cv=n_folds, n_jobs=n_jobs)

    gs.fit(X, y)

    # Prefer `cv_results_`; fall back to `grid_scores_` for search objects that only
    # expose the older attribute, and print None if neither is present.
    scores = getattr(gs, "cv_results_", None)
    if scores is None:
        scores = getattr(gs, "grid_scores_", None)
    print("BEST", gs.best_params_, gs.best_score_, scores)

    return gs.best_estimator_




# https://github.com/databricks/spark-sklearn
# sklearn.grid_search was removed from scikit-learn; GridSearchCV now lives in
# sklearn.model_selection.
from sklearn import model_selection, datasets
from sklearn.ensemble import RandomForestClassifier
# Use spark_sklearn's grid search instead:
from sklearn.model_selection import GridSearchCV
# NOTE(review): this rebinds the bare name GridSearchCV to the spark_sklearn class
# and needs the spark_sklearn package installed. The search below still goes
# through scikit-learn's own GridSearchCV via the module attribute, so confirm
# whether this import is wanted.
from spark_sklearn import GridSearchCV

digits = datasets.load_digits()
X, y = digits.data, digits.target

# min_samples_split starts at 2: scikit-learn rejects an integer value of 1.
param_grid = {"max_depth": [3, None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
              "n_estimators": [10, 20, 40, 80]}
gs = model_selection.GridSearchCV(RandomForestClassifier(), param_grid=param_grid)
gs.fit(X, y)

  import pandas.util.testing as tm


ModuleNotFoundError: ignored

In [22]:
!pip install ggplot

Collecting ggplot
  Downloading ggplot-0.11.5-py2.py3-none-any.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 5.3 MB/s 
[?25hCollecting brewer2mpl
  Downloading brewer2mpl-1.4.1-py2.py3-none-any.whl (24 kB)
Installing collected packages: brewer2mpl, ggplot
Successfully installed brewer2mpl-1.4.1 ggplot-0.11.5
