In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
Health=pd.read_excel('Health.xlsx')
Health.head()

Unnamed: 0,sen,jens,dard ghafase sineh,feshar khun dar halat esterahat,kolestrol,ghand khun nashta,navar ghalb dar halat esterahat,hadaksar zaraban ghalb,anjin sadri nashi az varzesh,afsordegi st nashi az tamrin va varzesh nesbat be halat esterahat,shibe tamrin dar oje tamrin dar maghtae ST,tedad oroghe bozorg rangi ba flourosopy,talasemi,ehtemal voghu bimari ghalbi
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0,3,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,3,0


In [2]:
Health=Health.rename(columns={'ehtemal voghu bimari ghalbi':'Outcome'})
Health.head(3)

Unnamed: 0,sen,jens,dard ghafase sineh,feshar khun dar halat esterahat,kolestrol,ghand khun nashta,navar ghalb dar halat esterahat,hadaksar zaraban ghalb,anjin sadri nashi az varzesh,afsordegi st nashi az tamrin va varzesh nesbat be halat esterahat,shibe tamrin dar oje tamrin dar maghtae ST,tedad oroghe bozorg rangi ba flourosopy,talasemi,Outcome
0,63,1,1,145,233,1,2,150,0,2.3,3,0,6,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,3,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,7,1


In [3]:
from sklearn.model_selection import train_test_split
x=Health[['sen','jens','dard ghafase sineh','feshar khun dar halat esterahat','kolestrol','ghand khun nashta','navar ghalb dar halat esterahat','hadaksar zaraban ghalb','anjin sadri nashi az varzesh','afsordegi st nashi az tamrin va varzesh nesbat be halat esterahat','shibe tamrin dar oje tamrin dar maghtae ST','talasemi']]
y=Health['Outcome']
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.5,random_state=0)

In [4]:
#FeaturesSelectionGA
#This class uses Genetic Algorithm to find out the best features for an input model
    #using Distributed Evolutionary Algorithms in Python(DEAP) package. Default toolbox is
    #used for GA but it can be changed accordingly.

In [5]:
from deap import base, creator
import random
import numpy as np
from deap import tools
import fitness_function as ff


class FeatureSelectionGA:
    """
        FeaturesSelectionGA
        This class uses Genetic Algorithm to find out the best features for an input model
        using Distributed Evolutionary Algorithms in Python(DEAP) package. Default toolbox is
        used for GA but it can be changed accordingly.

    
    """
    def __init__(self,model,x,y,cv_split=5,verbose=0):
        """
            Parameters
            -----------
            model : scikit-learn supported model, 
                x :  {array-like}, shape = [n_samples, n_features]
                     Training vectors, where n_samples is the number of samples 
                     and n_features is the number of features.
 
                y  : {array-like}, shape = [n_samples]
                     Target Values
            cv_split: int
                     Number of splits for cross_validation to calculate fitness.
            
            verbose: 0 or 1
        """
        self.model =  model
        self.n_features = x.shape[1]
        self.toolbox = None
        self.creator = self._create()
        self.cv_split = cv_split
        self.x = x
        self.y = y
        self.verbose = verbose
        if self.verbose==1:
            print("Model {} will select best features among {} features using cv_split :{}.".format(model,x.shape[1],cv_split))
            print("Shape od train_x: {} and target: {}".format(x.shape,y.shape))
        self.final_fitness = []
        self.fitness_in_generation = {}
    	#self.best_ind = None
    
    def evaluate(self,individual):
        fit_obj = ff.FitenessFunction(self.cv_split)
        np_ind = np.asarray(individual)
        if np.sum(np_ind) == 0:
            fitness = 0.0
        else:
            feature_idx = np.where(np_ind==1)[0]
            fitness = fit_obj.calculate_fitness(self.model,self.x[:,feature_idx],self.y)
        
        if self.verbose == 1:
            print("Individual: {}  Fitness_score: {} ".format(individual,fitness))
            
        return fitness,
    
    
    def _create(self):
        creator.create("FeatureSelect", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FeatureSelect)
        return creator
    
    def create_toolbox(self):
        """ 
            Custom creation of toolbox.
            Parameters
            -----------
                self
            Returns
            --------
                Initialized toolbox
        """
        
        self._init_toolbox()
        return toolbox
        
    def register_toolbox(self,toolbox):
        """ 
            Register custom created toolbox. Evalute function will be registerd
            in this method.
            Parameters
            -----------
                Registered toolbox with crossover,mutate,select tools except evaluate
            Returns
            --------
                self
        """
        toolbox.register("evaluate", self.evaluate)
        self.toolbox = toolbox
     
    
    def _init_toolbox(self):
        toolbox = base.Toolbox()
        toolbox.register("attr_bool", random.randint, 0, 1)
        # Structure initializers
        toolbox.register("individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, self.n_features)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)
        return toolbox
        
        
    def _default_toolbox(self):
        toolbox = self._init_toolbox()
        toolbox.register("mate", tools.cxTwoPoint)
        toolbox.register("mutate", tools.mutFlipBit, indpb=0.1)
        toolbox.register("select", tools.selTournament, tournsize=3)
        toolbox.register("evaluate", self.evaluate)
        return toolbox
    
    def get_final_scores(self,pop,fits):
        self.final_fitness = list(zip(pop,fits))
        
    
        
    def generate(self,n_pop,cxpb = 0.5,mutxpb = 0.2,ngen=5,set_toolbox = False):
        
        """ 
            Generate evolved population
            Parameters
            -----------
                n_pop : {int}
                        population size
                cxpb  : {float}
                        crossover probablity
                mutxpb: {float}
                        mutation probablity
                n_gen : {int}
                        number of generations
                set_toolbox : {boolean}
                              If True then you have to create custom toolbox before calling 
                              method. If False use default toolbox.
            Returns
            --------
                Fittest population
        """
        
        
        
        if self.verbose==1:
            print("Population: {}, crossover_probablity: {}, mutation_probablity: {}, total generations: {}".format(n_pop,cxpb,mutxpb,ngen))
        
        if not set_toolbox:
            self.toolbox = self._default_toolbox()
        else:
            raise Exception("Please create a toolbox.Use create_toolbox to create and register_toolbox to register. Else set set_toolbox = False to use defualt toolbox")
        pop = self.toolbox.population(n_pop)
        CXPB, MUTPB, NGEN = cxpb,mutxpb,ngen

        # Evaluate the entire population
        print("EVOLVING.......")
        fitnesses = list(map(self.toolbox.evaluate, pop))
        
        for ind, fit in zip(pop, fitnesses):
            ind.fitness.values = fit

        for g in range(NGEN):
            print("-- GENERATION {} --".format(g+1))
            offspring = self.toolbox.select(pop, len(pop))
            self.fitness_in_generation[str(g+1)] = max([ind.fitness.values[0] for ind in pop])
            # Clone the selected individuals
            offspring = list(map(self.toolbox.clone, offspring))

            # Apply crossover and mutation on the offspring
            for child1, child2 in zip(offspring[::2], offspring[1::2]):
                if random.random() < CXPB:
                    self.toolbox.mate(child1, child2)
                    del child1.fitness.values
                    del child2.fitness.values

            for mutant in offspring:
                if random.random() < MUTPB:
                    self.toolbox.mutate(mutant)
                    del mutant.fitness.values

            # Evaluate the individuals with an invalid fitness
            weak_ind = [ind for ind in offspring if not ind.fitness.valid]
            fitnesses = list(map(self.toolbox.evaluate, weak_ind))
            for ind, fit in zip(weak_ind, fitnesses):
                ind.fitness.values = fit
            print("Evaluated %i individuals" % len(weak_ind))

            # The population is entirely replaced by the offspring
            pop[:] = offspring
            
                    # Gather all the fitnesses in one list and print the stats
        fits = [ind.fitness.values[0] for ind in pop]
        
        length = len(pop)
        mean = sum(fits) / length
        sum2 = sum(x*x for x in fits)
        std = abs(sum2 / length - mean**2)**0.5
        if self.verbose==1:
            print("  Min %s" % min(fits))
            print("  Max %s" % max(fits))
            print("  Avg %s" % mean)
            print("  Std %s" % std)
    
        print("-- Only the fittest survives --")

        self.best_ind = tools.selBest(pop, 1)[0]
        print("Best individual is %s, %s" % (best_ind, best_ind.fitness.values))
        self.get_final_scores(pop,fits)
        
        return pop
    
   
    
    




In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np
def iterative_forward_select(X, y, depth=None):
    """
    Performes iterative forward selection on X with targets y.
    Currently, only model is linear regression, but this should be
    simple to generalize.
    Args:
    X (pd.DataFrame): DataFrame of predictor variables, dimensions
    (n_samples, n_features)
    y (pd.Series): Series of target values, dimensions (n_samples, 1)
    depth (int): Number of iterations before terminating
    Returns:
    None
    Outputs visualization of each iteration. For each forward
    select iteration, the residuals of the previous model are
    plotted against each predictor variable. Each plot is
    sorted by how predictive the predictor variable is after
    already regressing on the previously selected variables.
    """
    if not depth:
        
        depth = len(X.columns)
        current_params = []
        iteration = 1
        while iteration < depth:
            
            best_feature = forward_select(X, y, current_params, iteration)
            current_params.append(best_feature)
            iteration += 1
            return None
    
        def forward_select(X, y, current_params, iteration):
            scores = []
            columns = np.array([x for x in X.columns if x not in current_params])
            """
            A single round of forward selection. Determines most predictive
            variable, and plots how predictive each variable after
            regressing on previously selected variables.
            Args:
            X (pd.DataFrame): DataFrame of predictor variables, dimensions
            (n_samples, n_features)
            y (pd.Series): Series of target values, dimensions (n_samples, 1)
            current_params (list): list of variables already selected for
            regression
            iteration (int): which iteration through forward selection are
            we on
            Returns:
            Selected Variable (str): Name of variable that minimizes
            training error.
            Note, this is the best variable after already regressing
            on previously selected variables.
            """
            
            

            
            for column in columns:
                model = LinearRegression()
                test_params = current_params + [column]
                model.fit(X[test_params], y)
                score = model.score(X[test_params], y)
                scores.append(score)
                columns = columns[np.argsort(scores)][::-1]
                scores = np.sort(scores)[::-1]
                residuals = get_residuals(X, y, current_params)
                show_correlations(X, residuals, current_params, iteration, scores, columns)
                return columns[0]
                    
        def get_residuals(X, y, current_params):
            
            """
                Calculates residuals for current model
                Args:
                X (pd.DataFrame): DataFrame of predictor variables, dimensions
                (n_samples, n_features)
                y (pd.Series): Series of target values, dimensions (n_samples, 1)
                current_params (list): list of variables already selected for
                regression
                iteration (int): which iteration through forward selection are we on
                Returns:
                residuals (list): list of residuals for each observation
            """
            if len(current_params) == 0:
                current_estimate = np.mean(y)
                residuals = y - current_estimate
            else: 
                model = LinearRegression()
                model.fit(X[current_params], y)
                current_estimate = model.predict(X[current_params])
                residuals = y - current_estimate
                return residuals
                def show_correlations(X, residuals, current_params, iteration, scores,
                columns):
                    
                    """
                    Makes plot of correlation between each remaining predictor variable
                    and residuals. Each plot is ranked by how predictive the predictor
                    variable is after already regressing on the previously selected
                    variables.
                    Args:
                    X (pd.DataFrame): DataFrame of predictor variables, dimensions
                    (n_samples, n_features)
                    residuals (list): list of residuals for each observation
                    current_params (list): list of variables already selected for
                    regression
                    iteration (int): which iteration through forward selection are
                    we on
                    scores (list): list of scores representing how well a variable
                    improves the previous model
                    columns (list): list of predictor variable names
                    Returns:
                    None
                    Outputs files in plots/ directory with filenames being the
                    predictor variable names included in the previous model.
                    """

                    n_features = len(columns)
                    f, axarr = plt.subplots(n_features, sharex=True, sharey=True,
                    figsize=(6, 6 * n_features))
                    for i in range(n_features):
                        axarr[i].scatter(X[columns[i]], residuals)
                        axarr[i].set_ylabel(residuals.name, fontsize=18)
                        axarr[i].set_xlabel(columns[i], fontsize=18)
                        axarr[i].set_title('R^2: ' + str(scores[i]), fontsize=18)
                        plt.tight_layout()
                        filename = str(iteration) + '_' + '_'.join(current_params)
                        filename = filename.replace("/", "")
                        filename = filename.replace(".", "")
                        plt.savefig('plots/' + filename)
                        return None

  return f(*args, **kwds)


In [18]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from deap import creator, base, tools, algorithms
import sys


def avg(l):
    """
    Returns the average between list elements
    """
    return (sum(l)/float(len(l)))


def getFitness(individual, X, y):
    """
    Feature subset fitness function
    """

    if(individual.count(0) != len(individual)):
        # get index with value 0
        cols = [index for index in range(
            len(individual)) if individual[index] == 0]

        # get features subset
        X_parsed = X.drop(X.columns[cols], axis=1)
        X_subset = pd.get_dummies(X_parsed)

        # apply classification algorithm
        clf = LogisticRegression()

        return (avg(cross_val_score(clf, X_subset, y, cv=5)),)
    else:
        return(0,)


def geneticAlgorithm(X, y, n_population, n_generation):
    """
    Deap global variables
    Initialize variables to use eaSimple
    """
    # create individual
    creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    creator.create("Individual", list, fitness=creator.FitnessMax)

    # create toolbox
    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat,
                     creator.Individual, toolbox.attr_bool, len(X.columns))
    toolbox.register("population", tools.initRepeat, list,
                     toolbox.individual)
    toolbox.register("evaluate", getFitness, X=X, y=y)
    toolbox.register("mate", tools.cxOnePoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    # initialize parameters
    pop = toolbox.population(n=n_population)
    hof = tools.HallOfFame(n_population * n_generation)
    stats = tools.Statistics(lambda ind: ind.fitness.values)
    stats.register("avg", np.mean)
    stats.register("min", np.min)
    stats.register("max", np.max)

    # genetic algorithm
    pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.2, mutpb=0.2,
                                   ngen=n_generation, stats=stats, halloffame=hof,
                                   verbose=True)

    # return hall of fame
    return hof


def bestIndividual(hof, X, y):
    """
    Get the best individual
    """
    maxAccurcy = 0.0
    for individual in hof:
        #if(individual.fitness.values > maxAccurcy):
            maxAccurcy = individual.fitness.values
            _individual = individual

    _individualHeader = [list(X)[i] for i in range(
        len(_individual)) if _individual[i] == 1]
    return _individual.fitness.values, _individual, _individualHeader


def getArguments():
    """
    Get argumments from command-line
    If pass only dataframe path, pop and gen will be default
    """
    dfPath = sys.argv[1]
    if(len(sys.argv) == 4):
        pop = int(sys.argv[2])
        gen = int(sys.argv[3])
    else:
        pop = 12
        gen = 10
    return dfPath, pop, gen


if __name__ == '__main__':
    # get dataframe path, population number and generation number from command-line argument
    dataframePath, n_pop, n_gen = getArguments()
    # read dataframe from csv
    df = pd.read_excel('Health.xlsx')

    # encode labels column to numbers
    le = LabelEncoder()
    le.fit(df.iloc[:, -1])
    y = le.transform(df.iloc[:, -1])
    X = df.iloc[:, :-1]

    # get accuracy with all features
    individual = [1 for i in range(len(X.columns))]
    print("Accuracy with all features: \t" +
          str(getFitness(individual, X, y)) + "\n")

    # apply genetic algorithm
    hof = geneticAlgorithm(X, y, n_pop, n_gen)

    # select the best individual
    accuracy, individual, header = bestIndividual(hof, X, y)
    print('Best Accuracy: \t' + str(accuracy))
    print('Number of Features in Subset: \t' + str(individual.count(1)))
    print('Individual: \t\t' + str(individual))
    print('Feature Subset\t: ' + str(header))

    print('\n\ncreating a new classifier with the result')

    # read dataframe from csv one more time
    df = pd.read_excel('Health.xlsx')

    # with feature subset
    X = df[header]

    clf = LogisticRegression()

    scores = cross_val_score(clf, X, y, cv=5)
    print("Accuracy with Feature Subset: \t" + str(avg(scores)) + "\n")


Accuracy with all features: 	(0.8314124293785312,)





gen	nevals	avg    	min     	max     
0  	12    	0.79153	0.757288	0.824633
1  	6     	0.812891	0.79096 	0.824633
2  	5     	0.820198	0.80452 	0.824633
3  	7     	0.819887	0.78774 	0.824633
4  	3     	0.821309	0.794689	0.824633
5  	4     	0.824633	0.824633	0.824633
6  	2     	0.824633	0.824633	0.824633
7  	5     	0.819049	0.78774 	0.824633
8  	1     	0.824633	0.824633	0.824633
9  	4     	0.824633	0.824633	0.824633
10 	6     	0.825202	0.824633	0.831469
Best Accuracy: 	(0.7572881355932204,)
Number of Features in Subset: 	7
Individual: 		[0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0]
Feature Subset	: ['dard ghafase sineh', 'feshar khun dar halat esterahat', 'ghand khun nashta', 'navar ghalb dar halat esterahat', 'afsordegi st nashi az tamrin va varzesh nesbat be halat esterahat', 'shibe tamrin dar oje tamrin dar maghtae ST', ' tedad oroghe bozorg rangi ba flourosopy']


creating a new classifier with the result
Accuracy with Feature Subset: 	0.7572881355932204



In [19]:
#!/usr/bin/env python3

import numpy as np
from sklearn.model_selection import train_test_split
from keras import layers, models

diabetic = pd.read_excel('Health.xlsx')
X=Health[['sen','jens','dard ghafase sineh','feshar khun dar halat esterahat','kolestrol',
          'ghand khun nashta','navar ghalb dar halat esterahat','hadaksar zaraban ghalb']]
y=Health['Outcome']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = models.Sequential([
    layers.Dense(1, input_shape=[8], activation='sigmoid')
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)
model.fit(x_train, y_train, epochs=5)

test_loss, test_acc = model.evaluate(x_test, y_test)
print('Test Loss: ', test_loss, ', Test Accuracy: ', test_acc)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss:  6.447238190968831 , Test Accuracy:  0.599999992052714
