In [22]:
#importing libraries

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import copy

In [23]:
# Mount Google Drive so the dataset files stored there are reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

# Root folder of the datasets; getData() appends "<target>/train.txt" and
# "<target>/test.txt" to this prefix (note the trailing slash).
DataFolder = "/content/drive/My Drive/Colab Notebooks/geneticsProject/data/"

# CONFIG
# Number of rows drawn by getuniformSample() for each training sample.
SAMPLE_SIZE =200
# runGA() draws a fresh training sample whenever the generation counter is a
# multiple of this value.
CHANGE_SAMPLE_AFTER = 15

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
class Config:
    # Central holder for the constants used by the evolutionary search.

    # Individual.isValid() accepts only programs whose length is strictly
    # between MIN_INSTRUCTION_SET_LENGTH and MAX_INSTRUCTION_SET_LENGTH.
    MAX_INSTRUCTION_SET_LENGTH = 256
    # Upper bound (inclusive) for the random length of a freshly generated
    # program in getInstructionSet().
    INIT_MAX_INSTRUCTION_SET_LENGTH = 128
    MIN_INSTRUCTION_SET_LENGTH = 8
    # Number of registers every Individual owns; register indices are taken
    # modulo this value when used as an instruction source.
    TOTAL_OUTPUT_REGISTER = 10
    # Number of operator codes (0..3) an instruction can carry.
    TOTAL_OPERATIONS = 4

    # Parameters to tune
    # Percentage of the population (lowest fitness) removed by
    # getGapedPopulation() each generation.
    GAP_PERCENTAGE = 20
    # Percentage threshold compared against a random roll in
    # Mutation.mutateIndividual().
    MUTATION_PROBABILITY = 90
    # Percentage threshold compared against a random roll in
    # isChildReadyToPush().
    CROSSOVER_PROBABILITY = 90
    # Number of individuals created by createGenotype().
    POPULATION_SIZE = 150
    # NOTE(review): not referenced anywhere in the visible code; runGA() uses
    # hard-coded 30/25 for its stall check instead - confirm intent.
    SAME_NUMBER_OF_ITERATIONS = 50

### Get Data

In [25]:
def getDataFromFile(fileName):
  """Load a space-separated, header-less text file into a DataFrame.

  Args:
    fileName: path of the file to read.

  Returns:
    The parsed DataFrame without any column that contains a missing value
    (for example the empty column produced by a trailing separator).
  """
  rawFrame = pd.read_csv(fileName, sep=" ", header=None)

  # Keep only the columns that are completely filled.
  return rawFrame.dropna(axis="columns")

In [26]:
def getData(target):
  """Read the train and test files of one dataset.

  Args:
    target: dataset folder name below DataFolder.

  Returns:
    Tuple (trainData, testData) as returned by getDataFromFile for
    "<DataFolder><target>/train.txt" and "<DataFolder><target>/test.txt".
  """
  datasetPrefix = DataFolder + target

  # Data used during evolution.
  trainData = getDataFromFile(datasetPrefix + "/train.txt")
  # Data used for the post-training evaluation.
  testData = getDataFromFile(datasetPrefix + "/test.txt")

  return trainData, testData


In [27]:
def getuniformSample(data, sampleSize, lableSize):
  """Draw a class-balanced random sample of rows without replacement.

  Repeatedly picks a label uniformly at random and then one random,
  not-yet-selected row carrying that label, so every class has the same
  chance of contributing a row regardless of its frequency in `data`.

  Args:
    data: DataFrame whose last column holds the class label.
    sampleSize: number of rows to draw.
    lableSize: unused; kept so existing callers keep working.

  Returns:
    DataFrame with the selected rows (original index preserved, in selection
    order). It has fewer than `sampleSize` rows only when `data` itself runs
    out of rows; it is an empty DataFrame when nothing could be selected.
  """
  print("CREATING SAMPLE")
  lableColumn = data.columns[-1]
  allLables = list(data[lableColumn].unique())

  # Rows that are still available for selection.
  remaining = data
  picked = []

  while len(picked) < sampleSize and allLables:
    lableToSelect = random.choice(allLables)
    dataForClass = remaining[remaining[lableColumn] == lableToSelect]
    if dataForClass.empty:
      # Every row of this class is already in the sample; stop offering it.
      allLables.remove(lableToSelect)
      continue
    selectedData = dataForClass.sample(n=1, replace=False)
    picked.append(selectedData)
    # drop() returns a new frame, so the result has to be kept for the row to
    # really leave the pool.
    remaining = remaining.drop(selectedData.index)

  if not picked:
    return pd.DataFrame()
  # A single concat replaces the row-by-row DataFrame.append, which was removed
  # in pandas 2.0.
  return pd.concat(picked)

In [28]:
def splitDataForXandY(data):
  """Split a DataFrame into feature rows and one-hot encoded labels.

  Args:
    data: DataFrame whose last column is the class label and whose other
      columns are the features.

  Returns:
    Tuple (X, Y) of plain Python lists: X holds one feature list per row,
    Y holds the matching one-hot label rows produced by pd.get_dummies.
  """
  featureColumns = data.columns[:-1]
  lableColumn = data.columns[-1]

  X = data[featureColumns].values.tolist()
  # One indicator column per distinct label present in `data`.
  Y = pd.get_dummies(data[lableColumn]).values.tolist()

  return X, Y

### Creating Individuals and population

In [29]:
class Individual:
    """A linear program that works on a bank of registers.

    Every instruction is a list [select, target register, operator, source]:
      * select == 0 reads the operand from the input row, any other value
        reads it from a register (source is wrapped with modulo either way);
      * operator 0 adds the operand, 1 subtracts it, 2 halves the target
        register and 3 doubles it.
    """

    def __init__(self, instructionSet):
        self.instructionSet = instructionSet
        self.registers = [0 for _ in range(Config.TOTAL_OUTPUT_REGISTER)]
        self.fitnessScore = 0

    def reset(self):
        """Set every register back to zero."""
        self.registers = [0 for _ in range(Config.TOTAL_OUTPUT_REGISTER)]

    def resetFitness(self):
        """Forget the stored fitness score."""
        self.fitnessScore = 0

    def isValid(self):
        """Return True when the program length lies strictly between the configured min and max."""
        programLength = len(self.instructionSet)
        return Config.MIN_INSTRUCTION_SET_LENGTH < programLength < Config.MAX_INSTRUCTION_SET_LENGTH

    def runProgram(self, data):
        """Execute all instructions on one input row, starting from cleared registers."""
        self.reset()
        registers = self.registers
        for instruction in self.instructionSet:
            readsInput = instruction[0] == 0
            target = instruction[1]
            opcode = instruction[2]
            rawSource = instruction[3]

            # Wrap the source index so it always addresses a valid slot.
            if readsInput:
                operand = data[rawSource % len(data)]
            else:
                operand = registers[rawSource % Config.TOTAL_OUTPUT_REGISTER]

            if opcode == 0:
                registers[target] = registers[target] + operand
            elif opcode == 1:
                registers[target] = registers[target] - operand
            elif opcode == 2:
                registers[target] = registers[target] / 2
            elif opcode == 3:
                registers[target] = registers[target] * 2

    def predict(self, data, lable):
        """Run the program on `data` and report whether it picks the labelled class.

        The predicted class is the index of the largest value among the first
        len(lable) registers; the expected class is the index of the largest
        entry of `lable`.
        """
        self.runProgram(data)

        expectedIndex = np.array(lable).argmax()
        predictedIndex = np.array(self.registers[:len(lable)]).argmax()

        return bool(expectedIndex == predictedIndex)

In [30]:
def getInstructionSet(dataLen):
  """Generate a random program (list of instructions).

  Args:
    dataLen: number of input features an instruction may address.

  Returns:
    List of [select, target register, operator, source] lists. Its length is
    drawn from 2..Config.INIT_MAX_INSTRUCTION_SET_LENGTH (inclusive).
  """
  settings = Config()

  # Length of the program is itself random.
  programLength = random.randint(2, settings.INIT_MAX_INSTRUCTION_SET_LENGTH)
  # A source must be able to address every feature and every register.
  highestSource = max(dataLen, settings.TOTAL_OUTPUT_REGISTER) - 1

  return [
    [
      random.randint(0, 1),
      random.randint(0, settings.TOTAL_OUTPUT_REGISTER - 1),
      random.randint(0, settings.TOTAL_OPERATIONS - 1),
      random.randint(0, highestSource),
    ]
    for _ in range(programLength)
  ]

In [31]:
def createGenotype(X):
  """Create the initial population of random individuals.

  Args:
    X: list of feature rows; only the length of the first row is used.

  Returns:
    List with Config.POPULATION_SIZE Individual objects, each holding a
    program from getInstructionSet.
  """
  featureCount = len(X[0])
  populationList = [
    Individual(getInstructionSet(featureCount))
    for _ in range(Config.POPULATION_SIZE)
  ]

  print(str(len(populationList)) + " individuals created.")
  return populationList

### Methods for the GA (crossover, mutation, fitness, selection)

In [32]:
class TwoPointCrossover:
    """Two-point crossover on the instruction lists of two parents.

    In each parent a segment [cut1, cut2] (both ends inclusive) is chosen; a
    child is the first parent with its segment replaced by the segment of the
    other parent. Both children together therefore contain exactly as many
    instructions as both parents together.
    """

    def getTwoCrossoverPoints(self, parent):
        """Pick two distinct cut indices with point1 < point2.

        Args:
            parent: object with an `instructionSet` list of at least two items.

        Returns:
            Tuple (crossoverPoint1, crossoverPoint2) of valid list indices.

        Raises:
            ValueError: from random.randint when the parent has fewer than
                two instructions.
        """
        lastIndex = len(parent.instructionSet) - 1
        # The second point is drawn strictly after the first, so the two can
        # never coincide and no retry loop is needed.
        crossoverPoint1 = random.randint(0, lastIndex - 1)
        crossoverPoint2 = random.randint(crossoverPoint1 + 1, lastIndex)

        return crossoverPoint1, crossoverPoint2

    def createChild(self, parent1, parent2, p1CP1,p1CP2, p2CP1,p2CP2 ):
        """Build a child from parent1 with its segment swapped for parent2's.

        Args:
            parent1: parent providing the head and tail of the child.
            parent2: parent providing the middle segment.
            p1CP1, p1CP2: inclusive bounds of the segment removed from parent1.
            p2CP1, p2CP2: inclusive bounds of the segment copied from parent2.

        Returns:
            A new Individual; the parents are left untouched.
        """
        # Everything before the first cut. Slicing up to p1CP1 (not
        # p1CP1 - 1) keeps the instruction just before the cut and avoids the
        # negative index that a cut at position 0 would otherwise produce.
        head = parent1.instructionSet[:p1CP1]
        # The donor segment includes its end point, mirroring the inclusive
        # segment that is removed from parent1.
        donated = parent2.instructionSet[p2CP1: p2CP2 + 1]
        tail = parent1.instructionSet[p1CP2 + 1:]

        return Individual(head + donated + tail)

    def crossover(self, parent1, parent2):
        """Cross two parents and return both children.

        Returns:
            Tuple (child1, child2); a child whose isValid() check fails is
            replaced by None.
        """
        p1CP1, p1CP2 = self.getTwoCrossoverPoints(parent1)
        p2CP1, p2CP2 = self.getTwoCrossoverPoints(parent2)

        child1 = self.createChild(parent1, parent2, p1CP1, p1CP2, p2CP1, p2CP2)
        child2 = self.createChild(parent2, parent1, p2CP1, p2CP2, p1CP1, p1CP2)

        if not child1.isValid():
            child1 = None
        if not child2.isValid():
            child2 = None

        return child1, child2

In [33]:
class Mutation:
    """Order-changing mutations on the instruction list of an individual.

    None of the mutations adds, removes or edits an instruction; they only
    rearrange the existing ones.
    """

    # Number of mutation kinds mutateIndividual chooses from.
    mutationTypes = 3

    def swapMutation(self, child):
        """Swap two randomly chosen instructions in place and return `child`.

        Both positions are drawn independently, so they may coincide, in
        which case nothing changes.
        """
        bit1 = random.randint(0, len(child.instructionSet) - 1)
        bit2 = random.randint(0, len(child.instructionSet) - 1)
        child.instructionSet[bit1], child.instructionSet[bit2] = child.instructionSet[bit2], child.instructionSet[bit1]

        return child

    def reverseMutation(self, child):
        """Return a new Individual with a random segment [bit1, bit2] reversed.

        Raises:
            ValueError: from random.randint when the child has fewer than two
                instructions.
        """
        bit1 = random.randint(0, len(child.instructionSet) - 2)
        bit2 = random.randint(bit1 + 1, len(child.instructionSet) - 1)

        instructions = child.instructionSet
        # Slice first, then reverse: a single negative-step slice would use
        # stop index -1 when bit1 is 0 and silently yield an empty segment.
        reversedSegment = instructions[bit1: bit2 + 1][::-1]
        mutatedList = instructions[:bit1] + reversedSegment + instructions[bit2 + 1:]

        return Individual(mutatedList)

    def scrambleMutation(self, child):
        """Return a new Individual with a random segment [bit1, bit2] shuffled.

        Raises:
            ValueError: from random.randint when the child has fewer than two
                instructions.
        """
        bit1 = random.randint(0, len(child.instructionSet) - 2)
        bit2 = random.randint(bit1 + 1, len(child.instructionSet) - 1)

        instructions = child.instructionSet
        # Both ends are included so the segment always holds at least two
        # instructions; otherwise a shuffle could never change anything when
        # bit2 == bit1 + 1.
        segment = instructions[bit1: bit2 + 1]
        random.shuffle(segment)
        mutatedList = instructions[:bit1] + segment + instructions[bit2 + 1:]

        return Individual(mutatedList)

    def mutateIndividual(self, childToMutate):
        """Apply one randomly chosen mutation with Config.MUTATION_PROBABILITY percent chance.

        Returns:
            The mutated individual (the same object for a swap, a new one for
            reverse/scramble) or `childToMutate` unchanged when no mutation
            is applied.
        """
        child = childToMutate
        mutationType = random.randint(0, self.mutationTypes - 1)
        # Roll 0..99 (100 outcomes) so the threshold is a true percentage and
        # a probability of 100 always mutates.
        if random.randint(0, 99) < Config.MUTATION_PROBABILITY:
            if mutationType == 0:
                child = self.swapMutation(child)
            elif mutationType == 1:
                child = self.reverseMutation(child)
            elif mutationType == 2:
                child = self.scrambleMutation(child)

        return child

In [34]:
def calculateFitness(populationList, data, lable):
  """Store the percentage of correctly predicted rows on every individual.

  Args:
    populationList: individuals offering resetFitness() and predict().
    data: list of feature rows.
    lable: list of label rows, index-aligned with `data`.

  Each individual's `fitnessScore` is overwritten with
  100 * correct / len(data). Nothing is returned.
  """
  rowCount = len(data)
  for person in populationList:
    person.resetFitness()
    # True counts as 1, so the sum is the number of correct predictions.
    hits = sum(person.predict(data[i], lable[i]) for i in range(rowCount))
    person.fitnessScore = 100 * (hits / rowCount)

In [35]:
def getGapedPopulation(populationList):
  """Return the fittest individuals after removing the configured gap.

  The population is ordered by descending fitnessScore and the lowest
  Config.GAP_PERCENTAGE percent are cut off. The input list is not modified.
  """
  def fitnessOf(individual):
    return individual.fitnessScore

  ranked = sorted(populationList, key=fitnessOf, reverse=True)

  # Number of individuals to discard (may be fractional before truncation).
  discarded = (Config.GAP_PERCENTAGE * len(ranked)) / 100
  survivors = int(len(ranked) - discarded)

  return ranked[:survivors]

In [36]:
def isChildReadyToPush(populationList, child,newPopulation):
  """Decide whether a crossover child may join the next generation.

  Args:
    populationList: current population; its size is the target size.
    child: candidate individual, or None when crossover rejected it.
    newPopulation: next generation built so far.

  Returns:
    True when the random roll passes Config.CROSSOVER_PROBABILITY percent,
    the child exists and the new population still has room; False otherwise.
  """
  # Roll 0..99 (100 outcomes) so the threshold is a true percentage and a
  # probability of 100 always accepts.
  passedRoll = random.randint(0, 99) < Config.CROSSOVER_PROBABILITY
  hasRoom = len(newPopulation) < len(populationList)

  return passedRoll and (child is not None) and hasRoom

In [37]:

def breed(populationList, gapedPopulation):
  """Refill the population with mutated crossover children.

  Args:
    populationList: current population; only its size is used as target.
    gapedPopulation: survivors that are copied over unchanged and serve as
      the parent pool.

  Returns:
    New list that starts with the survivors and is topped up with children
    until it is as long as `populationList`.
  """
  targetSize = len(populationList)
  newPopulation = gapedPopulation[:]

  while len(newPopulation) < targetSize:
    # Parents are drawn independently, so both may be the same individual.
    parent1 = random.choice(gapedPopulation)
    parent2 = random.choice(gapedPopulation)

    child1, child2 = TwoPointCrossover().crossover(parent1, parent2)
    mutationOb = Mutation()

    if not isChildReadyToPush(populationList, child1, newPopulation):
      # The second child is only considered after the first was accepted.
      continue
    newPopulation.append(mutationOb.mutateIndividual(child1))

    if isChildReadyToPush(populationList, child2, newPopulation):
      newPopulation.append(mutationOb.mutateIndividual(child2))

  return newPopulation

### Making Data for visualization 

In [38]:
def pushToGenerationAccuracyDictionary(classwiseAccuracy,classWiseAccuracyInGeneration):
  """Append the per-class values of one generation to the running history.

  Args:
    classwiseAccuracy: dict mapping a class key to this generation's value.
    classWiseAccuracyInGeneration: dict mapping a class key to the list of
      values seen so far; updated in place, new keys get a fresh list.
  """
  for classKey, value in classwiseAccuracy.items():
    classWiseAccuracyInGeneration.setdefault(classKey, []).append(value)

In [39]:
def calculateClasswiseAccuracy(individual, data, lable, classWiseAccuracyInGeneration):
  """Count the correct predictions of one individual per class and record them.

  Args:
    individual: object offering predict(row, lableRow).
    data: list of feature rows.
    lable: list of label rows, index-aligned with `data`; each row (as a
      tuple) is used as the class key.
    classWiseAccuracyInGeneration: history dict that receives the counts via
      pushToGenerationAccuracyDictionary.

  Note that the recorded values are raw counts of correct predictions, not
  percentages. Classes without any correct prediction are recorded as 0.
  """
  correctPerClass = dict()
  for i in range(len(data)):
    classKey = tuple(lable[i])
    hit = 1 if individual.predict(data[i], lable[i]) else 0
    correctPerClass[classKey] = correctPerClass.get(classKey, 0) + hit

  pushToGenerationAccuracyDictionary(correctPerClass, classWiseAccuracyInGeneration)

### Training Model

In [40]:
#method to run everything
def runGA(targetFile, accuracy):
  """Evolve a population on one dataset until a target accuracy or a stall.

  Args:
    targetFile: dataset name handed to getData().
    accuracy: desired training accuracy in percent; the loop stops once the
      best individual's rounded fitness reaches it.

  Returns:
    Tuple (generationCount, accuracyOverGenerations, testAccuracy) where the
    two lists hold, per generation, the best individual's rounded accuracy on
    the current training sample and on the test data.
  """
  trainData,testData = getData(targetFile)
  # The last column is the label, so its position equals the feature count.
  attributes = (trainData.shape)[1] - 1
  classes = len(trainData[attributes].unique())
  print("Data contains", attributes, "and", classes, "classes")

  sample = getuniformSample(trainData, SAMPLE_SIZE, classes)
  

  X, Y = splitDataForXandY(sample)
  X_test, Y_test = splitDataForXandY(testData)

  #init Population
  populationList = list()
  populationList = createGenotype(X)

  # Filled by calculateClasswiseAccuracy every generation; it is not part of
  # the return value.
  classWiseAccuracyInGeneration = dict()

  generationAccuracy = 0
  desiredAccuracy = accuracy
  generationCount = 0
  accuracyOverGenerations = list()
  testAccuracy = list()

  while (generationAccuracy < desiredAccuracy):
    # Also true for generation 0, so the sample drawn above is replaced
    # before it is ever used for a fitness evaluation.
    if (generationCount % CHANGE_SAMPLE_AFTER == 0):
      sample = getuniformSample(trainData, SAMPLE_SIZE, classes)
      X, Y = splitDataForXandY(sample)
      
    calculateFitness(populationList, X, Y)
    gapedPopulation = getGapedPopulation(populationList)

    # getGapedPopulation sorts by descending fitness, so index 0 is the best.
    mostFitIndi = gapedPopulation[0]
    calculateClasswiseAccuracy(mostFitIndi, X, Y,classWiseAccuracyInGeneration)
    
    accuracyOverGenerations.append(round(gapedPopulation[0].fitnessScore, 2))
    print("Generation ", generationCount + 1, " Accuracy = ", accuracyOverGenerations[generationCount])

    newPopulation = breed(populationList, gapedPopulation)
    populationList = newPopulation
    generationAccuracy = accuracyOverGenerations[generationCount]
    generationCount += 1

    # Re-scores the best individual on the test data. This overwrites its
    # fitnessScore with the test accuracy; the value is recomputed by the
    # calculateFitness call at the start of the next iteration.
    calculateFitness([gapedPopulation[0]], X_test, Y_test)
    testAccuracy.append(round(gapedPopulation[0].fitnessScore, 2))
    
    # Stall detection: once more than 30 generations exist, stop when the
    # last 25 training accuracies are all identical.
    # NOTE(review): Config.SAME_NUMBER_OF_ITERATIONS is not used here -
    # confirm whether these literals were meant to come from it.
    if(len(accuracyOverGenerations) > 30):
      checkList = accuracyOverGenerations[-25:]
      result = checkList.count(checkList[0]) == len(checkList)
      if (result):
        print("Same Accuracy for last 25 generations")
        break
  
  return generationCount,accuracyOverGenerations,testAccuracy

In [41]:
class ModelInfo:
  """Result record of a single runGA call.

  Attributes:
    generationCount: number of generations that were run.
    trainAccuracy: per-generation accuracy on the training sample.
    testAccuracy: per-generation accuracy on the test data.
  """

  def __init__(self, generationCount, accuracyOverGenerations, testAccuracy):
    self.generationCount = generationCount
    # Stored under a shorter name than the constructor argument.
    self.trainAccuracy = accuracyOverGenerations
    self.testAccuracy = testAccuracy

In [None]:
# Run the evolutionary search several times per dataset and keep every run's
# accuracy history in modelInfo[dataset] (a list of ModelInfo objects).
datasetList = ['abalone','bank','thyroid']
# Desired training accuracy (percent) per dataset, index-aligned with datasetList.
accuracy = [70, 90, 90]
# Number of independent runs per dataset.
RUNS_PER_DATASET = 10

# Fail early instead of silently skipping a dataset when the lists drift apart.
assert len(datasetList) == len(accuracy), "datasetList and accuracy must have the same length"

modelInfo = {}
for dataset, desiredAccuracy in zip(datasetList, accuracy):
  print("Running GA for Dataset:", dataset, "desired Accuracy:", desiredAccuracy)
  modelList = []
  for _ in range(RUNS_PER_DATASET):
    # These three names are kept at module level because the plotting cell
    # below reads accuracyOverGenerations and testAccuracy of the last run.
    generationCount,accuracyOverGenerations,testAccuracy = runGA(dataset, desiredAccuracy)
    modelList.append(ModelInfo(generationCount,accuracyOverGenerations,testAccuracy))

  modelInfo[dataset] = modelList


Running GA for Dataset: abalone desired Accuracy: 65
Data contains 8 and 4 classes
CREATING SAMPLE
150 individuals created.
CREATING SAMPLE
Generation  1  Accuracy =  47.5
Generation  2  Accuracy =  47.5
Generation  3  Accuracy =  47.5
Generation  4  Accuracy =  47.5
Generation  5  Accuracy =  47.5
Generation  6  Accuracy =  47.5
Generation  7  Accuracy =  47.5
Generation  8  Accuracy =  47.5
Generation  9  Accuracy =  48.5
Generation  10  Accuracy =  48.5
Generation  11  Accuracy =  48.5
Generation  12  Accuracy =  48.5
Generation  13  Accuracy =  48.5
Generation  14  Accuracy =  48.5
Generation  15  Accuracy =  48.5
CREATING SAMPLE
Generation  16  Accuracy =  54.5
Generation  17  Accuracy =  54.5
Generation  18  Accuracy =  54.5
Generation  19  Accuracy =  54.5
Generation  20  Accuracy =  54.5
Generation  21  Accuracy =  54.5
Generation  22  Accuracy =  54.5
Generation  23  Accuracy =  54.5
Generation  24  Accuracy =  54.5
Generation  25  Accuracy =  54.5
Generation  26  Accuracy =  

In [None]:
# Accuracy curves of the most recent runGA call (the last values assigned to
# accuracyOverGenerations / testAccuracy by the experiment loop).
generations = range(len(accuracyOverGenerations))
# Label both curves so they can be told apart in the figure.
plt.plot(generations, accuracyOverGenerations, label="Train (sample) accuracy")
plt.plot(generations, testAccuracy, label="Test accuracy")
plt.xlabel("Generations")
plt.ylabel("Accuracy")
plt.title("Overall Accuracy Curve")
plt.legend()
plt.show()