In [1]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import pandas as pd
import sys

In [2]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import Row 
import pyspark.sql.functions as F
from pyspark.sql import SQLContext
from pyspark.sql.types import IntegerType, LongType, FloatType
from pyspark.sql.window import Window


In [3]:
# Reuse the SparkContext that is already running when this cell is executed
# again; constructing a second SparkContext in the same kernel raises an error.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName('TSP3'))



In [4]:
# SQL entry point bound to the SparkContext `sc`; initialPopulation() calls
# sqlContext.createDataFrame() on it to build the population DataFrame.
sqlContext = SQLContext(sc)


In [5]:
def distance(city1, city2):
    """Return the Euclidean distance between two cities.

    Each city is a sequence of coordinates. Coordinates are paired position
    by position; pairing stops at the end of the shorter sequence.
    """
    squared_gaps = ((a - b) ** 2 for a, b in zip(city1, city2))
    return np.sqrt(sum(squared_gaps))
    
def routeDistance(route):
    """Return the length of the closed tour that visits `route` in order.

    The tour walks every consecutive pair of cities and then adds the leg
    between the first and the last city to close the loop.

    Parameters:
        route: sequence of cities, each a sequence of coordinates accepted
            by distance().

    Returns:
        The total tour length; 0 when the route is empty.
    """
    # Derive the city count from the route itself so routes of any length
    # are measured completely (a fixed count fails on shorter routes and
    # ignores the tail of longer ones).
    cityCount = len(route)
    if cityCount == 0:
        return 0

    dist = 0
    for i in range(cityCount - 1):
        dist += distance(route[i], route[i + 1])

    # Closing leg between the starting city and the final city.
    dist += distance(route[0], route[cityCount - 1])

    return dist
                   
    
def routeFitness(route):
    """Return the fitness of a route: the reciprocal of its tour length.

    Shorter tours therefore get a larger fitness value.
    """
    tourLength = float(routeDistance(route))
    return 1 / tourLength

In [17]:
   
def createRoute(cityList):
    """Return a new list holding every city of `cityList` in random order.

    The input list is not modified; the ordering is drawn with
    random.sample over the full length of the list.
    """
    cityCount = len(cityList)
    return random.sample(cityList, cityCount)
       
    
def initialPopulation(popSize, cityList):
    """Build the starting population as a Spark DataFrame.

    One record is created per individual: a one-element list holding the
    individual's index, followed by a random route over `cityList`.
    The records are handed to sqlContext.createDataFrame without an explicit
    schema (the notebook output shows the columns as `_1` and `_2`).
    """
    records = [[[idx], createRoute(cityList)] for idx in range(popSize)]
    return sqlContext.createDataFrame(records)

def rankRoutes(df):
    """Score every route in the population and sort it best-first.

    Parameters:
        df: population DataFrame with the individual's index in `_1` and
            its route in `_2`.

    Returns:
        A DataFrame with columns `_1`, `_2` and `RouteFitness`, ordered by
        `RouteFitness` descending.
    """
    fitnessResults = F.udf(routeFitness, FloatType())

    # Attach the fitness in a single projection. Joining a separately
    # computed fitness table back on the route column would multiply rows
    # whenever two individuals carry the same route.
    scored_df = df.select(df._1, df._2, fitnessResults(df._2).alias('RouteFitness'))

    return scored_df.orderBy(scored_df.RouteFitness, ascending=False)

def randomNumber():
    """Return a random float drawn from random.random() and scaled by 100."""
    scaled = random.random() * 100
    return scaled


def selection(ordered_df, eliteSize):
    """Add cumulative and relative fitness columns to a ranked population.

    Parameters:
        ordered_df: DataFrame with a numeric `RouteFitness` column.
        eliteSize: accepted for interface compatibility; this implementation
            does not use it yet (no individuals are actually picked here).

    Returns:
        `ordered_df` extended with
        - `cumSum`: running total of `RouteFitness`, best route first;
        - `RelativeFitness`: `cumSum` as a percentage of the total fitness,
          stored as a float column.
    """
    fitnessTotal = ordered_df.groupBy().sum('RouteFitness').collect()[0][0]

    # Order the window explicitly: a running total over an unordered window
    # depends on whatever physical row order Spark happens to produce.
    runningWindow = (
        Window
        .orderBy(F.desc('RouteFitness'))
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    )
    cumulative_sum = ordered_df.withColumn(
        'cumSum', F.sum(ordered_df.RouteFitness).over(runningWindow)
    )

    # Plain column arithmetic instead of a Python UDF; the cast keeps the
    # column type a float, as before.
    relativeFitness = (F.col('cumSum') * 100 / F.lit(fitnessTotal)).cast(FloatType())
    cumulative_sum = cumulative_sum.withColumn('RelativeFitness', relativeFitness)

    return cumulative_sum

                                                       
                              
"""
def selection(ordered_df, eliteSize):
    
    selectionResults = []
    relativeFitness = []
    
    fitnessTotal = np.sum(popRanked, axis = 0)
    cumulative_sum = np.cumsum(popRanked, axis =0)
    
    for i in range(len(popRanked)):
        relativeFitness.append(100*cumulative_sum[i][1]/fitnessTotal[1])
    
    for i in range(0, eliteSize):
        selectionResults.append(popRanked[i][0])
    
    for i in range(0, len(popRanked)-eliteSize):
        pick = 100*random.random()
        for i in range(0, len(popRanked)):
            if pick <= relativeFitness[i]: 
                selectionResults.append(popRanked[i][0])
                break
                
    return selectionResults
"""

def matingPool(population, selectionResults):
    """Return the individuals of `population` picked by `selectionResults`.

    `selectionResults` holds positions into `population`; the result keeps
    the order of `selectionResults`, repeats included.
    """
    return [population[selectionResults[pos]] for pos in range(len(selectionResults))]


def breed(parent1, parent2):
    """Create a child route from two parent routes.

    Two cut points are drawn with random.randint over 0..len(parent1)
    (both ends included). The child starts as the slice of `parent1`
    between the smaller and the larger cut point, and is then completed
    with every element of `parent2` that is not already in the child, in
    `parent2` order.
    """
    cutA = random.randint(0, len(parent1))
    cutB = random.randint(0, len(parent1))

    low, high = (cutB, cutA) if cutA > cutB else (cutA, cutB)
    child = parent1[low:high]

    for gene in parent2:
        if gene in child:
            continue
        child.append(gene)

    return child

def breedPopulation(matingpool, eliteSize):
    """Produce the next set of individuals from a mating pool.

    The first `eliteSize` members of `matingpool` are carried over as they
    are. The remaining slots are filled with children bred from a randomly
    reordered copy of the pool, pairing its k-th member with its k-th member
    counted from the end.
    """
    poolSize = len(matingpool)
    shuffled = random.sample(matingpool, poolSize)

    # Elites are copied over by reference, in pool order.
    children = [matingpool[k] for k in range(eliteSize)]

    for k in range(poolSize - eliteSize):
        children.append(breed(shuffled[k], shuffled[poolSize - k - 1]))
    return children

def mutate(individual, mutationRate):
    """Swap-mutate a route in place and return it.

    For every position, with probability `mutationRate`, the element at
    that position is exchanged with the element at a randomly drawn
    position of the same route. The list passed in is the list returned.
    """
    size = len(individual)
    for pos in range(size):
        if not random.random() < mutationRate:
            continue
        other = int(random.random() * len(individual))
        individual[pos], individual[other] = individual[other], individual[pos]
    return individual

def mutatePopulation(population, mutationRate):
    """Apply mutate() to every individual and return the results as a list.

    Individuals are processed in population order.
    """
    return [mutate(population[pos], mutationRate) for pos in range(len(population))]


def nextGeneration(currentGen, eliteSize, mutationRate):
    """Run one generation step: rank, select, pool, breed, mutate.

    Returns whatever mutatePopulation() returns for the bred children.
    """
    # NOTE(review): selection() returns a DataFrame, while matingPool()
    # calls len() on and indexes its second argument like a list -- confirm
    # the intended hand-off between the two.
    ranked = rankRoutes(currentGen)
    chosen = selection(ranked, eliteSize)
    parents = matingPool(currentGen, chosen)
    offspring = breedPopulation(parents, eliteSize)
    return mutatePopulation(offspring, mutationRate)

def geneticAlgorithmPlot(population, popSize, eliteSize, mutationRate, generations):
    """Run the genetic algorithm and plot the best distance per generation.

    Parameters:
        population: list of cities used to build the initial routes.
        popSize: number of individuals in the initial population.
        eliteSize: forwarded to nextGeneration().
        mutationRate: forwarded to nextGeneration().
        generations: number of generation steps to run.

    Returns:
        None; the progress curve is shown with matplotlib.
    """
    def bestDistance(generation):
        # rankRoutes() sorts by RouteFitness descending, so the first row is
        # the fittest route. The value is read from that row: indexing the
        # DataFrame with [0][1] would build a Column expression, not a number.
        best = rankRoutes(generation).first()
        return 1 / best['RouteFitness']

    pop = initialPopulation(popSize, population)
    progress = [bestDistance(pop)]

    # NOTE(review): nextGeneration() returns the list produced by
    # mutatePopulation(), while rankRoutes() operates on a DataFrame --
    # confirm the population type carried between generations.
    for _ in range(generations):
        pop = nextGeneration(pop, eliteSize, mutationRate)
        progress.append(bestDistance(pop))

    plt.plot(progress)
    plt.ylabel('Distance')
    plt.xlabel('Generation')
    plt.show()
    
    

    


In [7]:
# 15 cities, each an [x, y] pair of random integers between 0 and 200 inclusive.
cityList = []

for i in range(15):
    cityList.append([random.randint(0, 200) for _ in range(2)])


In [8]:
# Build the starting population: 15 random routes over cityList, as a Spark DataFrame.
df = initialPopulation(15, cityList)

In [9]:
# Show the DataFrame's schema summary, then print its first 5 rows.
display(df)
df.show(5)

DataFrame[_1: array<bigint>, _2: array<array<bigint>>]

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|[0]|[[84, 132], [75, ...|
|[1]|[[84, 132], [6, 6...|
|[2]|[[122, 77], [59, ...|
|[3]|[[6, 69], [34, 14...|
|[4]|[[42, 173], [195,...|
+---+--------------------+
only showing top 5 rows



In [10]:
# Score every route and sort the population by RouteFitness, highest first.
ordered_df = rankRoutes(df)

In [11]:
# Print the ranked population.
ordered_df.show()

+----+--------------------+------------+
|  _1|                  _2|RouteFitness|
+----+--------------------+------------+
| [5]|[[34, 144], [23, ...|8.1544236E-4|
| [6]|[[122, 77], [75, ...|7.7358476E-4|
|[12]|[[122, 77], [86, ...|7.6519675E-4|
| [0]|[[84, 132], [75, ...| 7.403956E-4|
| [4]|[[42, 173], [195,...|7.2556105E-4|
| [2]|[[122, 77], [59, ...|7.0987886E-4|
| [8]|[[6, 69], [59, 44...|6.8093627E-4|
|[10]|[[6, 69], [42, 17...| 6.343118E-4|
|[11]|[[6, 69], [55, 9]...|6.2787795E-4|
|[13]|[[23, 94], [140, ...|6.2742166E-4|
| [1]|[[84, 132], [6, 6...| 6.231016E-4|
|[14]|[[42, 173], [86, ...|6.0853135E-4|
| [3]|[[6, 69], [34, 14...| 5.938387E-4|
| [9]|[[133, 106], [34,...| 5.929081E-4|
| [7]|[[34, 144], [133,...|5.8442843E-4|
+----+--------------------+------------+



In [19]:
# Add the cumSum and RelativeFitness columns to the ranked population; 5 is passed as eliteSize.
cumulative_df = selection(ordered_df, 5)

In [20]:
# Print the population together with its cumSum and RelativeFitness columns.
cumulative_df.show()

+----+--------------------+------------+--------------------+---------------+
|  _1|                  _2|RouteFitness|              cumSum|RelativeFitness|
+----+--------------------+------------+--------------------+---------------+
| [5]|[[34, 144], [23, ...|8.1544236E-4|8.154423558153212E-4|       8.070957|
| [6]|[[122, 77], [75, ...|7.7358476E-4|0.001589027117006...|      15.727624|
|[12]|[[122, 77], [86, ...|7.6519675E-4|0.002354223863221705|      23.301268|
| [0]|[[84, 132], [75, ...| 7.403956E-4|0.003094619489274919|       30.62944|
| [4]|[[42, 173], [195,...|7.2556105E-4|0.003820180543698...|      37.810783|
| [2]|[[122, 77], [59, ...|7.0987886E-4|0.004530059406533837|       44.83691|
| [8]|[[6, 69], [59, 44...|6.8093627E-4|0.005210995674133301|      51.576576|
|[10]|[[6, 69], [42, 17...| 6.343118E-4|0.005845307488925755|      57.854767|
|[11]|[[6, 69], [55, 9]...|6.2787795E-4|0.006473185436334461|       64.06928|
|[13]|[[23, 94], [140, ...|6.2742166E-4|0.007100607093889266|   