In [1]:
import os
# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3.8'
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.8'

In [2]:
import numpy as np
import random 
import matplotlib.pyplot as plt
import pandas as pd
import sys

In [3]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import Row 
import pyspark.sql.functions as F
from pyspark.sql import SQLContext
from pyspark.sql.types import IntegerType, LongType, FloatType
from pyspark.sql.window import Window


In [4]:
sc = pyspark.SparkContext(appName ='TSP3')



In [5]:
sqlContext = SQLContext(sc)


In [6]:
def distance(city1, city2):
    
    Distance = np.sqrt(sum(map(lambda x, y: (x-y)**2, city1,city2)))

    return Distance
    
def routeDistance(route):
    dist = 0
    
    N = 5  #Number of cities # TODO get the length from the list?
    starting_point = route[0]
    final_point = route[N-1]
    for i in range(N-1):
        dist += distance(route[i], route[i+1])
        
        
    dist += distance(starting_point,final_point)      
        
    return dist
                   
    
def routeFitness(route):
    fitness = 0
    fitness = 1/ float(routeDistance(route))
    return fitness

In [7]:
   
def createRoute(cityList):
    """
    size = cityList.count() # as option we could use a fixed number: number of cities 
    route = cityList.takeSample(False, size)
    """
    
    route = random.sample(cityList, len(cityList))   
    return route
       
    
def initialPopulation(popSize, cityList):
    
    population = []

    for i in range(0, popSize):
        population.append([[i],createRoute(cityList)])
        
       
    # rdd
    rdd = sc.parallelize(population)      
    # rddPop = dPopulation.map(lambda x: Row(index=x[0], route=x[1])) # Is this line necessary or conveniance
    
    df = sqlContext.createDataFrame(population)    
    return rdd, df #sc.parallelize(population)

def rankRoutes(rdd, df):

    fitnessResults = F.udf(lambda x: routeFitness(x) , FloatType())
    new_df = df.select(df._2, fitnessResults(df._2).alias('RouteFitness'))
    final_df = df.join(new_df, df._2 == new_df._2, 'outer').select(df._1, new_df._2, new_df.RouteFitness)
    ordered_df = final_df.orderBy(final_df.RouteFitness, ascending = False)

    # rdd
    ordered_rdd = rdd.map(lambda row: (routeFitness(row[1]), row[1])).sortByKey(ascending=False)  

    return ordered_rdd, ordered_df

def randomNumber():
    
    return 100*random.random()


def selection(ordered_df, ordered_rdd, eliteSize):
    
    
    fitnessTotal = ordered_df.groupBy().sum('RouteFitness').collect()[0][0]
    cumulative_sum = ordered_df.withColumn('cumSum', F.sum(ordered_df.RouteFitness).over(Window.partitionBy().orderBy().rowsBetween(-sys.maxsize, 0)))    
    
    relativeFitness = F.udf(lambda x: 100*x/fitnessTotal , FloatType())
    cumulative_sum = cumulative_sum.withColumn('RelativeFitness', relativeFitness(cumulative_sum.cumSum))

    cumulative_rdd = ordered_rdd
      
    #randomGeneration = F.udf(lambda x: 100*random.random(), FloatType())
    #cumulative_sum = cumulative_sum.withColumn('RandomNumber', randomGeneration(cumulative_sum.cumSum))
    
    #firstSelection = spark.sql("SELECT _1, MAX ('RouteFitness') AS bestIndividual from ordered_df LIMIT eliteSize")
    
    return cumulative_sum, cumulative_rdd

                                                       
                              
"""
def selection(ordered_df, eliteSize):
    
    selectionResults = []
    relativeFitness = []
    
    fitnessTotal = np.sum(popRanked, axis = 0)
    cumulative_sum = np.cumsum(popRanked, axis =0)
    
    for i in range(len(popRanked)):
        relativeFitness.append(100*cumulative_sum[i][1]/fitnessTotal[1])
    
    for i in range(0, eliteSize):
        selectionResults.append(popRanked[i][0])
    
    for i in range(0, len(popRanked)-eliteSize):
        pick = 100*random.random()
        for i in range(0, len(popRanked)):
            if pick <= relativeFitness[i]: 
                selectionResults.append(popRanked[i][0])
                break
                
    return selectionResults
"""

def matingPool(population, selectionResults):
    matingpool = []
    
    for i in range(len(selectionResults)):
        matingpool.append(population[selectionResults[i]])
    
    return matingpool


def breed(parent1, parent2):
    
    a = random.randint(0, len(parent1))
    b = random.randint(0, len(parent1))
    
    if a>b:
        child= parent1[b:a]
    else:
        child=parent1[a:b]
        
    for i in range(len(parent2)):
        if (parent2[i] not in child):
            child.append(parent2[i])                

    return child

def breedPopulation(matingpool, eliteSize):
    children = []
    length = len(matingpool) - eliteSize
    pool = random.sample(matingpool, len(matingpool))

    for i in range(0,eliteSize):
        children.append(matingpool[i])
    
    for i in range(0, length):
        child = breed(pool[i], pool[len(matingpool)-i-1])
        children.append(child)
    return children

def mutate(individual, mutationRate):
    
    """ Partial Shuffle Mutation """
    
    for swapped in range(len(individual)):
        if(random.random() < mutationRate):
            swapWith = int(random.random() * len(individual))
            
            city1 = individual[swapped]
            city2 = individual[swapWith]
            
            individual[swapped] = city2
            individual[swapWith] = city1
    return individual

def mutatePopulation(population, mutationRate):
    mutatedPop = []
    
    for ind in range(0, len(population)):
        mutatedInd = mutate(population[ind], mutationRate)
        mutatedPop.append(mutatedInd)
    return mutatedPop


def nextGeneration(currentGen, eliteSize, mutationRate):
    popRanked = rankRoutes(currentGen)
    selectionResults = selection(popRanked, eliteSize)
    matingpool = matingPool(currentGen, selectionResults)
    children = breedPopulation(matingpool, eliteSize)
    nextGeneration = mutatePopulation(children, mutationRate)
    return nextGeneration

def geneticAlgorithmPlot(population, popSize, eliteSize, mutationRate, generations):
    pop = initialPopulation(popSize, population)
    progress = []
    progress.append(1 / rankRoutes(pop)[0][1])
    
    for i in range(0, generations):
        pop = nextGeneration(pop, eliteSize, mutationRate)
        progress.append(1 / rankRoutes(pop)[0][1])
    
    plt.plot(progress)
    plt.ylabel('Distance')
    plt.xlabel('Generation')
    plt.show()
    
    

    


In [8]:
cityList = []

for i in range(0,15):
    cityList.append([int(random.randint(0,100)), int(random.randint(0,100))])


In [9]:
rdd, df = initialPopulation(5, cityList)

In [10]:
display(df)
df.show(5)

DataFrame[_1: array<bigint>, _2: array<array<bigint>>]

+---+--------------------+
| _1|                  _2|
+---+--------------------+
|[0]|[[9, 40], [18, 63...|
|[1]|[[18, 63], [82, 9...|
|[2]|[[1, 67], [32, 48...|
|[3]|[[93, 77], [51, 5...|
|[4]|[[32, 48], [1, 67...|
+---+--------------------+



In [11]:
display(rdd)
df.take(5)

ParallelCollectionRDD[0] at readRDDFromFile at PythonRDD.scala:247

[Row(_1=[0], _2=[[9, 40], [18, 63], [82, 99], [98, 4], [4, 39], [92, 18], [5, 73], [1, 67], [50, 27], [83, 84], [93, 77], [98, 50], [51, 50], [32, 48], [32, 38]]),
 Row(_1=[1], _2=[[18, 63], [82, 99], [93, 77], [32, 48], [92, 18], [32, 38], [98, 50], [83, 84], [1, 67], [9, 40], [50, 27], [4, 39], [51, 50], [5, 73], [98, 4]]),
 Row(_1=[2], _2=[[1, 67], [32, 48], [98, 4], [82, 99], [92, 18], [50, 27], [93, 77], [83, 84], [9, 40], [98, 50], [51, 50], [5, 73], [32, 38], [18, 63], [4, 39]]),
 Row(_1=[3], _2=[[93, 77], [51, 50], [92, 18], [1, 67], [32, 48], [98, 50], [98, 4], [5, 73], [4, 39], [9, 40], [83, 84], [82, 99], [18, 63], [50, 27], [32, 38]]),
 Row(_1=[4], _2=[[32, 48], [1, 67], [93, 77], [4, 39], [50, 27], [83, 84], [32, 38], [98, 50], [18, 63], [9, 40], [98, 4], [92, 18], [82, 99], [5, 73], [51, 50]])]

In [12]:
ordered_rdd, ordered_df = rankRoutes(rdd, df)

In [13]:
ordered_df.show(5)

+---+--------------------+------------+
| _1|                  _2|RouteFitness|
+---+--------------------+------------+
|[0]|[[9, 40], [18, 63...|0.0033347793|
|[4]|[[32, 48], [1, 67...|  0.00332367|
|[3]|[[93, 77], [51, 5...|0.0032342025|
|[1]|[[18, 63], [82, 9...| 0.003132244|
|[2]|[[1, 67], [32, 48...|0.0025189673|
+---+--------------------+------------+



In [14]:
ordered_rdd.take(5)

[(0.0033347792968817635,
  [[9, 40],
   [18, 63],
   [82, 99],
   [98, 4],
   [4, 39],
   [92, 18],
   [5, 73],
   [1, 67],
   [50, 27],
   [83, 84],
   [93, 77],
   [98, 50],
   [51, 50],
   [32, 48],
   [32, 38]]),
 (0.003323670082718724,
  [[32, 48],
   [1, 67],
   [93, 77],
   [4, 39],
   [50, 27],
   [83, 84],
   [32, 38],
   [98, 50],
   [18, 63],
   [9, 40],
   [98, 4],
   [92, 18],
   [82, 99],
   [5, 73],
   [51, 50]]),
 (0.0032342026091684356,
  [[93, 77],
   [51, 50],
   [92, 18],
   [1, 67],
   [32, 48],
   [98, 50],
   [98, 4],
   [5, 73],
   [4, 39],
   [9, 40],
   [83, 84],
   [82, 99],
   [18, 63],
   [50, 27],
   [32, 38]]),
 (0.0031322441080146144,
  [[18, 63],
   [82, 99],
   [93, 77],
   [32, 48],
   [92, 18],
   [32, 38],
   [98, 50],
   [83, 84],
   [1, 67],
   [9, 40],
   [50, 27],
   [4, 39],
   [51, 50],
   [5, 73],
   [98, 4]]),
 (0.002518967168786486,
  [[1, 67],
   [32, 48],
   [98, 4],
   [82, 99],
   [92, 18],
   [50, 27],
   [93, 77],
   [83, 84],
   [9, 

In [15]:
cumulative_df, cumulative_rdd = selection(ordered_df, ordered_rdd, 5)

In [16]:
cumulative_df.show()

+---+--------------------+------------+--------------------+---------------+
| _1|                  _2|RouteFitness|              cumSum|RelativeFitness|
+---+--------------------+------------+--------------------+---------------+
|[0]|[[9, 40], [18, 63...|0.0033347793|0.003334779292345047|      21.453993|
|[4]|[[32, 48], [1, 67...|  0.00332367|0.006658449303358793|      42.836514|
|[3]|[[93, 77], [51, 5...|0.0032342025|0.009892651811242104|      63.643456|
|[1]|[[18, 63], [82, 9...| 0.003132244|0.013024895917624235|       83.79446|
|[2]|[[1, 67], [32, 48...|0.0025189673|  0.0155438631772995|          100.0|
+---+--------------------+------------+--------------------+---------------+



In [17]:
cumulative_rdd.collect()

[(0.0033347792968817635,
  [[9, 40],
   [18, 63],
   [82, 99],
   [98, 4],
   [4, 39],
   [92, 18],
   [5, 73],
   [1, 67],
   [50, 27],
   [83, 84],
   [93, 77],
   [98, 50],
   [51, 50],
   [32, 48],
   [32, 38]]),
 (0.003323670082718724,
  [[32, 48],
   [1, 67],
   [93, 77],
   [4, 39],
   [50, 27],
   [83, 84],
   [32, 38],
   [98, 50],
   [18, 63],
   [9, 40],
   [98, 4],
   [92, 18],
   [82, 99],
   [5, 73],
   [51, 50]]),
 (0.0032342026091684356,
  [[93, 77],
   [51, 50],
   [92, 18],
   [1, 67],
   [32, 48],
   [98, 50],
   [98, 4],
   [5, 73],
   [4, 39],
   [9, 40],
   [83, 84],
   [82, 99],
   [18, 63],
   [50, 27],
   [32, 38]]),
 (0.0031322441080146144,
  [[18, 63],
   [82, 99],
   [93, 77],
   [32, 48],
   [92, 18],
   [32, 38],
   [98, 50],
   [83, 84],
   [1, 67],
   [9, 40],
   [50, 27],
   [4, 39],
   [51, 50],
   [5, 73],
   [98, 4]]),
 (0.002518967168786486,
  [[1, 67],
   [32, 48],
   [98, 4],
   [82, 99],
   [92, 18],
   [50, 27],
   [93, 77],
   [83, 84],
   [9, 