# Main code

In some places we could not get `rdd.map` to work, so we fell back to Python's built-in `map`, which runs on the driver. Unfortunately this means those steps do not benefit from PySpark's distributed execution, but it still lets us answer the research questions.


#### HDFS
The data can also be read from HDFS using:

data = sc.textFile('hdfs://scomp1334:9000/user/steen176/csv/butterflies.csv')


#### addPyFile
Ships the module containing the classes used to store species and observations to the workers:

sc.addPyFile("classes.py")


In [7]:
from classes import Species, Observation
import subprocess as sp

if __name__ == "__main__":
    # Driver script: load the observations, cluster every species, and write
    # the cluster centers of the rarest species that lie close to a cluster
    # center of any other species to output.tsv.

    # Ship the Species/Observation class definitions to the workers.
    sc.addPyFile("classes.py")

    data = sc.textFile("/home/WUR/steen176/Downloads/butterflies.csv")
    corrected = data.map(lambda line: correctLine(line))

    # Module-level binding: getObservation() and cluster() read this global.
    species = extractSpecies(corrected)

    # rdd.map did not work here, so the observations are attached on the
    # driver. A plain loop is used because the call is made purely for its
    # side effect; the built-in map() is lazy on Python 3 and would never
    # invoke getObservation at all.
    for line in corrected.toLocalIterator():
        getObservation(line)

    # rdd.map did not work here either: train one model per species on the
    # driver and keep the (name, model) pairs in a concrete list so they can
    # be traversed more than once.
    clusters = [cluster(name) for name in species]

    # Get the list of rare species (fewest observations first).
    top = 1
    rare_specs = sorted(
        [(name, len(species[name].observations)) for name in species],
        key=lambda x: x[1])[0:top]

    # Currently only for the rarest species.
    rarest_name = rare_specs[0][0]
    # Reuse the model trained above. Re-running cluster() would retrain with
    # random initialisation and could yield different centers than the ones
    # stored in `clusters`.
    rarest_cluster = dict(clusters)[rarest_name].clusterCenters
    rarest = (rarest_name, rarest_cluster)

    # Compare against every other species; exclude the rarest one by name
    # rather than by comparing center coordinates.
    filtered_clusters = [(name, [model.clusterCenters])
                         for name, model in clusters
                         if name != rarest_name]

    # Output: raw matches first, then a header plus de-duplicated rows.
    with open("out.tsv", "w") as out_file:
        for x in findCombinations(rarest, filtered_clusters):
            out_file.write(x)

    sp.call("echo 'rare_species\tclusters_with\tlongtitude\tlattitude' > output.tsv; sort -u out.tsv >> output.tsv", shell = True)
   
    

# Utility

In [4]:
def getSpecies(line):
    """Return the first comma-separated field of `line` (the species name)."""
    first_field = line.split(',')[0]
    return first_field

def correctLine(line):
    """Normalise a CSV record so that fields 4 and 5 form a single field.

    A record that already has exactly 10 comma-separated fields is returned
    unchanged. Any other record gets fields 4 and 5 joined with a space
    (presumably a value that itself contained a comma) and is re-assembled
    with commas.
    """
    fields = line.split(',')
    if len(fields) == 10:
        # Splitting and re-joining on the same separator yields the input.
        return line
    merged = " ".join(fields[4:6])
    return ','.join(fields[:4] + [merged] + fields[6:])

def getObservation(line):
    """Attach the observations described by one CSV record to its species.

    The record is split on commas. Field 0 selects the entry in the
    module-level `species` dict, the integer in field 5 is the number of
    Observation objects to add, and fields 8 and 9 are passed (as strings)
    as the two Observation arguments.

    Returns None; the `species` entry is mutated in place.
    """
    fields = line.split(',')
    specie = fields[0]
    count = int(fields[5])
    if count <= 0:
        # Nothing to add; avoid touching the remaining fields at all.
        return

    # Loop-invariant lookups are done once instead of once per observation.
    longti = fields[8]
    lati = fields[9]
    target = species[specie]
    # range() works on both Python 2 and 3 (xrange is Python 2 only).
    for _ in range(count):
        target.addObservation(Observation(longti, lati))
    
    
def createSpecieObjects(species_list):
    """Build a dict mapping each name in `species_list` to a new Species."""
    return {name: Species(name) for name in species_list}


def extractSpecies(rdd):
    """Return a dict of Species objects, one per distinct name in `rdd`.

    The species name of every record is extracted with getSpecies, the names
    are pulled to the driver and de-duplicated, and createSpecieObjects
    builds the resulting name -> Species mapping.
    """
    names = rdd.map(getSpecies).toLocalIterator()
    unique_names = list(set(names))
    return createSpecieObjects(unique_names)


# Clustering

In [3]:
from pyspark.mllib.clustering import KMeans, KMeansModel
from pyspark.mllib.linalg import SparseVector
from math import sqrt
NUM_CLUSTER = 4

def kmeans(sparse_vectors):
    """Train a KMeans model with NUM_CLUSTER clusters on `sparse_vectors`.

    `sparse_vectors` should be an RDD containing sparse vectors. The result
    of KMeans.train is returned unchanged.
    """
    training_options = dict(
        maxIterations=20,
        runs=100,
        initializationMode="random",
    )
    return KMeans.train(sparse_vectors, NUM_CLUSTER, **training_options)

def cluster(specie):
    """Cluster the observations of one species.

    Looks `specie` up in the module-level `species` dict, parallelizes the
    result of its getVectorRDD() call and trains a model on it via kmeans().

    Returns a (specie, model) tuple.
    """
    vector_rdd = sc.parallelize(species[specie].getVectorRDD())
    model = kmeans(vector_rdd)
    return (specie, model)


    
def error(point):
    """Return the Euclidean distance from `point` to its predicted center.

    Reads a module-level `clusters` object and uses its `predict()` method
    and `centers` sequence. `point` must provide `toArray()`.

    NOTE(review): the main script binds `clusters` to a list of
    (name, model) tuples, so this helper looks like a leftover from a
    single-model experiment - confirm what `clusters` is before calling it.

    The earlier version built DenseVector objects, a name that is not
    imported in the visible part of this file, and carried two statements
    after the `return` that could never run. They computed the sum of this
    error over a `parsedData` RDD and printed it.
    """
    center = clusters.centers[clusters.predict(point)]
    # Plain numpy arithmetic gives the same element-wise differences without
    # needing DenseVector.
    diff = numpy.asarray(point.toArray()) - numpy.asarray(center)
    return sqrt(sum(d ** 2 for d in diff))

# Comparing clusters

In [6]:
from itertools import combinations
import numpy 
def centerdiffs(centerA, centerB, DIFF=0.1):
    """Tell whether two cluster centers lie within DIFF of each other.

    Both the first and the second coordinate of the centers must differ by
    strictly less than DIFF (in absolute value) for the result to be True.
    """
    first_delta = numpy.subtract(centerA[0], centerB[0])
    second_delta = numpy.subtract(centerA[1], centerB[1])

    # |delta| < DIFF is the same test as -DIFF < delta < DIFF.
    close_enough = abs(first_delta) < DIFF and abs(second_delta) < DIFF
    return bool(close_enough)
    

def findCombinations(rare_specie, all_species_centers):
    """Yield one TSV row per rare-species center that overlaps another center.

        Keyword Arguments:
            rare_specie -- (name, centers) pair for the rare species,
                    format: (u'Cuculus canorus', Array())
            all_species_centers -- iterable of (name, [centers]) pairs for the
                    other species; the centers sit in a one-element list
        Yields:
            "rare_name<TAB>other_name<TAB>center[0]<TAB>center[1]<NEWLINE>"
            for every pair of centers that centerdiffs() accepts.
    """
    rare_name = rare_specie[0]
    rare_centers = rare_specie[1]
    row_template = "{}\t{}\t{}\t{}\n"

    for other in all_species_centers:
        other_name = other[0]
        other_centers = other[1][0]
        for center in rare_centers:
            for other_center in other_centers:
                if not centerdiffs(center, other_center):
                    continue
                yield row_template.format(
                    rare_name, other_name, str(center[0]), str(center[1]))