# BDA Project
- Romain Claret
- Jämes Ménétrey
- Damien Rochat

### Load PySpark

In [1]:
import os

# findspark.init() looks up the Spark installation through SPARK_HOME, so the
# variable has to be exported before the call rather than after it.
os.environ["SPARK_HOME"] = "/opt/spark"

import findspark
findspark.init()

import pyspark

# Alternative way of sizing the driver, kept for reference:
#memory = '4g'
#pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
#os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

#--driver-maxResultSize 10g --executor-memory 4g

from pyspark import SparkContext, SparkConf

# Local-mode context that uses every available core. The network timeout and
# the heartbeat interval are raised to very large values for the long-running
# K-Means fits further down.
conf = (
    SparkConf()
    .setAppName("bda-spark-fare-clustering")
    .setMaster('local[*]')
    .set('spark.executor.memory', '10g')
    .set('spark.driver.memory', '10g')
    .set('spark.driver.maxResultSize', '10g')
    .set('spark.network.timeout', '1000000000')
    .set('spark.executor.heartbeatInterval', '1000000000')
)
sc = SparkContext(conf=conf)

### Check config

In [2]:
# Use the public accessor instead of the private `_conf` attribute.
sc.getConf().getAll()

[('spark.executor.memory', '10g'),
 ('spark.driver.memory', '10g'),
 ('spark.driver.host', 'rclaret.tic.heia-fr.ch'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1559845596378'),
 ('spark.app.name', 'bda-spark-fare-clustering'),
 ('spark.driver.maxResultSize', '10g'),
 ('spark.network.timeout', '1000000000'),
 ('spark.executor.heartbeatInterval', '1000000000'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '42617')]

### Check that Spark is working with a small Monte Carlo estimate of π

In [3]:
import random
num_samples = 100000
def inside(p):
    """Draw one random point in the unit square and test it against the unit circle.

    `p` is the element handed over by `filter`; it is not used.
    Returns True when the point lies strictly inside the circle of radius 1.
    """
    point = (random.random(), random.random())
    return sum(c * c for c in point) < 1
# The share of samples that land inside the circle, times 4, estimates pi.
count = (
    sc.parallelize(range(num_samples))
    .filter(inside)
    .count()
)
pi = (4 * count) / num_samples
print(pi)

3.14624


### Load the Dataset

In [4]:
from pyspark.sql import SQLContext
# SQL entry point bound to the running SparkContext; the CSV readers below go through it.
sqlContext = SQLContext(sc)

In [5]:
file_path = "datasets/trip_curated/distributed/trip_curated_1.csv_out"
# The file has no header row, so Spark names the columns _c0, _c1, _c2.
df = (
    sqlContext.read
    .format("csv")
    .option("delimiter", ",")
    .option("header", "false")
    .load(file_path)
)

In [6]:
# Preview the first 20 raw rows to check how the columns were parsed.
df.show(20)

+---+---------+---------+
|_c0|      _c1|      _c2|
+---+---------+---------+
|7.5|-73.97865|40.787735|
|7.5|-73.97865|40.787735|
|7.5|-73.97865|40.787735|
|7.5|-73.97865|40.787735|
|7.5|-73.97865|40.787735|
|7.5|-73.97865|40.787735|
|7.5|-73.97865|40.787735|
|7.5|-73.97865|40.787735|
|7.5|-73.97713| 40.74831|
|7.5|-73.97713| 40.74831|
|7.5|-73.97713| 40.74831|
|7.5|-73.97713| 40.74831|
|7.5|-73.97713| 40.74831|
|7.5|-73.97713| 40.74831|
|7.5|-73.97713| 40.74831|
|7.5|-73.97713| 40.74831|
|6.5|-73.98308|40.762566|
|6.5|-73.98308|40.762566|
|6.5|-73.98308|40.762566|
|6.5|-73.98308|40.762566|
+---+---------+---------+
only showing top 20 rows



In [7]:
def df_maker(file_path):
    """Read one header-less CSV file and keep only its two coordinate columns.

    The first column (`_c0`) is dropped; `_c1` becomes `longitude` and `_c2`
    becomes `latitude`. No type conversion is applied here.
    """
    reader = sqlContext.read.format("csv")
    reader = reader.option("delimiter", ",").option("header", "false")
    coords = reader.load(file_path).drop("_c0")
    for old_name, new_name in (("_c1", "longitude"), ("_c2", "latitude")):
        coords = coords.withColumnRenamed(old_name, new_name)
    return coords

In [10]:
# Row counts noted by the authors from earlier runs (before filtering):
#   all:    180'743'398
#   max 20: 125'403'354
file_path = "datasets/trip_curated/distributed/trip_curated_1.csv_out"
# Load a single file, then inspect its schema, a sample and its size.
df_part = df_maker(file_path)
df_part.printSchema()
df_part.show(2)
print(df_part.count())

root
 |-- longitude: string (nullable = true)
 |-- latitude: string (nullable = true)

+---------+---------+
|longitude| latitude|
+---------+---------+
|-73.97865|40.787735|
|-73.97865|40.787735|
+---------+---------+
only showing top 2 rows

125403354


### Filter the Dataset

In [11]:
# Latitude/longitude bounding box (presumably the New York area, going by the
# names). It is read as module-level state by df_filter and by
# kmeans_centroid_to_json.
ny_lat_min = 40.50214590272583
ny_lat_max = 40.9
ny_lon_min = -74.24354116993825
ny_lon_max = -73.77490985242169

def df_filter(df):
    """Cast the coordinates to float and keep only the rows inside the bounding box.

    Parameters
    ----------
    df : DataFrame
        Must expose `latitude` and `longitude` columns (strings or numbers).

    Returns
    -------
    DataFrame
        Same columns with `latitude` and `longitude` cast to `float`,
        restricted to [ny_lat_min, ny_lat_max] x [ny_lon_min, ny_lon_max]
        (bounds inclusive, read from the notebook globals) and without nulls.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from pyspark.sql.functions import col

    # Address the columns by name. `df.latitude` / `df.longitude` would point
    # at the columns of the *input* frame (not yet cast), not at the cast ones
    # produced by withColumn.
    latitude = col('latitude')
    longitude = col('longitude')
    return (df.withColumn('latitude', latitude.cast('float'))
            .withColumn('longitude', longitude.cast('float'))
            .filter(latitude.between(ny_lat_min, ny_lat_max))
            .filter(longitude.between(ny_lon_min, ny_lon_max))
            .na.drop()
           )

In [12]:
# Row counts noted by the authors from earlier runs (after filtering):
#   all:    177'388'377
#   max 20: 123'059'346
# NOTE(review): this rebinds df_part to its filtered version, so the unfiltered
# frame is no longer reachable under this name once the cell has run.
df_part = df_filter(df_part)
df_part.printSchema()
df_part.show(2)
print(df_part.count())

root
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)

+---------+---------+
|longitude| latitude|
+---------+---------+
|-73.97865|40.787735|
|-73.97865|40.787735|
+---------+---------+
only showing top 2 rows

123059346


In [13]:
# Row counts noted by the authors from earlier runs (after filtering):
#   all:    177'388'377
#   max 20: 123'059'346
file_path = "datasets/trip_curated/distributed/trip_curated_1.csv_out"
# Same pipeline in one expression: load the file, then filter it.
df_test = df_filter(df_maker(file_path))
df_test.printSchema()
df_test.show(2)
print(df_test.count())

root
 |-- longitude: float (nullable = true)
 |-- latitude: float (nullable = true)

+---------+---------+
|longitude| latitude|
+---------+---------+
|-73.97865|40.787735|
|-73.97865|40.787735|
+---------+---------+
only showing top 2 rows

123059346


### Create complete Dataframe from all Dataset files

In [27]:
path_trip = "datasets/trip_curated/distributed_e3/"

# Build the union of every filtered file found in `path_trip`.
dff = None
for e in os.listdir(path_trip):
    print(e)
    # `df` always holds the most recently loaded file, including the first one.
    df = df_filter(df_maker(os.path.join(path_trip, e)))
    dff = df if dff is None else dff.union(df)

# Fail loudly instead of leaving `dff` undefined (or stale from an earlier run)
# when the directory contains no file.
if dff is None:
    raise FileNotFoundError("No dataset files found in " + path_trip)
dff.show(2)

trip_curated_3.csv_out
trip_curated_4.csv_out
trip_curated_8.csv_out
trip_curated_2.csv_out
trip_curated_7.csv_out
trip_curated_9.csv_out
trip_curated_12.csv_out
trip_curated_5.csv_out
trip_curated_1.csv_out
trip_curated_6.csv_out
trip_curated_11.csv_out
trip_curated_10.csv_out
+---------+---------+
|longitude| latitude|
+---------+---------+
|-73.98638|40.749817|
|-73.98638|40.749817|
+---------+---------+
only showing top 2 rows



#### Count of the whole dataset

In [None]:
# Row counts and durations noted by the authors from earlier runs:
#none distributed: 85'147'405
#distributed all: 2'192'938'097 # 7mins
#distributed max 20: 1'461'307'161 # 5mins
#distributed-e3 max 20: 268'806'940 # 50secs
#distributed-e4 max 20: 351'520'721 # 1min
#distributed-e5 max 20: 421'293'569 # 1min

import time
start_time = time.time()

# Count the rows of the unioned frame; the wall-clock time of this call is what gets measured.
print(dff.count())

elapsed_time = time.time() - start_time
# Last expression of the cell: the elapsed time formatted as HH:MM:SS.
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

#### Count of the last dataset file loaded

In [25]:
# Row counts and durations noted by the authors from earlier runs:
#not distributed: 4'834'280
#distributed all: 195'909'011 # 1min
#distributed max 20: 128'396'754 # 24secs
#distributed-e3 max 20: 23'572'539 # 4secs
#distributed-e4 max 20: 30'804'039 # 6secs
#distributed-e5 max 20: 36'932'762 # 7secs

import time
start_time = time.time()

# NOTE(review): `df` is whatever frame was last bound to that name, presumably
# the last file handled by the union loop. Confirm after a fresh Run All.
print(df.count())

# Last expression of the cell: the elapsed time formatted as HH:MM:SS.
time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))

36932762


'00:00:07'

## Using K-Means to show Clusters
- pickup cluster

### Cluster fitter function

In [28]:
from pyspark.ml.feature import VectorAssembler
def calculate_kmeans_cluster(df, model, feature_1, feature_2, cluser_name):
    """Label every row of `df` with the cluster that `model` predicts for it.

    The two feature columns are assembled into a temporary `features` vector
    column, the model is applied, its `prediction` column is renamed to
    `cluser_name`, and the temporary `features` column is dropped again.
    """
    assembler = VectorAssembler(inputCols=[feature_1, feature_2], outputCol="features")
    predicted = model.transform(assembler.transform(df))
    return predicted.withColumnRenamed('prediction', cluser_name).drop('features')

### K-Means model building function

In [29]:
#https://jaceklaskowski.gitbooks.io/mastering-apache-spark/spark-mllib/spark-mllib-KMeans.html
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

def build_kmeans_model(df=0,k_n=2,iter_n=20,steps_n=5,tol_n=1e-4,feature_1="",feature_2="",seed=1):
    """Fit a K-Means model on two columns of `df`.

    Parameters
    ----------
    df : DataFrame
        Input data; must contain the two feature columns. The default `0` is
        only a placeholder and is rejected.
    k_n : int
        Number of clusters.
    iter_n : int
        Maximum number of iterations.
    steps_n : int
        Number of initialisation steps.
    tol_n : float
        Convergence tolerance.
    feature_1, feature_2 : str
        Names of the columns assembled, in this order, into the feature vector.
    seed : int
        Random seed passed to KMeans; the default `1` matches the value that
        used to be hard-coded.

    Returns
    -------
    The fitted model returned by `KMeans.fit`.

    Raises
    ------
    TypeError
        If `df` is not a DataFrame-like object (no `columns` attribute).
    ValueError
        If a feature name is empty or is not a column of `df`.
    """
    # Validate up front: the placeholder defaults would otherwise only fail
    # deep inside Spark with a hard-to-read error.
    if not hasattr(df, "columns"):
        raise TypeError("df must be a Spark DataFrame, got %r" % (df,))
    if not feature_1 or not feature_2:
        raise ValueError("feature_1 and feature_2 must both be column names")
    missing = [name for name in (feature_1, feature_2) if name not in df.columns]
    if missing:
        raise ValueError("columns not found in df: %s" % ", ".join(missing))

    vecAssembler = VectorAssembler(inputCols=[feature_1, feature_2], outputCol="features")
    vec_df = vecAssembler.transform(df)

    kmeans = KMeans(k=k_n, seed=seed, maxIter=iter_n, initSteps=steps_n, tol=tol_n)

    # Only the assembled vector column is needed for fitting.
    return kmeans.fit(vec_df.select('features'))

### Serialization of the clusters from the small dataframe

In [30]:
import pandas as pd
def kmeans_centroid_to_json(model, path, lat_bounds=None, lon_bounds=None):
    """Write the centroids of `model` that fall inside a bounding box to a JSON file.

    Parameters
    ----------
    model :
        Fitted clustering model exposing `clusterCenters()`.
    path : str
        Destination file. Its parent directory is created when missing.
    lat_bounds, lon_bounds : (min, max) tuples, optional
        Exclusive bounds applied to column 0 and column 1 of the centroids.
        They default to the notebook globals `ny_lat_min`/`ny_lat_max` and
        `ny_lon_min`/`ny_lon_max`.

    Notes
    -----
    Column 0 is compared with the latitude bounds and column 1 with the
    longitude bounds, so the model is assumed to have been fitted on features
    assembled in [latitude, longitude] order.
    The file holds a list of [col0, col1] pairs (`orient='values'`).
    """
    if lat_bounds is None:
        lat_bounds = (ny_lat_min, ny_lat_max)
    if lon_bounds is None:
        lon_bounds = (ny_lon_min, ny_lon_max)
    lat_min, lat_max = lat_bounds
    lon_min, lon_max = lon_bounds

    print("Gets centroids")
    centers = model.clusterCenters()
    print("To pd.df")
    centers_df = pd.DataFrame(centers)#, columns=["latitude","longitude"])
    print("Filters")
    centers_df = centers_df[(centers_df[1] < lon_max) &
                     (centers_df[1] > lon_min) &
                     (centers_df[0] < lat_max) &
                     (centers_df[0] > lat_min)
                    ]
    print("To Json")
    # Create the output directory first: the fit that precedes this call can
    # take a long time, so a missing folder should not lose the result.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    centers_df.to_json(path,orient='values')
    print(centers_df)

### Generate K-Means model centroids

In [33]:
import time
def generate_centroids_kmeans_models(df, k_start, k_end, name):
    """Fit one K-Means model per k and export the centroids of each to JSON.

    Parameters
    ----------
    df : DataFrame
        Must contain `latitude` and `longitude` columns.
    k_start, k_end : int
        Range of cluster counts to try; `k_end` is exclusive.
    name : str
        File-name prefix. Centroids are written to `json/<name><k>.json`.

    The elapsed time of each step is printed. It is measured from the start of
    the current k, so the function no longer depends on a global `start_time`.
    """
    for k_value in range(k_start, k_end):
        step_start = time.time()

        print("Builds KMeans with k =",k_value)
        df_model = build_kmeans_model(df,k_n=k_value,iter_n=20,
                                      steps_n=5,tol_n=1e-4,
                                      feature_1="latitude",
                                      feature_2="longitude")
        print("Elapsed:", time.strftime("%H:%M:%S", time.gmtime(time.time() - step_start)))

        print("Calculates KMeans Clusters...")
        # The labelled frame is not used afterwards; the call is kept so the
        # model is still applied to the input as before.
        calculate_kmeans_cluster(df, df_model, "latitude", "longitude", "pickup_cluster")
        print("Elapsed:", time.strftime("%H:%M:%S", time.gmtime(time.time() - step_start)))

        print("KMeans Centroids to Json...")
        kmeans_centroid_to_json(df_model,"json/"+name+str(k_value)+".json")

# Fit and export the k = 5 model (the end of the range, 6, is exclusive) and time the whole run.
start_time = time.time()
generate_centroids_kmeans_models(dff, 5, 6, "trip_fare_clusters_e3_k")
# Last expression of the cell: total duration formatted as HH:MM:SS.
time.strftime("Done with: %H:%M:%S", time.gmtime(time.time() - start_time))

Builds KMeans with k = 5
Calculates KMeans Clusters...
KMeans Centroids to Json...
Gets centroids
To pd.df
Filters
To Json
           0          1
0  40.755531 -73.982636
1  40.646847 -73.785576
2  40.725290 -73.995518
3  40.768084 -73.874792
4  40.780146 -73.960625


'Done with: 00:58:24'

In [35]:
# Fit and export the k = 10 model (the end of the range, 11, is exclusive) and time the whole run.
start_time = time.time()
generate_centroids_kmeans_models(dff, 10, 11, "trip_fare_clusters_e3_k")
# Last expression of the cell: total duration formatted as HH:MM:SS.
time.strftime("Done with: %H:%M:%S", time.gmtime(time.time() - start_time))

Builds KMeans with k = 10
Calculates KMeans Clusters...
KMeans Centroids to Json...
Gets centroids
To pd.df
Filters
To Json
           0          1
0  40.747648 -73.994729
1  40.646844 -73.785480
2  40.756795 -73.975603
3  40.729976 -73.984062
4  40.720658 -74.004847
5  40.769042 -73.871644
6  40.800799 -73.959581
7  40.772847 -73.954951
8  40.685978 -73.979770
9  40.775291 -73.981767


'Done with: 01:25:28'

In [34]:
# Fit and export the k = 15 model (the end of the range, 16, is exclusive) and time the whole run.
start_time = time.time()
generate_centroids_kmeans_models(dff, 15, 16, "trip_fare_clusters_e3_k")
# Last expression of the cell: total duration formatted as HH:MM:SS.
time.strftime("Done with: %H:%M:%S", time.gmtime(time.time() - start_time))

Builds KMeans with k = 15
Calculates KMeans Clusters...
KMeans Centroids to Json...
Gets centroids
To pd.df
Filters
To Json
            0          1
0   40.756422 -73.991603
1   40.646829 -73.785461
2   40.755930 -73.923114
3   40.713731 -73.953623
4   40.714500 -74.008769
5   40.770124 -73.869128
6   40.781603 -73.978466
7   40.738943 -74.001724
8   40.683958 -73.985319
9   40.758963 -73.977791
10  40.762862 -73.964856
11  40.805768 -73.956833
12  40.777538 -73.954485
13  40.726240 -73.990271
14  40.743062 -73.983651


'Done with: 01:18:05'

In [36]:
# Fit and export the k = 20 model (the end of the range, 21, is exclusive) and time the whole run.
start_time = time.time()
generate_centroids_kmeans_models(dff, 20, 21, "trip_fare_clusters_e3_k")
# Last expression of the cell: total duration formatted as HH:MM:SS.
time.strftime("Done with: %H:%M:%S", time.gmtime(time.time() - start_time))

Builds KMeans with k = 20
Calculates KMeans Clusters...
KMeans Centroids to Json...
Gets centroids
To pd.df
Filters
To Json
            0          1
0   40.760210 -73.969472
1   40.646827 -73.785461
2   40.748366 -73.991415
3   40.710752 -74.011119
4   40.759314 -73.989555
5   40.769942 -73.869142
6   40.769493 -73.982383
7   40.683975 -73.985201
8   40.804396 -73.958815
9   40.737686 -73.985931
10  40.713750 -73.953607
11  40.832880 -73.939235
12  40.781361 -73.951920
13  40.725137 -74.001710
14  40.770421 -73.959085
15  40.740855 -74.002829
16  40.787102 -73.974751
17  40.724859 -73.988291
18  40.750497 -73.977244
19  40.755760 -73.923663


'Done with: 01:32:12'

In [None]:
# Fit and export the k = 7 model (the end of the range, 8, is exclusive) and time the whole run.
# NOTE(review): the saved output stops after the first progress line, so this run looks unfinished.
start_time = time.time()
generate_centroids_kmeans_models(dff, 7, 8, "trip_fare_clusters_e3_k")
# Last expression of the cell: total duration formatted as HH:MM:SS.
time.strftime("Done with: %H:%M:%S", time.gmtime(time.time() - start_time))

Builds KMeans with k = 7


In [None]:
# Fit and export the k = 12 model (the end of the range, 13, is exclusive) and time the whole run.
# NOTE(review): no output is saved for this cell, so it looks like it was never executed.
start_time = time.time()
generate_centroids_kmeans_models(dff, 12, 13, "trip_fare_clusters_e3_k")
# Last expression of the cell: total duration formatted as HH:MM:SS.
time.strftime("Done with: %H:%M:%S", time.gmtime(time.time() - start_time))

In [None]:
# Fit and export the k = 17 model (the end of the range, 18, is exclusive) and time the whole run.
# NOTE(review): no output is saved for this cell, so it looks like it was never executed.
start_time = time.time()
generate_centroids_kmeans_models(dff, 17, 18, "trip_fare_clusters_e3_k")
# Last expression of the cell: total duration formatted as HH:MM:SS.
time.strftime("Done with: %H:%M:%S", time.gmtime(time.time() - start_time))