# BDA Project
- Romain Claret
- Jämes Ménétrey
- Damien Rochat

### Load PySpark

In [1]:
import os
import findspark
findspark.init()

import pyspark

#memory = '4g'
#pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["SPARK_HOME"] = "/opt/spark"
#os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

#--driver-maxResultSize 10g --executor-memory 4g

from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("bda-spark-fare")
conf = (conf.setMaster('local[*]')
        .set('spark.executor.memory', '4g')
        .set('spark.driver.memory', '4g')
        .set('spark.driver.maxResultSize', '4g'))
sc = SparkContext(conf=conf)

### Check config

In [2]:
sc._conf.getAll()

[('spark.driver.port', '40061'),
 ('spark.driver.memory', '4g'),
 ('spark.rdd.compress', 'True'),
 ('spark.executor.memory', '4g'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.host', 'rclaret.tic.heia-fr.ch'),
 ('spark.app.id', 'local-1559776876221'),
 ('spark.app.name', 'bda-spark-fare'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.maxResultSize', '4g')]

### Check if Spark is working with a little PI calculation using monte carlo

In [3]:
import random
num_samples = 100000
def inside(p):     
  x, y = random.random(), random.random()
  return x*x + y*y < 1
count = sc.parallelize(range(0, num_samples)).filter(inside).count()
pi = 4 * count / num_samples
print(pi)

3.13444


### Load the Dataset

In [4]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

### Filter the Dataset

In [5]:
def df_maker_fare(fare_path):
    df_fare = (sqlContext.read.format("csv")
                .option("delimiter", ",")
                .option("header", "true")
                .load(fare_path)
                .drop("medallion")
                .drop(" hack_license")
                .drop("hack_license")
                .drop(" vendor_id")
                .drop(" pickup_datetime")
                .drop(" payment_type")
                .drop(" tip_amount")
                .drop(" tolls_amount")
                .drop(" total_amount")
               .drop(" mta_tax")
               .withColumnRenamed(" fare_amount", "fare_amount")
               .withColumnRenamed(" surcharge", "surcharge")
              )
    return (df_fare.withColumn("fare_total",df_fare.fare_amount+df_fare.surcharge)
               .drop("fare_amount")
               .drop("surcharge")
              )

df_fare = df_maker_fare("datasets/trip_fare/trip_fare_1.csv")

df_fare.printSchema()

root
 |-- fare_total: double (nullable = true)



In [6]:
ny_lat_min = 40.50214590272583
ny_lat_max = 40.9#40.75977082462501
ny_lon_min = -74.24354116993825
ny_lon_max = -73.77490985242169
    
def df_maker_data(data_path):
    df_data = (sqlContext.read.format("csv")
                .option("delimiter", ",")
                .option("header", "true")
                .load(data_path)
                .drop("medallion")
                .drop(" hack_license")
                .drop("hack_license")
                .drop(" vendor_id")
                .drop("vendor_id")
               .drop(" rate_code")
                .drop("rate_code")
               .drop(" store_and_fwd_flag")
                .drop("store_and_fwd_flag")
               .drop(" pickup_datetime")
                .drop("pickup_datetime")
               .drop(" dropoff_datetime")
                .drop("dropoff_datetime")
               .drop(" passenger_count")
                .drop("passenger_count")
               .drop(" trip_time_in_secs")
                .drop("trip_time_in_secs")
               .drop(" trip_distance")
                .drop("trip_distance")
               .drop(" dropoff_longitude")
                .drop("dropoff_longitude")
               .drop(" dropoff_latitude")
                .drop("dropoff_latitude")
               .withColumnRenamed(" pickup_longitude", "pickup_longitude")
               .withColumnRenamed(" pickup_latitude", "pickup_latitude")
                
              )
    return (df_data.withColumn('pickup_longitude', df_data['pickup_longitude'].cast('float')) #convert str to float
            .withColumn('pickup_latitude', df_data['pickup_latitude'].cast('float'))
            .filter(df_data.pickup_latitude<=ny_lat_max)
            .filter(df_data.pickup_latitude>=ny_lat_min)
            .filter(df_data.pickup_longitude<=ny_lon_max)
            .filter(df_data.pickup_longitude>=ny_lon_min)
           )

df_data = df_maker_data("datasets/trip_data/trip_data_1.csv")
df_data.printSchema()

root
 |-- pickup_longitude: float (nullable = true)
 |-- pickup_latitude: float (nullable = true)



In [31]:
#df_fare.count()

root
 |--  fare_amount: string (nullable = true)
 |--  surcharge: string (nullable = true)



In [25]:
#df_data.count()

root
 |--  vendor_id: string (nullable = true)
 |--  pickup_datetime: string (nullable = true)
 |--  payment_type: string (nullable = true)
 |--  fare_amount: string (nullable = true)
 |--  surcharge: string (nullable = true)
 |--  mta_tax: string (nullable = true)
 |--  tip_amount: string (nullable = true)
 |--  tolls_amount: string (nullable = true)
 |--  total_amount: string (nullable = true)



In [7]:
from pyspark.sql.functions import monotonically_increasing_id
def df_combiner(df_fare, df_data):
    df_fare_id = df_fare.withColumn("id", monotonically_increasing_id())
    df_data_id = df_data.withColumn("id", monotonically_increasing_id())
    return df_fare_id.join(df_data_id, "id", "outer").drop("id")
#df_merged = df_combiner(df_fare, df_data)
#df_merged.printSchema()
#df_merged.show(1)

### Create complete Dataframe from all Dataset files

In [9]:
path_trip = "datasets/trip_data/"
path_fare = "datasets/trip_fare/"

fare_files = os.listdir(path_fare)

for idx,e in enumerate(os.listdir(path_trip)):
    trip_file = path_trip+"trip_data_"+str(idx+1)+".csv"
    fare_file = path_fare+"trip_fare_"+str(idx+1)+".csv"

    if idx == 0:
        dff = df_combiner(df_maker_fare(fare_file),
                          df_maker_data(trip_file))
    else:
        df = df_combiner(df_maker_fare(fare_file),
                          df_maker_data(trip_file))
        dff= dff.union(df)

dff.show(2)

+----------+----------------+---------------+
|fare_total|pickup_longitude|pickup_latitude|
+----------+----------------+---------------+
|       7.5|       -73.97865|      40.787735|
|       7.5|       -73.97713|       40.74831|
+----------+----------------+---------------+
only showing top 2 rows



#### Count of the whole dataset

In [10]:
dff.count()

173264090

#### Count of the unique last dataset load

In [11]:
#original 7774669
df.count()

13977692

## Playing with a smaller dataset: 1/9

## Using K-Means to show Clusters
- pickup cluster
- dropoff cluster

### Cluster fitter function

In [10]:
from pyspark.ml.feature import VectorAssembler
def calculate_kmeans_cluster(df, model, feature_1, feature_2, cluser_name):
    vecAssembler = VectorAssembler(inputCols=[feature_1, feature_2], outputCol="features")
    df_features = vecAssembler.transform(df)
    
    df = model.transform(df_features)
    
    df = df.withColumnRenamed('prediction', cluser_name)
    df = df.drop('features')
    return df

### K-Means model building function

In [11]:
#https://jaceklaskowski.gitbooks.io/mastering-apache-spark/spark-mllib/spark-mllib-KMeans.html
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

def build_kmeans_model(df=0,k_n=2,iter_n=20,steps_n=5,tol_n=1e-4,feature_1="",feature_2=""):
    vecAssembler = VectorAssembler(inputCols=[feature_1, feature_2], outputCol="features")
    vec_df = vecAssembler.transform(df)
    
    kmeans = KMeans(k=k_n, seed=1, maxIter=iter_n, initSteps=steps_n, tol=tol_n)
    
    model = kmeans.fit(vec_df.select('features'))
    return model