# BDA Project
- Romain Claret
- Jämes Ménétrey
- Damien Rochat

### Load PySpark

In [1]:
import os
import findspark
# Must run before `import pyspark`: findspark is used here to make the
# pyspark package importable from the local Spark installation.
findspark.init()

import pyspark

# Disabled alternative: pass the driver memory through PYSPARK_SUBMIT_ARGS
# instead of through SparkConf.
#memory = '4g'
#pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
# NOTE(review): this assignment happens after findspark.init() has already
# run, so it cannot influence that call — confirm whether it should move up.
os.environ["SPARK_HOME"] = "/opt/spark"
#os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

# Further submit arguments that were considered but are not applied:
#--driver-maxResultSize 10g --executor-memory 4g

from pyspark import SparkContext, SparkConf

# Build the Spark configuration step by step: application name first, then a
# local master ("local[*]"), then the tuning properties.
conf = SparkConf()
conf.setAppName("bda-spark-fare-tmp")
conf.setMaster("local[*]")

# Memory limits for executor/driver/collected results, plus very large
# network timeout and heartbeat interval values.
conf.setAll([
    ("spark.executor.memory", "10g"),
    ("spark.driver.memory", "10g"),
    ("spark.driver.maxResultSize", "10g"),
    ("spark.network.timeout", "1000000000"),
    ("spark.executor.heartbeatInterval", "1000000000"),
])

# The context every later cell relies on.
sc = SparkContext(conf=conf)

### Check config

In [2]:
# Display the configuration held by the running context as a list of
# (key, value) pairs. Note that this reads the context's private `_conf`
# attribute.
sc._conf.getAll()

[('spark.executor.memory', '10g'),
 ('spark.driver.port', '33469'),
 ('spark.app.id', 'local-1559822973793'),
 ('spark.driver.memory', '10g'),
 ('spark.driver.host', 'rclaret.tic.heia-fr.ch'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.maxResultSize', '10g'),
 ('spark.app.name', 'bda-spark-fare-tmp'),
 ('spark.network.timeout', '1000000000'),
 ('spark.executor.heartbeatInterval', '1000000000'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

### Check that Spark is working with a small Monte Carlo estimate of π

In [3]:
import random
# Number of random points drawn for the Monte Carlo estimate of pi.
num_samples = 100000
def inside(p):
    """Draw one random point in the unit square and test it against the unit circle.

    Args:
        p: Element handed over by the caller; it is not used, the point is
            drawn independently of it.

    Returns:
        True when the drawn point satisfies x^2 + y^2 < 1, False otherwise.
    """
    x = random.random()
    y = random.random()
    return (x * x + y * y) < 1
# Distribute the sample indices, keep those whose random point lands inside
# the unit circle, and count them.
count = (
    sc.parallelize(range(num_samples))
    .filter(inside)
    .count()
)
# The hit ratio approximates pi / 4 (quarter circle over unit square).
pi = count * 4 / num_samples
print(pi)

3.14236


### Load the Dataset

In [4]:
from pyspark.sql import SQLContext
# SQL entry point built on the existing SparkContext `sc`.
sqlContext = SQLContext(sc)

### Filter the Dataset

In [5]:
def df_maker_fare(fare_path):
    """Load a trip-fare CSV and replace fare and surcharge by their sum.

    Args:
        fare_path: Path of the comma-separated file (with header row) handed
            to the Spark CSV reader.

    Returns:
        A DataFrame without the columns listed in ``unused_columns`` and with
        ``fare_amount`` and ``surcharge`` replaced by a ``fare_total`` column
        holding their sum.
    """
    # Most names carry a leading space, presumably because the CSV header has
    # a space after each comma (TODO confirm). Names that are not present in
    # the file are simply ignored by ``drop``.
    unused_columns = (
        "medallion",
        " hack_license",
        "hack_license",
        " vendor_id",
        " pickup_datetime",
        " payment_type",
        " tip_amount",
        " tolls_amount",
        " total_amount",
        " mta_tax",
    )

    reader = (sqlContext.read.format("csv")
              .option("delimiter", ",")
              .option("header", "true"))
    raw = reader.load(fare_path)

    # Drop what is not needed, then strip the leading space from the two
    # columns that are kept.
    df_fare = (raw.drop(*unused_columns)
               .withColumnRenamed(" fare_amount", "fare_amount")
               .withColumnRenamed(" surcharge", "surcharge"))

    # No schema is supplied to the reader, so this addition presumably relies
    # on Spark casting the string columns to numbers — TODO confirm.
    fare_total = df_fare["fare_amount"] + df_fare["surcharge"]
    return df_fare.withColumn("fare_total", fare_total).drop("fare_amount", "surcharge")

In [6]:
# Latitude/longitude bounds applied to the pickup coordinates (the `ny_`
# prefix suggests the New York area).
ny_lat_min = 40.50214590272583
ny_lat_max = 40.9  # alternative bound kept for reference: 40.75977082462501
ny_lon_min = -74.24354116993825
ny_lon_max = -73.77490985242169


def df_maker_data(data_path):
    """Load a trip-data CSV and keep only pickup coordinates inside the bounds.

    Args:
        data_path: Path of the comma-separated file (with header row) handed
            to the Spark CSV reader.

    Returns:
        A DataFrame without the columns listed below, with
        ``pickup_longitude`` and ``pickup_latitude`` cast to float, restricted
        to rows that pass the four ``ny_*`` bound checks.
    """
    # Every column except "medallion" is dropped under two spellings: with and
    # without a leading space. Spellings absent from the file are ignored.
    unused_columns = ["medallion"]
    for name in (
        "hack_license",
        "vendor_id",
        "rate_code",
        "store_and_fwd_flag",
        "pickup_datetime",
        "dropoff_datetime",
        "passenger_count",
        "trip_time_in_secs",
        "trip_distance",
        "dropoff_longitude",
        "dropoff_latitude",
    ):
        unused_columns.append(" " + name)
        unused_columns.append(name)

    raw = (sqlContext.read.format("csv")
           .option("delimiter", ",")
           .option("header", "true")
           .load(data_path))
    df_data = (raw.drop(*unused_columns)
               .withColumnRenamed(" pickup_longitude", "pickup_longitude")
               .withColumnRenamed(" pickup_latitude", "pickup_latitude"))

    longitude = df_data["pickup_longitude"]
    latitude = df_data["pickup_latitude"]

    # Convert the coordinates from str to float.
    typed = (df_data.withColumn("pickup_longitude", longitude.cast("float"))
             .withColumn("pickup_latitude", latitude.cast("float")))

    # The conditions are built from df_data's columns, i.e. the columns as
    # they were before the float cast.
    # NOTE(review): confirm which values Spark actually compares here.
    return (typed.filter(latitude <= ny_lat_max)
            .filter(latitude >= ny_lat_min)
            .filter(longitude <= ny_lon_max)
            .filter(longitude >= ny_lon_min))

In [7]:
from pyspark.sql.functions import monotonically_increasing_id
def df_combiner(df_fare, df_data):
    """Join the fare and trip DataFrames on a generated row id.

    Each input receives an ``id`` column from ``monotonically_increasing_id()``;
    the two are then outer-joined on that column, which is removed from the
    result.

    NOTE(review): per the Spark documentation these ids are unique and
    increasing but not consecutive, and they depend on how each DataFrame is
    partitioned. Rows are therefore paired correctly only if both inputs have
    the same partition layout and row order. If rows were filtered out of one
    input beforehand, positions shift — verify that fares still line up with
    their trips.

    Args:
        df_fare: DataFrame holding the fare columns.
        df_data: DataFrame holding the trip columns.

    Returns:
        The outer join of both inputs, without the temporary ``id`` column.
    """
    row_id = "id"
    fare_with_id = df_fare.withColumn(row_id, monotonically_increasing_id())
    data_with_id = df_data.withColumn(row_id, monotonically_increasing_id())
    joined = fare_with_id.join(data_with_id, on=row_id, how="outer")
    return joined.drop(row_id)

In [12]:
# Input and output folders, relative to the working directory.
path_trip = "datasets/trip_data/"
path_fare = "datasets/trip_fare/"
path_combined = "datasets/trip_curated/"

# Listing of the fare folder; not used by the loop below, which builds the
# file names from the file number instead.
fare_files = os.listdir(path_fare)

# Create the output folder up front: without it the final to_csv call fails,
# and only after the expensive Spark job and toPandas() have already run.
os.makedirs(path_combined, exist_ok=True)

# The directory listing is only used to know how many trip files there are;
# files are addressed as trip_data_<n>.csv / trip_fare_<n>.csv.
# The loop starts at idx 1 (file number 2), so file number 1 is never
# processed here. NOTE(review): confirm that this is intended.
for idx in range(1, len(os.listdir(path_trip))):
    file_number = str(idx + 1)
    trip_file = path_trip + "trip_data_" + file_number + ".csv"
    fare_file = path_fare + "trip_fare_" + file_number + ".csv"
    curated_file = path_combined + "trip_curated_" + file_number + ".csv"

    print("start: " + trip_file)
    df_curated = df_combiner(df_maker_fare(fare_file), df_maker_data(trip_file))

    # toPandas() collects the whole joined result into driver memory.
    print("toPandas")
    df_curated = df_curated.toPandas()

    print("to_csv")
    df_curated.to_csv(curated_file)
    

start: datasets/trip_data/trip_data_2.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_3.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_4.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_5.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_6.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_7.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_8.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_9.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_10.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_11.csv
toPandas
to_csv
start: datasets/trip_data/trip_data_12.csv
toPandas
to_csv
