# Data cleaning

In [106]:
# Imports go here
import os
from tabulate import tabulate
import csv
import glob
import pandas as pd
import os 
import shutil
import datetime
import geopandas as gpd
from datetime import date
from datetime import datetime
from pyspark.sql.functions import col, lit
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as f
import pyspark.sql.types
from pyspark.sql import Row
from shutil import copyfile
from shapely.geometry import Point
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, DateType, IntegerType, BooleanType, TimestampType, FloatType, DoubleType
from pyspark.sql.types import LongType, StringType, StructType, StructField
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=3g  pyspark-shell"
from pyspark.sql import SparkSession
try: 
    spark
    print("Spark application already started. Terminating existing application and starting new one")
    spark.stop()
except: 
    pass
# Create a new spark session (note, the * indicates to use all available CPU cores)
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("H600 L-Group") \
    .getOrCreate()
#When dealing with RDDs, we work the sparkContext object. See https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkContext
sc=spark.sparkContext
#in local mode, you will be able to access the Spark GUI at http://localhost:4040

Spark application already started. Terminating existing application and starting new one


In [195]:
pip install tabulate

Note: you may need to restart the kernel to use updated packages.


### Reminder

• Yellow taxi records are records that record trip information of New York's famous yellow
taxi cars.

• Green taxi records are records that record trip information by so-called 'boro' taxis a
newer service introduced in August of 2013 to improve taxi service and availability in the
boroughs

• FHV records (short for 'For Hire Vehicles') record information from services that oer
for-hire vehicles (such as Uber, Lyft, Via, and Juno), but also luxury limousine bases.

• High volume FHV (FHVHV for short) are FHV records oered by services that make
more than 10,000 trips per day

## Auxiliary functions

In [180]:
def create_files_list(path, brand, list_files):
    global nb_files
    nb_files = 0
    for file in glob.glob("%s/%s/*.csv" %(path,brand)):
        nb_files = nb_files+1
        # Save in list the files name
        list_files.append(file)
        # Order by date the file list
        list_files.sort()

    return list_files, nb_files


from pyspark.sql.functions import *
def total_amount_comp(df):
    df = df.withColumn('total_amount', f.when(col('total_amount')==0, col('fare_amount')+col('mta_tax')+col('tip_amount')+0.29).otherwise('total_amount'))
    return df

## 1. Cleaning the FHV dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|dispatching_base_num|License Number of the base that dispatched the trip|String||
|pickup_datetime|The date and time of the trip pick-up|Datetime||
|dropoff_datetime|The date and time of the trip dropoff|Datetime||
|PULocationID|Zone in which the trip began|Integer|Not Null|
|DOLocationID|Zone in which the trip ended|Integer||
|sr_flag|Indicates if the trip was a part of a shared ride chain offered by a High Volume FHV company (e.g. Uber Pool, Lyft Line); share=1, nonshared=0|Boolean|| 

### Validity rules

In T4, the analysis will rely mainly on 'PULocationID'. Therefore, we want that column to be not null and under good format.
The columns 'pickup-datetime' and 'dropoff_datetime' will aslo be used. But as there is a substantial gap in values (between 2015-01 and 2016-12) for the second one, we decided to remove the constraint 'Not Null' for that one.

Action to be taken on fhv files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency

### Identifying dirty records, data repairing

In [None]:
column_list=['pickup_datetime','dropoff_datetime','PULocationID']
list_files_fhv = []
path="data/integrated"
create_files_list(path,"fhv",list_files_fhv)
fhv_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_fhv))

nb_r_step1=fhv_DF.count()
nb_c_step1=len(fhv_DF.columns)
print("1.Number of records at this stage:",nb_r_step1, nb_c_step1)

#remove useless columns
print("-Removing useless columns")
fhv_DF= fhv_DF.select('pickup_datetime','dropoff_datetime','PULocationID')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
fhv_DF=fhv_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values (PULocationID)")
#fhv_DF= fhv_DF.na.drop()
fhv_DF = fhv_DF.filter(fhv_DF.PULocationID. isNotNull())
print("---DONE---")

nb_r_step2=fhv_DF.count()
nb_c_step2=len(fhv_DF.columns)
print("2.Number of records at this stage:",nb_r_step2, nb_c_step2)

#remove duplicates
print("-Removing duplicate values")
fhv_DF= fhv_DF.dropDuplicates(column_list)
print("---DONE---")

nb_r_step3=fhv_DF.count()
nb_c_step3=len(fhv_DF.columns)
print("3.Number of records at this stage:",nb_r_step3, nb_c_step3)

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
fhv_DF = fhv_DF.where(col("PULocationid").isin(locID))
print("---DONE---")

nb_r_step4=fhv_DF.count()
nb_c_step4=len(fhv_DF.columns)
print("4.Number of records at this stage:",nb_r_step4, nb_c_step4)

print("-Changing DataTypes")
fhv_DF = fhv_DF.withColumn("dropoff_datetime",fhv_DF["dropoff_datetime"].cast(TimestampType()))
print("---DONE---")

#remove dates out of the range solely for pickup_datetime (as we there is a substantial gap in value for dropoff_datetime) )
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2015-01-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
fhv_DF = fhv_DF.filter(f.col("pickup_datetime").between(date_in,date_out))
#                    .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

nb_r_step5=fhv_DF.count()
nb_c_step5=len(fhv_DF.columns)
print("5.Number of records at this stage:",nb_r_step5, nb_c_step5)

print("Final Schema:")
fhv_DF.printSchema()

print(tabulate([
    ['Removing useless columns', 0,nb_c_step2-nb_c_step1],
    ['Removing rows with null values', nb_r_step2-nb_r_step1,0],
    ['Removing duplicate values', nb_r_step3-nb_r_step2,0],
    ['Removing rows with wrong locID', nb_r_step4-nb_r_step3,0],
    ['Removing rows with wrong dates', nb_r_step5-nb_r_step4,0]
], 
               
               headers=['Actions', 'Rows','Columns']))

1.Number of records at this stage: 1389608 6
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values (PULocationID)
---DONE---
2.Number of records at this stage: 1107322 3
-Removing duplicate values
---DONE---


In [55]:
fhv_DF = fhv_DF.toPandas().to_csv('data/cleaned/fhv/fhv_cleaned.csv', index = False)

## 2. Cleaning the FHVHV dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|hvfhs_license_num|TLC license number of the HVFHS base or business|String||
|dispatching_base_num|License Number of the base that dispatched the trip|String||
|pickup_datetime|The date and time of the trip pick-up|Datetime|Not Null|
|dropoff_datetime|The date and time of the trip dropoff|Datetime|Not Null|
|PULocationID|Zone in which the trip began|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the trip ended|Integer(smallint)|Not Null|
|sr_flag|Indicates if the trip was a part of a shared ride chain offered by a High Volume FHV company (e.g. Uber Pool, Lyft Line); share=1, nonshared=0|Boolean|| 


*hvfhs_license_num possible values:
• HV0002: Juno
• HV0003: Uber
• HV0004: Via
• HV0005: Lyft

### Validity rules

In T4, the analysis will rely mainly on 'pickup_datetime','dropoff_datetime','PULocationID',DULocationID. Therefore, we want these columns to be not null and under good format.

Action to be taken on fhvhv files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency

### Identifying dirty records, data repairing

In [210]:
column_list=['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID']
list_files_fhvhv = []
path="data/integrated"
create_files_list(path,"fhvhv",list_files_fhvhv)
fhvhv_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_fhvhv))

nb_r_step1=fhvhv_DF.count()
nb_c_step1=len(fhvhv_DF.columns)
print("1.Number of records at this stage:",nb_r_step1, nb_c_step1)

#remove useless columns
print("-Removing useless columns")
fhvhv_DF= fhvhv_DF.select('pickup_datetime','dropoff_datetime','PULocationID','DOLocationID')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
fhvhv_DF=fhvhv_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values (PULocationID)")
fhvhv_DF = fhvhv_DF.filter(fhvhv_DF.PULocationID. isNotNull())
#fhvhv_DF= fhvhv_DF.na.drop()
print("---DONE---")

nb_r_step2=fhvhv_DF.count()
nb_c_step2=len(fhvhv_DF.columns)
print("2.Number of records at this stage:",nb_r_step2, nb_c_step2)

#remove duplicates
print("-Removing duplicate values")
fhvhv_DF= fhvhv_DF.dropDuplicates(column_list)
print("---DONE---")

nb_r_step3=fhvhv_DF.count()
nb_c_step3=len(fhvhv_DF.columns)
print("3.Number of records at this stage:",nb_r_step3, nb_c_step3)

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
fhvhv_DF = fhvhv_DF.where(col("PULocationid").isin(locID))
fhvhv_DF = fhvhv_DF.where(col("DOLocationID").isin(locID))
print("---DONE---")

nb_r_step4=fhvhv_DF.count()
nb_c_step4=len(fhvhv_DF.columns)
print("4.Number of records at this stage:",nb_r_step5, nb_c_step5)

print("-Changing DataTypes")
fhvhv_DF = fhvhv_DF.withColumn("dropoff_datetime",fhvhv_DF["dropoff_datetime"].cast(TimestampType()))
print("---DONE---")

#remove dates out of the range solely for pickup_datetime,dropoff_datetime
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2019-02-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
fhvhv_DF = fhvhv_DF.filter(f.col("pickup_datetime").between(date_in,date_out))\
                .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

nb_r_step5=fhvhv_DF.count()
nb_c_step5=len(fhvhv_DF.columns)
print("5.Number of records at this stage:",nb_r_step5, nb_c_step5)

print("Final Schema:")
fhvhv_DF.printSchema()

from tabulate import tabulate
print(tabulate([
    ['Removing useless columns', 0,nb_c_step2-nb_c_step1],
    ['Removing rows with null values', nb_r_step2-nb_r_step1,0],
    ['Removing duplicate values', nb_r_step3-nb_r_step2,0],
    ['Removing rows with wrong locID', nb_r_step4-nb_r_step3,0],
    ['Removing rows with wrong dates', nb_r_step5-nb_r_step4,0]
], 
               
               headers=['Actions', 'Rows','Columns']))

1.Number of records at this stage: 321819 7
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values (PULocationID)
---DONE---
2.Number of records at this stage: 321819 4
-Removing duplicate values
---DONE---
3.Number of records at this stage: 321819 4
-Removing rows with wrong locID
---DONE---
4.Number of records at this stage: 153367 8
-Changing DataTypes
---DONE---
-Removing rows with wrong dates
---DONE---
5.Number of records at this stage: 311867 4
Final Schema:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)

Actions                           Rows    Columns
------------------------------  ------  ---------
Removing useless columns             0         -3
Removing rows with null values       0          0
Removing duplicate values            0          0
Removing rows with wrong

In [8]:
fhvhv_DF = fhvhv_DF.toPandas().to_csv('data/cleaned/fhvhv/fhvhv_cleaned.csv', index = False)

AttributeError: 'NoneType' object has no attribute 'toPandas'

## 3. Cleaning the GREEN dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|vendorid|A code indicating the LPEP provider that provided the record.|Integer(tinyint)||
|pickup_datetime|The date and time when the meter was engaged|Datetime|Not Null|
|dropoff_datetime|The date and time when the meter was disengaged|Datetime|Not Null|
|passenger_count|The number of passengers in the vehicle|Integer(tinyint)|Not Null|
|trip_distance|The elapsed trip distance in miles reported by the taximeter|Decimal|Not Null|
|PULocationID|Zone in which the taximeter was engaged|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the taximeter was disengaged|Integer(smallint)|Not Null|
|rateCodeID|The final rate code in effect at the end of the trip|Integer(tinyint)|1 to 6|
|store_and_fwd_flag|This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,”because the vehicle did not have a connection to the server|Boolean|Y or N|
|payment_type|A numeric code signifying how the passenger paid for the trip|Integer(tinyint)|1 to 6|
|fare_amount|The time-and-distance fare calculated by the meter|Decimal||
|extra|Miscellaneous extras and surcharges|Decimal||
|mta_tax|0.50 mta tax that is automatically triggered based on the metered rate in use|Decimal|   	|
|Improvement_surcharge|0.30 improvement surcharge assessed on hailed trips at the flag drop|Decimal|   	|
|tip_amount|tip amount – This field is automatically populated for credit card tips. Cash tips are not included|Decimal|Not Null|
|tolls_amount|total amount of all tolls paid in trip|Decimal|   	|
|total_amount|The total amount charged to passengers|Decimal|Not Null|
|trip_type|A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver|Boolean|1 or 2|

### Validity rules

In T4, the analysis will rely mainly on 'pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','passenger_count','trip_distance','tip_amount','total_amount'. Therefore, we want these columns to be not null and under good format.

Action to be taken on green files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency
- check and remove outliers in numeric columns
- change negative values into positive one
- remove trip with no passenger
- compute total_amount

### Identifying dirty records, data repairing

In [209]:
#here all columns that must be NOT NULL
column_list=['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','passenger_count','trip_distance','tip_amount','total_amount']
list_files_green = []
path="data/integrated"
create_files_list(path,"green",list_files_green)
green_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_green))

nb_r_step1=green_DF.count()
nb_c_step1=len(green_DF.columns)
print("1.Number of records at this stage:",nb_r_step1, nb_c_step1)

#compute total amount
print("-Computing total_amount")
total_amount_comp(green_DF)
print("---DONE---")

#remove useless columns
print("-Removing useless columns")
green_DF= green_DF.select('pickup_datetime','dropoff_datetime','passenger_count','trip_distance','PULocationID','DOLocationID','tip_amount','total_amount')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
green_DF=green_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values(PULocationID & trip_distance)")
#green_DF= green_DF.na.drop("pickup_datetime")
green_DF = green_DF.filter(green_DF.PULocationID. isNotNull())
green_DF = green_DF.filter(green_DF.trip_distance. isNotNull())
print("---DONE---")

nb_r_step2=green_DF.count()
nb_c_step2=len(green_DF.columns)
print("2.Number of records at this stage:",nb_r_step2, nb_c_step2)

#remove duplicates
print("-Removing duplicate values")
green_DF= green_DF.dropDuplicates(column_list)
print("---DONE---")

nb_r_step3=green_DF.count()
nb_c_step3=len(green_DF.columns)
print("3.Number of records at this stage:",nb_r_step3, nb_c_step3)

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
green_DF = green_DF.where(col("PULocationID").isin(locID))
#green_DF = green_DF.where(col("DOLocationID").isin(locID))
print("---DONE---")

nb_r_step4=green_DF.count()
nb_c_step4=len(green_DF.columns)
print("4.Number of records at this stage:",nb_r_step4, nb_c_step4)

#remove dates out of the range solely for pickup_datetime,dropoff_datetime
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2013-08-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
green_DF = green_DF.filter(f.col("pickup_datetime").between(date_in,date_out))\
                .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

nb_r_step5=green_DF.count()
nb_c_step5=len(green_DF.columns)
print("5.Number of records at this stage:",nb_r_step5, nb_c_step5)

#remove trip with no passenger
print("-Removing rows with no passenger")
green_DF = green_DF[green_DF['passenger_count']!=0]
print("---DONE---")

nb_r_step6=green_DF.count()
nb_c_step6=len(green_DF.columns)
print("6.Number of records at this stage:",nb_r_step6, nb_c_step6)

#remove trip with 0 as trip distance
print("-Removing rows with 0 as trip distance")
green_DF = green_DF[green_DF['trip_distance']!=0]
print("---DONE---")

nb_r_step7=green_DF.count()
nb_c_step7=len(green_DF.columns)
print("7.Number of records at this stage:",nb_r_step7, nb_c_step7)

#absolute value for negative value
print("-Changing negative values into positive ones")
from  pyspark.sql.functions import abs
green_DF = green_DF.withColumn('passenger_count',abs(green_DF.passenger_count))
green_DF = green_DF.withColumn('trip_distance',abs(green_DF.trip_distance))
green_DF = green_DF.withColumn('tip_amount',abs(green_DF.tip_amount))
green_DF = green_DF.withColumn('total_amount',abs(green_DF.total_amount))
print("---DONE---")

print("Final Schema:")
green_DF.printSchema()

from tabulate import tabulate
print(tabulate([
    ['Removing useless columns', 0,nb_c_step2-nb_c_step1],
    ['Removing rows with null values', nb_r_step2-nb_r_step1,0],
    ['Removing duplicate values', nb_r_step3-nb_r_step2,0],
    ['Removing rows with wrong locID', nb_r_step4-nb_r_step3,0],
    ['Removing rows with wrong dates', nb_r_step5-nb_r_step4,0],
    ['Removing rows with no passenger', nb_r_step6-nb_r_step5,0],
    ['Removing rows with 0 as trip distance', nb_r_step7-nb_r_step6,0], 
], 
               
               headers=['Actions', 'Rows','Columns']))

1.Number of records at this stage: 154014 20
-Computing total_amount
---DONE---
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values(PULocationID & trip_distance)
---DONE---
2.Number of records at this stage: 153872 8
-Removing duplicate values
---DONE---
3.Number of records at this stage: 153872 8
-Removing rows with wrong locID
---DONE---
4.Number of records at this stage: 153368 8
-Removing rows with wrong dates
---DONE---
5.Number of records at this stage: 153367 8
-Removing rows with no passenger
---DONE---
6.Number of records at this stage: 152785 8
-Removing rows with 0 as trip distance
---DONE---
7.Number of records at this stage: 150756 8
-Changing negative values into positive ones
---DONE---
Final Schema:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- PULocation

In [None]:
green_DF = green_DF.toPandas().to_csv('data/cleaned/green/green_cleaned.csv', index = False)

## 4. Cleaning the YELLOW dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|vendorid|A code indicating the LPEP provider that provided the record.|Integer(tinyint)|1 or 2|
|pickup_datetime|The date and time when the meter was engaged|Datetime|Not Null|
|dropoff_datetime|The date and time when the meter was disengaged|Datetime||
|passenger_count|The number of passengers in the vehicle|Integer(tinyint)|Not Null|
|trip_distance|The elapsed trip distance in miles reported by the taximeter|Decimal|   	|
|PULocationID|Zone in which the taximeter was engaged|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the taximeter was disengaged|Integer(smallint)||
|rateCodeID|The final rate code in effect at the end of the trip|Integer(tinyint)|1 to 6|
|store_and_fwd_flag|This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,”because the vehicle did not have a connection to the server|Boolean|Y or N|
|payment_type|A numeric code signifying how the passenger paid for the trip|Integer(tinyint)|1 to 6|
|fare_amount|The time-and-distance fare calculated by the meter|Decimal||
|extra|Miscellaneous extras and surcharges|Decimal|   	|
|mta_tax|0.50 mta tax that is automatically triggered based on the metered rate in use|Decimal|   	|
|Improvement_surcharge|0.30 improvement surcharge assessed on hailed trips at the flag drop|Decimal|   	|
|tip_amount|tip amount – This field is automatically populated for credit card tips. Cash tips are not included|Decimal|   	|
|tolls_amount|total amount of all tolls paid in trip|Decimal|   	|
|total_amount|The total amount charged to passengers|Decimal|   	|

### Validity rules

In T4, the analysis will rely mainly on 'pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','passenger_count','trip_distance','tip_amount','total_amount'. Therefore, we want these columns to be not null and under good format.

Action to be taken on yellow files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency
- check and remove outliers in numeric columns
- change negative values into positive one
- remove trip with no passenger
- compute total_amount
- remove rows with trip_distance=0

### Identifying dirty records, data repairing

In [208]:
#here all columns that must be NOT NULL
column_list=['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','passenger_count','trip_distance','tip_amount','total_amount']
list_files_yellow = []
path="data/integrated"
create_files_list(path,"green",list_files_yellow)
yellow_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_yellow))

nb_r_step1=yellow_DF.count()
nb_c_step1=len(yellow_DF.columns)
print("1.Number of records at this stage:",nb_r_step1, nb_c_step1)

#compute total amount
print("-Computing total_amount")
total_amount_comp(yellow_DF)
print("---DONE---")

#remove useless columns
print("-Removing useless columns")
yellow_DF= yellow_DF.select('pickup_datetime','dropoff_datetime','passenger_count','trip_distance','PULocationID','DOLocationID','tip_amount','total_amount')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
yellow_DF=yellow_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values (PULocationID & trip_distance)")
#green_DF= green_DF.na.drop("pickup_datetime")
yellow_DF = yellow_DF.filter(yellow_DF.PULocationID. isNotNull())
yellow_DF = yellow_DF.filter(yellow_DF.trip_distance. isNotNull())
print("---DONE---")

nb_r_step2=yellow_DF.count()
nb_c_step2=len(yellow_DF.columns)
print("2.Number of records at this stage:",nb_r_step2, nb_c_step2)

#remove duplicates
print("-Removing duplicate values")
yellow_DF= yellow_DF.dropDuplicates(column_list)
print("---DONE---")

nb_r_step3=yellow_DF.count()
nb_c_step3=len(yellow_DF.columns)
print("3.Number of records at this stage:",nb_r_step3, nb_c_step3)

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
yellow_DF = yellow_DF.where(col("PULocationid").isin(locID))
#yellow_DF = yellow_DF.where(col("DOLocationID").isin(locID))
print("---DONE---")

nb_r_step4=yellow_DF.count()
nb_c_step4=len(yellow_DF.columns)
print("4.Number of records at this stage:",nb_r_step4, nb_c_step4)

#remove dates out of the range solely for pickup_datetime,dropoff_datetime
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2009-01-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
yellow_DF = yellow_DF.filter(f.col("pickup_datetime").between(date_in,date_out))\
                .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

nb_r_step5=yellow_DF.count()
nb_c_step5=len(yellow_DF.columns)
print("5.Number of records at this stage:",nb_r_step5, nb_c_step5)

#remove trip with no passenger
print("-Removing rows with no passenger")
yellow_DF = yellow_DF[yellow_DF['passenger_count']!=0]
print("---DONE---")

nb_r_step6=yellow_DF.count()
nb_c_step6=len(yellow_DF.columns)
print("6.Number of records at this stage:",nb_r_step6, nb_c_step6)

#remove trip with 0 as trip distance
print("-Removing rows with 0 as trip distance")
yellow_DF = yellow_DF[yellow_DF['trip_distance']!=0]
print("---DONE---")

nb_r_step7=yellow_DF.count()
nb_c_step7=len(yellow_DF.columns)
print("7.Number of records at this stage:",nb_r_step7, nb_c_step7)

#absolute value for negative value
print("-Changing negative values into positive ones")
from  pyspark.sql.functions import abs
yellow_DF = yellow_DF.withColumn('passenger_count',abs(yellow_DF.passenger_count))
yellow_DF = yellow_DF.withColumn('trip_distance',abs(yellow_DF.trip_distance))
yellow_DF = yellow_DF.withColumn('tip_amount',abs(yellow_DF.tip_amount))
yellow_DF = yellow_DF.withColumn('total_amount',abs(yellow_DF.total_amount))
print("---DONE---")

print("Final Schema:")
yellow_DF.printSchema()

from tabulate import tabulate
print(tabulate([
    ['Removing useless columns', 0,nb_c_step2-nb_c_step1],
    ['Removing rows with null values', nb_r_step2-nb_r_step1,0],
    ['Removing duplicate values', nb_r_step3-nb_r_step2,0],
    ['Removing rows with wrong locID', nb_r_step4-nb_r_step3,0],
    ['Removing rows with wrong dates', nb_r_step5-nb_r_step4,0],
    ['Removing rows with no passenger', nb_r_step6-nb_r_step5,0],
    ['Removing rows with 0 as trip distance', nb_r_step7-nb_r_step6,0], 
], 
               
               headers=['Actions', 'Rows','Columns']))

1.Number of records at this stage: 154014 20
-Computing total_amount
---DONE---
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values (PULocationID & Trip_distance)
---DONE---
2.Number of records at this stage: 153872 8
-Removing duplicate values
---DONE---
3.Number of records at this stage: 153872 8
-Removing rows with wrong locID
---DONE---
4.Number of records at this stage: 153368 8
-Removing rows with wrong dates
---DONE---
5.Number of records at this stage: 153368 8
-Removing rows with no passenger
---DONE---
6.Number of records at this stage: 152786 8
-Removing rows with 0 as trip distance
---DONE---
7.Number of records at this stage: 150757 8
-Changing negative values into positive ones
---DONE---
Final Schema:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- Passenger_count: double (nullable = true)
 |-- Trip_distance: double (nullable = true)
 |-- PULocatio