# Data cleaning

In [10]:
# Imports go here
import os
import csv
import glob
import pandas as pd
import os 
import shutil
import datetime
import geopandas as gpd
from datetime import date
from datetime import datetime
from pyspark.sql.functions import col, lit
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as f
import pyspark.sql.types
from pyspark.sql import Row
from shutil import copyfile
from shapely.geometry import Point
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, DateType, IntegerType, BooleanType, TimestampType, FloatType
from pyspark.sql.types import LongType, StringType, StructType, StructField
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=3g  pyspark-shell"
from pyspark.sql import SparkSession
try: 
    spark
    print("Spark application already started. Terminating existing application and starting new one")
    spark.stop()
except: 
    pass
# Create a new spark session (note, the * indicates to use all available CPU cores)
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("H600 L-Group") \
    .getOrCreate()
#When dealing with RDDs, we work the sparkContext object. See https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkContext
sc=spark.sparkContext
#in local mode, you will be able to access the Spark GUI at http://localhost:4040

Spark application already started. Terminating existing application and starting new one


## Auxiliary functions

### Function: Create list 

In [2]:
def create_files_list(path, brand, list_files):
    """
    This function create the files list of specify taxi brand (brand) from the specify folder (path). 
    
    Input: the path where are the files -> /data/cleaned or data/sampled
           the name of the taxi company -> fhv, fhvfh, green, yellow
           the empty file name list in which each file will be append
    Output: number of files in the list and the list of files name.
    """  
    global nb_files
    nb_files = 0
    for file in glob.glob("%s/%s/*.csv" %(path,brand)):
        nb_files = nb_files+1
        # Save in list the files name
        list_files.append(file)
        # Order by date the file list
        list_files.sort()

    return list_files, nb_files



### Function: outliers

In [96]:
def calculate_bounds(df):
    bounds = {
        c: dict(
            zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0))
        )
        for c,d in zip(df.columns, df.dtypes) if d[1] == "double"
    }

    for c in bounds:
        iqr = bounds[c]['q3'] - bounds[c]['q1']
        bounds[c]['min'] = bounds[c]['q1'] - (iqr * 1.5)
        bounds[c]['max'] = bounds[c]['q3'] + (iqr * 1.5)

    return bounds

    d

calculate_bounds(green_DF)
calculate_bounds(yellow_DF)

{'Passenger_count': {'q1': 1.0, 'q3': 1.0, 'min': 1.0, 'max': 1.0},
 'Trip_distance': {'q1': 1.02, 'q3': 3.55, 'min': -2.775, 'max': 7.345},
 'Tip_amount': {'q1': 0.0, 'q3': 1.86, 'min': -2.79, 'max': 4.65},
 'Total_amount': {'q1': 8.15,
  'q3': 17.8,
  'min': -6.325000000000001,
  'max': 32.275000000000006}}

In [152]:
def flag_outliers(df, id_col):
    bounds = calculate_bounds(df)
    outliers = {}

    return df.select(c, id_col,
            *[
                f.when(
                    ~f.col(c).between(bounds[c]['min'], bounds[c]['max']),
                    "yes"
                ).otherwise("no").alias(c+'_outlier')
            ]
        )

In [143]:
# handle negative values
from pyspark.sql.functions import abs
def abs_neg_val(df, feature):
    #for each columns listed as an input, we drop the rows that have negative values
    df = df.withColumn('feature',abs(df.feature))
    return df



#yellow_DF = yellow_DF.withColumn("only_positive", f.when(f.col("Tip_amount") > 0, f.col("Tip_amount")).otherwise('null'))
#yellow_DF = yellow_DF.withColumn("only_positive", f.when(f.col("Total_amount") > 0, f.col("Total_amount")).otherwise('null'))
#yellow_DF = yellow_DF.withColumn("only_positive", f.when(f.col("Tip_amount") > 0, f.col("Tip_amount")).otherwise('null'))

In [153]:
#abs_neg_val(yellow_DF, 'Tip_amount')
#calculate_bounds(yellow_DF)
from pyspark.sql.functions import abs
# calculate absolute value for some columns
print ("Calculating abs for numeric columns")
yellow_DF = yellow_DF.withColumn('Trip_distance',abs(yellow_DF.Trip_distance))\
                    .withColumn('Total_amount',abs(yellow_DF.Total_amount))\
                    .withColumn('Tip_amount',abs(yellow_DF.Tip_amount))
print("---DONE---")
#calculate_bounds(yellow_DF)
yellow_DF.where(col('Trip_distance')<0).show()

Calculating abs for numeric columns
---DONE---
+-------------------+-------------------+---------------+-------------+------------+------------+----------+------------+-------------+
|    pickup_datetime|   dropoff_datetime|Passenger_count|Trip_distance|PULocationID|DOLocationID|Tip_amount|Total_amount|only_positive|
+-------------------+-------------------+---------------+-------------+------------+------------+----------+------------+-------------+
|2016-07-24 16:37:56|2016-07-24 16:48:14|            1.0|         2.54|          66|         144|      2.26|       13.56|         2.26|
|2016-07-20 07:46:43|2016-07-20 07:59:04|            1.0|         2.59|         152|          75|       0.0|        11.8|         null|
|2016-07-26 22:15:21|2016-07-26 22:40:10|            1.0|         8.39|          42|          79|       0.0|        27.8|         null|
|2016-07-01 16:44:20|2016-07-01 16:47:47|            1.0|         0.58|         145|         145|       0.0|         6.3|         null|
|

In [119]:
# handle negative values
def handle_no_passenger(df):
    #We create two dataframes, one with only the trips with no passengers, the other with passengers
    no_pass_condition = df[df['passenger_count'] == 0].index
    df_full = df.drop(no_pass_condition)
    df_empty = df[df['passenger_count']==0]
    
    return df_empty, df_full
handle_no_passenger(yellow_DF)

AttributeError: 'DataFrame' object has no attribute 'index'

In [18]:
green_DF_empty = green_DF[green_DF['passenger_count']!=0]
green_DF_empty.show()
print("Number of records at this stage:",(green_DF_empty.count(), len(green_DF_empty.columns)))

+-------------------+-------------------+---------------+-------------+------------+------------+----------+------------+
|    pickup_datetime|   dropoff_datetime|Passenger_count|Trip_distance|PULocationID|DOLocationID|Tip_amount|Total_amount|
+-------------------+-------------------+---------------+-------------+------------+------------+----------+------------+
|2016-07-24 16:37:56|2016-07-24 16:48:14|            1.0|         2.54|          66|         144|      2.26|       13.56|
|2016-07-20 07:46:43|2016-07-20 07:59:04|            1.0|         2.59|         152|          75|       0.0|        11.8|
|2016-07-26 22:15:21|2016-07-26 22:40:10|            1.0|         8.39|          42|          79|       0.0|        27.8|
|2016-07-01 16:44:20|2016-07-01 16:47:47|            1.0|         0.58|         145|         145|       0.0|         6.3|
|2016-07-23 02:52:49|2016-07-23 03:02:13|            1.0|         2.44|          41|         244|      3.39|       14.69|
|2016-07-28 19:00:04|201

• Yellow taxi records are records that record trip information of New York's famous yellow
taxi cars.

• Green taxi records are records that record trip information by so-called 'boro' taxis a
newer service introduced in August of 2013 to improve taxi service and availability in the
boroughs

• FHV records (short for 'For Hire Vehicles') record information from services that oer
for-hire vehicles (such as Uber, Lyft, Via, and Juno), but also luxury limousine bases.

• High volume FHV (FHVHV for short) are FHV records oered by services that make
more than 10,000 trips per day

## 1. Cleaning the FHV dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|Dispatching_base_num|License Number of the base that dispatched the trip|String||
|Pickup_datetime|The date and time of the trip pick-up|Datetime|Not Null|
|DropOff_datetime|The date and time of the trip dropoff|Datetime||
|PULocationID|Zone in which the trip began|Integer|Not Null|
|DOLocationID|Zone in which the trip ended|Integer||
|SR_Flag|Indicates if the trip was a part of a shared ride chain offered by a High Volume FHV company (e.g. Uber Pool, Lyft Line); share=1, nonshared=0|Boolean|| 

### Validity rules

In T4, the analysis will rely mainly on 'pickup_datetime','PULocationID'. Therefore, we want these columns to be not null and under good format.
The column 'dropoff_datetime' will aslo be used. But as there is a substantial gap in values (between 2015-01 and 2016-12), we decided to remove the constraint 'Not Null' for that one.

Action to be taken on fhv files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency

### Identifying dirty records, data repairing

In [3]:
column_list=['pickup_datetime','dropoff_datetime','PULocationID']
list_files_fhv = []
path="data/cleaned"
create_files_list(path,"fhv",list_files_fhv)
fhv_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_fhv))

print("Number of records at this stage:",(fhv_DF.count(), len(fhv_DF.columns)))

#remove useless columns
print("-Removing useless columns")
fhv_DF= fhv_DF.select('pickup_datetime','dropoff_datetime','PULocationID')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
fhv_DF=fhv_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values")
fhv_DF= fhv_DF.na.drop()
print("---DONE---")

print("Number of records at this stage:",(fhv_DF.count(), len(fhv_DF.columns)))

#remove duplicates
print("-Removing duplicate values")
fhv_DF= fhv_DF.dropDuplicates(column_list)
print("---DONE---")

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
fhv_DF = fhv_DF.where(col("PULocationid").isin(locID))
print("---DONE---")

print("Number of records at this stage:",(fhv_DF.count(), len(fhv_DF.columns)))


print("-Changing DataTypes")
fhv_DF = fhv_DF.withColumn("dropoff_datetime",fhv_DF["dropoff_datetime"].cast(TimestampType()))
print("---DONE---")

#remove dates out of the range solely for pickup_datetime (as we there is a substantial gap in value for dropoff_datetime) )
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2015-01-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
fhv_DF = fhv_DF.filter(f.col("pickup_datetime").between(date_in,date_out))
#                    .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

print("Number of records at this stage:",(fhv_DF.count(), len(fhv_DF.columns)))

print("Final Schema:")
fhv_DF.printSchema()

Number of records at this stage: (360447, 3)
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values
---DONE---
Number of records at this stage: (233412, 3)
-Removing duplicate values
---DONE---
-Removing rows with wrong locID
---DONE---
Number of records at this stage: (233412, 3)
-Changing DataTypes
---DONE---
-Removing rows with wrong dates
---DONE---
Number of records at this stage: (233412, 3)
Final Schema:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: double (nullable = true)



In [11]:
fhv_DF = fhv_DF.toPandas().to_csv('data/cleaned/fhv/fhv_cleaned.csv', index = False)

AttributeError: 'NoneType' object has no attribute 'toPandas'

## 2. Cleaning the FHVHV dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|Hvfhs_license_num|TLC license number of the HVFHS base or business|String||
|Dispatching_base_num|License Number of the base that dispatched the trip|String||
|Pickup_datetime|The date and time of the trip pick-up|Datetime|Not Null|
|DropOff_datetime|The date and time of the trip dropoff|Datetime|Not Null|
|PULocationID|Zone in which the trip began|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the trip ended|Integer(smallint)|Not Null|
|SR_Flag|Indicates if the trip was a part of a shared ride chain offered by a High Volume FHV company (e.g. Uber Pool, Lyft Line); share=1, nonshared=0|Boolean|| 


*Hvfhs_license_num possible values:
• HV0002: Juno
• HV0003: Uber
• HV0004: Via
• HV0005: Lyft

### Validity rules

In T4, the analysis will rely mainly on 'pickup_datetime','dropoff_datetime','PULocationID',DULocationID. Therefore, we want these columns to be not null and under good format.

Action to be taken on fhvhv files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency

### Identifying dirty records, data repairing

In [5]:
column_list=['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID']
list_files_fhvhv = []
path="data/integrated"
create_files_list(path,"fhvhv",list_files_fhvhv)
fhvhv_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_fhvhv))

print("Number of records at this stage:",(fhvhv_DF.count(), len(fhvhv_DF.columns)))

#remove useless columns
print("-Removing useless columns")
fhvhv_DF= fhvhv_DF.select('pickup_datetime','dropoff_datetime','PULocationID','DOLocationID')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
fhvhv_DF=fhvhv_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values")
fhvhv_DF= fhvhv_DF.na.drop()
print("---DONE---")

print("Number of records at this stage:",(fhvhv_DF.count(), len(fhvhv_DF.columns)))

#remove duplicates
print("-Removing duplicate values")
fhvhv_DF= fhvhv_DF.dropDuplicates(column_list)
print("---DONE---")

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
fhvhv_DF = fhvhv_DF.where(col("PULocationid").isin(locID))
fhvhv_DF = fhvhv_DF.where(col("DOLocationID").isin(locID))
print("---DONE---")

print("Number of records at this stage:",(fhvhv_DF.count(), len(fhvhv_DF.columns)))


print("-Changing DataTypes")
fhvhv_DF = fhvhv_DF.withColumn("dropoff_datetime",fhvhv_DF["dropoff_datetime"].cast(TimestampType()))
print("---DONE---")

#remove dates out of the range solely for pickup_datetime,dropoff_datetime
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2019-02-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
fhvhv_DF = fhvhv_DF.filter(f.col("pickup_datetime").between(date_in,date_out))\
                .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

print("Number of records at this stage:",(fhvhv_DF.count(), len(fhvhv_DF.columns)))

print("Final Schema:")
fhvhv_DF.printSchema()

Number of records at this stage: (321819, 7)
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values
---DONE---
Number of records at this stage: (321819, 4)
-Removing duplicate values
---DONE---
-Removing rows with wrong locID
---DONE---
Number of records at this stage: (311874, 4)
-Changing DataTypes
---DONE---
-Removing rows with wrong dates
---DONE---
Number of records at this stage: (311867, 4)
Final Schema:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)



In [8]:
fhvhv_DF = fhvhv_DF.toPandas().to_csv('data/cleaned/fhvhv/fhvhv_cleaned.csv', index = False)

AttributeError: 'NoneType' object has no attribute 'toPandas'

## 3. Cleaning the GREEN dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|VendorID|A code indicating the LPEP provider that provided the record.|Integer(tinyint)||
|pickup_datetime|The date and time when the meter was engaged|Datetime|Not Null|
|dropoff_datetime|The date and time when the meter was disengaged|Datetime|Not Null|
|Passenger_count|The number of passengers in the vehicle|Integer(tinyint)|Not Null|
|Trip_distance|The elapsed trip distance in miles reported by the taximeter|Decimal|Not Null|
|PULocationID|Zone in which the taximeter was engaged|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the taximeter was disengaged|Integer(smallint)|Not Null|
|RateCodeID|The final rate code in effect at the end of the trip|Integer(tinyint)|1 to 6|
|Store_and_fwd_flag|This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,”because the vehicle did not have a connection to the server|Boolean|Y or N|
|Payment_type|A numeric code signifying how the passenger paid for the trip|Integer(tinyint)|1 to 6|
|Fare_amount|The time-and-distance fare calculated by the meter|Decimal||
|Extra|Miscellaneous extras and surcharges|Decimal||
|MTA_tax|0.50 MTA tax that is automatically triggered based on the metered rate in use|Decimal|   	|
|Improvement_surcharge|0.30 improvement surcharge assessed on hailed trips at the flag drop|Decimal|   	|
|Tip_amount|Tip amount – This field is automatically populated for credit card tips. Cash tips are not included|Decimal|Not Null|
|Tolls_amount|Total amount of all tolls paid in trip|Decimal|   	|
|Total_amount|The total amount charged to passengers|Decimal|Not Null|
|Trip_type|A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver|Boolean|1 or 2|

### Validity rules

In T4, the analysis will rely mainly on 'pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','Passenger_count','Trip_distance','Tip_amount','Total_amount'. Therefore, we want these columns to be not null and under good format.

Action to be taken on green files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency
- check and remove outliers in numeric columns
- change negative values into positive one
- remove trip with no passenger

### Identifying dirty records, data repairing

In [15]:
#here all columns that must be NOT NULL
column_list=['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','Passenger_count','Trip_distance','Tip_amount','Total_amount']
list_files_green = []
path="data/integrated"
create_files_list(path,"green",list_files_green)
green_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_green))

print("Number of records at this stage:",(green_DF.count(), len(green_DF.columns)))

#remove useless columns
print("-Removing useless columns")
green_DF= green_DF.select('pickup_datetime','dropoff_datetime','Passenger_count','Trip_distance','PULocationID','DOLocationID','Tip_amount','Total_amount')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
green_DF=green_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values")
green_DF= green_DF.na.drop()
print("---DONE---")

print("Number of records at this stage:",(green_DF.count(), len(green_DF.columns)))

#remove duplicates
print("-Removing duplicate values")
green_DF= green_DF.dropDuplicates(column_list)
print("---DONE---")

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
green_DF = green_DF.where(col("PULocationid").isin(locID))
green_DF = green_DF.where(col("DOLocationID").isin(locID))
print("---DONE---")

print("Number of records at this stage:",(green_DF.count(), len(green_DF.columns)))

#remove dates out of the range solely for pickup_datetime,dropoff_datetime
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2013-08-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
green_DF = green_DF.filter(f.col("pickup_datetime").between(date_in,date_out))\
                .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

print("Number of records at this stage:",(green_DF.count(), len(green_DF.columns)))

print("Final Schema:")
green_DF.printSchema()

Number of records at this stage: (154014, 20)
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values
---DONE---
Number of records at this stage: (63647, 8)
-Removing duplicate values
---DONE---
-Removing rows with wrong locID
---DONE---
Number of records at this stage: (63330, 8)
-Removing rows with wrong dates
---DONE---
Number of records at this stage: (63329, 8)
Final Schema:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- Passenger_count: double (nullable = true)
 |-- Trip_distance: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- Tip_amount: double (nullable = true)
 |-- Total_amount: double (nullable = true)



In [42]:
#remove trip with no passenger
print("-Removing rows with no passenger")
green_DF = green_DF[green_DF['passenger_count']!=0]
print("---DONE---")

print("Number of records at this stage:",(green_DF.count(), len(green_DF.columns)))

#remove outliers
print("-Removing rows with no passenger")

print("---DONE---")

print("Number of records at this stage:",(green_DF.count(), len(green_DF.columns)))


#negative values into positive ones
print("-Changing negative values into positive ones")
green_DF = green_DF.withColumn('Total_amount',abs(yellow_DF.Total_amount))

print("---DONE---")

-Removing rows with no passenger
---DONE---
Number of records at this stage: (63281, 8)
-Removing rows with no passenger
---DONE---
Number of records at this stage: (63281, 8)
-Changing negative values into positive ones


AttributeError: 'NoneType' object has no attribute 'Trip_distance'

In [None]:
green_DF = green_DF.withColumn('Total_amount',abs(yellow_DF.Total_amount))
yellow_DF.where(col('Trip_distance')<0).show()

In [None]:
green_DF = green_DF.toPandas().to_csv('data/cleaned/green/green_cleaned.csv', index = False)

## 4. Cleaning the YELLOW dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|VendorID|A code indicating the LPEP provider that provided the record.|Integer(tinyint)|1 or 2, Not Null|
|lpep_pickup_datetime|The date and time when the meter was engaged|Datetime|Not Null|
|lpep_dropoff_datetime|The date and time when the meter was disengaged|Datetime|Not Null|
|Passenger_count|The number of passengers in the vehicle|Integer(tinyint)|???|
|Trip_distance|The elapsed trip distance in miles reported by the taximeter|Decimal|   	|
|PULocationID|Zone in which the taximeter was engaged|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the taximeter was disengaged|Integer(smallint)|Not Null|
|RateCodeID|The final rate code in effect at the end of the trip|Integer(tinyint)|1 to 6, Not Null|
|Store_and_fwd_flag|This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,”because the vehicle did not have a connection to the server|Boolean|Y or N, Not Null|
|Payment_type|A numeric code signifying how the passenger paid for the trip|Integer(tinyint)|1 to 6, Not Null|
|Fare_amount|The time-and-distance fare calculated by the meter|Decimal|Not Null|
|Extra|Miscellaneous extras and surcharges|Decimal|   	|
|MTA_tax|0.50 MTA tax that is automatically triggered based on the metered rate in use|Decimal|   	|
|Improvement_surcharge|0.30 improvement surcharge assessed on hailed trips at the flag drop|Decimal|   	|
|Tip_amount|Tip amount – This field is automatically populated for credit card tips. Cash tips are not included|Decimal|   	|
|Tolls_amount|Total amount of all tolls paid in trip|Decimal|   	|
|Total_amount|The total amount charged to passengers|Decimal|   	|

### Validity rules

In T4, the analysis will rely mainly on 'pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','Passenger_count','Trip_distance','Tip_amount','Total_amount'. Therefore, we want these columns to be not null and under good format.

Action to be taken on yellow files:
- adjust schema and datatypes
- remove useless columns 
- remove rows containing null values for analysis central columns
- remove duplicate values
- check locationID consistency
- check and remove outliers in numeric columns
- change negative values into positive one
- remove trip with no passenger

### Identifying dirty records, data repairing

In [12]:
#here all columns that must be NOT NULL
column_list=['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID','Passenger_count','Trip_distance','Tip_amount','Total_amount']
list_files_yellow = []
path="data/integrated"
create_files_list(path,"green",list_files_yellow)
yellow_DF = (spark.read
                .option("sep", ",")
                .option("header", True)
                .option("inferSchema", True)
                .csv(list_files_yellow))

print("Number of records at this stage:",(yellow_DF.count(), len(yellow_DF.columns)))

#remove useless columns
print("-Removing useless columns")
yellow_DF= yellow_DF.select('pickup_datetime','dropoff_datetime','Passenger_count','Trip_distance','PULocationID','DOLocationID','Tip_amount','Total_amount')
print("---DONE---")

#replace 'null' by nothing
print("-Replacing null values by nothing")
yellow_DF=yellow_DF.na.fill("")
print("---DONE---")

#remove null values
print("-Removing rows with null values")
yellow_DF= yellow_DF.na.drop()
print("---DONE---")

print("Number of records at this stage:",(yellow_DF.count(), len(yellow_DF.columns)))

#remove duplicates
print("-Removing duplicate values")
yellow_DF= yellow_DF.dropDuplicates(column_list)
print("---DONE---")

#remove wrong location id
print("-Removing rows with wrong locID")
zones = gpd.read_file('data/metadata/taxi_zones.shp')
zones_df = spark.createDataFrame(zones).cache()
locID = zones_df.select(col("LocationID"))
locID = [row[0] for row in locID.select("LocationID").collect()]
yellow_DF = yellow_DF.where(col("PULocationid").isin(locID))
yellow_DF = yellow_DF.where(col("DOLocationID").isin(locID))
print("---DONE---")

print("Number of records at this stage:",(yellow_DF.count(), len(yellow_DF.columns)))

#remove dates out of the range solely for pickup_datetime,dropoff_datetime
print("-Removing rows with wrong dates")
date_in = pd.to_datetime('2009-01-01 00:00:00')
date_out =  pd.to_datetime('2020-07-01 00:00:00' )
yellow_DF = yellow_DF.filter(f.col("pickup_datetime").between(date_in,date_out))\
                .filter(f.col("dropoff_datetime").between(date_in,date_out))
print("---DONE---")

print("Number of records at this stage:",(yellow_DF.count(), len(yellow_DF.columns)))

print("Final Schema:")
yellow_DF.printSchema()

Number of records at this stage: (154014, 20)
-Removing useless columns
---DONE---
-Replacing null values by nothing
---DONE---
-Removing rows with null values
---DONE---
Number of records at this stage: (63647, 8)
-Removing duplicate values
---DONE---
-Removing rows with wrong locID
---DONE---
Number of records at this stage: (63330, 8)
-Removing rows with wrong dates
---DONE---
Number of records at this stage: (63330, 8)
Final Schema:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- Passenger_count: double (nullable = true)
 |-- Trip_distance: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- Tip_amount: double (nullable = true)
 |-- Total_amount: double (nullable = true)



In [None]:
#remove trip with no passenger
print("-Removing rows with no passenger")
yellow_DF = yellow_DF[yellow_DF['passenger_count']!=0]
print("---DONE---")

print("Number of records at this stage:",(yellow_DF.count(), len(yellow_DF.columns)))

#remove outliers
print("-Removing rows with no passenger")

print("---DONE---")

print("Number of records at this stage:",(yellow_DF.count(), len(yellow_DF.columns)))

#negative values into positive ones
print("-Changing negative values into positive ones")
yellow_DF = yellow_DF.withColumn('Trip_distance',abs(yellow_DF.Trip_distance))\
                    .withColumn('Total_amount',abs(yellow_DF.Total_amount))\
                    .withColumn('Tip_amount',abs(yellow_DF.Tip_amount))

yellow_DF.where(col('Trip_distance')<0).show()
print("---DONE---")

In [41]:
green_DF = green_DF.withColumn('Trip_distance',abs(yellow_DF.Trip_distance))\
                    .withColumn('Total_amount',abs(yellow_DF.Total_amount))\
                    .withColumn('Tip_amount',abs(yellow_DF.Tip_amount))

+---------------+----------------+---------------+-------------+------------+------------+----------+------------+
|pickup_datetime|dropoff_datetime|Passenger_count|Trip_distance|PULocationID|DOLocationID|Tip_amount|Total_amount|
+---------------+----------------+---------------+-------------+------------+------------+----------+------------+
+---------------+----------------+---------------+-------------+------------+------------+----------+------------+



In [13]:
yellow_DF = yellow_DF.toPandas().to_csv('data/cleaned/yellow/yellow_cleaned.csv', index = False)