# Data cleaning

In [2]:
# Imports go here
import os
import csv
import glob
import pandas as pd
import os 
import shutil
import datetime
import geopandas as gpd
from datetime import date
from datetime import datetime
from pyspark.sql.functions import col, lit
import pyspark.sql.functions as f
import pyspark.sql.types
from shutil import copyfile
from shapely.geometry import Point
os.environ['PYSPARK_SUBMIT_ARGS'] ="--conf spark.driver.memory=3g  pyspark-shell"
from pyspark.sql import SparkSession
# Stop any Spark session left over from a previous run of this cell so that the
# session created below starts from a clean state.
try:
    spark
except NameError:
    # `spark` is not defined yet (fresh kernel): there is nothing to stop.
    pass
else:
    print("Spark application already started. Terminating existing application and starting new one")
    spark.stop()
# Create a new spark session (note, the * indicates to use all available CPU cores)
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("H600 L-Group")
    .getOrCreate()
)
# When dealing with RDDs, we work with the sparkContext object.
# See https://spark.apache.org/docs/latest/api/python/pyspark.html#pyspark.SparkContext
sc = spark.sparkContext
# In local mode, you will be able to access the Spark GUI at http://localhost:4040
#in local mode, you will be able to access the Spark GUI at http://localhost:4040

## Auxiliary functions

In [None]:
#2. Remove useless columns from a CSV file, streaming it row by row
input_file = 'data/test/fhv_tripdata_2020-04.csv'
output_file = 'data/testout/fhv_tripdata_2020-04.csv'
cols_to_remove = [5] # Column indexes to be removed (starts at 0)

# De-duplicate, then delete from the highest index first so that an earlier
# deletion never shifts the position of a column that is still to be removed.
cols_to_remove = sorted(set(cols_to_remove), reverse=True)
row_count = 0 # Current amount of rows processed
progress_every = 10000 # Print the progress counter once every this many rows

# open(..., "w") does not create missing folders, so create the destination first.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# newline='' on both files lets the csv module handle line endings itself.
with open(input_file, "r", newline='') as source, \
        open(output_file, "w", newline='') as result:
    reader = csv.reader(source)
    writer = csv.writer(result)
    for row in reader:
        row_count += 1
        if row_count % progress_every == 0:
            print('\r{0}'.format(row_count), end='') # Print rows processed
        for col_index in cols_to_remove:
            # Blank or short rows do not have every column: skip instead of crashing.
            if -len(row) <= col_index < len(row):
                del row[col_index]
        writer.writerow(row)

print('\r{0}'.format(row_count)) # Final number of rows processed

#or
            
def drop_unused_feat(df, features):
    """
    Strip from the dataset the columns that are not needed by the later analysis,
    so that only the essential content is kept.

    Input: df, the dataframe to prepare; features, the name(s) of the columns to drop

    Output: the dataframe returned by DataFrame.drop, i.e. without the listed columns.
    """
    # `columns=` is the keyword spelling of dropping labels along axis=1
    return df.drop(columns=features)

In [9]:
# Auxiliary code to help in the data cleaning process goes here

#1. check and remove duplicate records

df = pd.read_csv("data/sampled/fhv_tripdata_2020-04.csv")
ids = df['pickup_datetime']
# Rows whose pickup_datetime occurs more than once, ordered by that timestamp.
# DataFrame.sort no longer exists in pandas; sort_values is its replacement.
df[ids.isin(ids[ids.duplicated()])].sort_values('pickup_datetime')

# NOTE(review): 'list-history.csv' and its 'Keywords' column do not look related
# to the taxi datasets, and this overwrites the FHV frame loaded above - confirm
# whether this snippet should target the FHV data instead.
df = pd.read_csv('list-history.csv')
df = df.drop_duplicates(subset=['Keywords'], keep='first')
print(df)

#no passenger
dfg=pd.read_csv("data/sampled/green_tripdata_2020-04.csv")
print('Proportion of trips witout any passenger')
# Percentage of rows whose passenger_count is exactly 0, computed on the frame
# loaded just above (the previously referenced df_feb_2018 was never defined).
ratio = dfg[dfg['passenger_count']==0].shape[0]/dfg.shape[0]*100

print("{:.2f}".format(ratio),'%')


123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277

In [None]:
#5 calculate trip duration
def trip_duration_calc(pu_time, do_time):
    """
    Compute the duration of a trip in seconds.

    Input: pu_time, the pick-up time; do_time, the drop-off time. Both must be
           values whose difference is a timedelta (e.g. pandas Timestamps or
           datetime Series).

    Output: do_time - pu_time expressed as a number of seconds (negative when the
            drop-off precedes the pick-up).
    """
    trip_duration = do_time - pu_time
    # Divide by a one-second pd.Timedelta: pandas is already imported by the
    # notebook, whereas numpy (np.timedelta64) never is.
    return trip_duration / pd.Timedelta(seconds=1)

In [None]:
#3. identify non consistent data
#check consistency between total amount and all fees (yellow and green)
#check consistency regarding datatype

#4 fill in with data
# for numeric or fees => mean of all the rest
# for other => delete the row ?

In [12]:
# Share of the sampled yellow-taxi trips recorded with passenger_count == 0, in percent
dfg = pd.read_csv("data/sampled/yellow_tripdata_2020-04.csv")
print('Proportion of trips witout any passenger')
empty_trips = dfg[dfg['passenger_count'] == 0]
ratio = len(empty_trips) / len(dfg) * 100

ratio_text = "{:.2f}".format(ratio)
print(ratio_text, '%')

Proportion of trips witout any passenger
3.57 %


In [None]:
def handle_no_passenger(df):
    """
    This function splits the dataframe into two dataframes:
    - one with only the trips with no passengers (passenger_count == 0)
    - one with all the other trips

    Rows are selected with a boolean mask, so the split is correct even when the
    dataframe index contains duplicate labels. Rows whose passenger_count is
    missing are not equal to 0 and therefore end up in the second dataframe.

    Input: the dataframe to prepare (must have a 'passenger_count' column)

    Output: a tuple (df_empty, df_full): first the 'empty trips', then the
            remaining trips.
    """
    is_empty = df['passenger_count'] == 0

    df_empty = df[is_empty]
    df_full = df[~is_empty]

    return df_empty, df_full

• Yellow taxi records are records that record trip information of New York's famous yellow
taxi cars.

• Green taxi records are records that record trip information by so-called 'boro' taxis, a
newer service introduced in August of 2013 to improve taxi service and availability in the
boroughs.

• FHV records (short for 'For Hire Vehicles') record information from services that offer
for-hire vehicles (such as Uber, Lyft, Via, and Juno), but also luxury limousine bases.

• High volume FHV (FHVHV for short) are FHV records offered by services that make
more than 10,000 trips per day.

## 1. Cleaning the FHV dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|Dispatching_base_num|License Number of the base that dispatched the trip|String|Not Null|
|Pickup_datetime|The date and time of the trip pick-up|Datetime|Not Null|
|DropOff_datetime|The date and time of the trip dropoff|Datetime|Not Null|
|PULocationID|Zone in which the trip began|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the trip ended|Integer(smallint)|Not Null|
|SR_Flag|Indicates if the trip was a part of a shared ride chain offered by a High Volume FHV company (e.g. Uber Pool, Lyft Line); share=1, nonshared=0|Boolean|Not Null| 

In [28]:
# Spark alternative for loading the FHV sample (kept for reference):
#fhv_DF = (spark.read
#    .option("sep", ",")
#    .option("header", True)
#    .option("inferSchema", True)
#    .csv('data/sampled/fhv_tripdata_2020-04.csv'))
#fhv_DF.printSchema() 

# Load the FHV sample explicitly so this cell does not depend on whatever an
# earlier cell last left in `df`.
df = pd.read_csv('data/sampled/fhv_tripdata_2020-04.csv')

# NOTE(review): 99 is a numeric placeholder written into a column documented as
# String - confirm this is the intended marker for a missing base number.
df['dispatching_base_num'] = df['dispatching_base_num'].fillna(99)

#np.where(condition, then, else)

# Last expression of the cell, so the preview is actually displayed.
df.head(20)

Unnamed: 0,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag
0,B02794,2020-04-27 20:53:10,2020-04-27 20:58:10,264.0,76.0,
1,B00647,2020-04-06 13:32:04,2020-04-06 13:40:24,264.0,174.0,
2,B01340,2020-04-27 14:50:35,2020-04-27 14:59:24,264.0,242.0,
3,B01437,2020-04-21 13:46:59,2020-04-21 13:56:44,264.0,131.0,
4,B02563,2020-04-28 06:23:10,2020-04-28 06:31:40,264.0,198.0,
5,B01145,2020-04-16 21:41:03,2020-04-16 21:56:24,264.0,31.0,
6,B00937,2020-04-17 20:48:48,2020-04-17 20:56:29,264.0,243.0,
7,B02782,2020-04-09 05:23:00,2020-04-09 05:27:00,,,
8,B02794,2020-04-03 21:21:11,2020-04-03 21:25:56,264.0,225.0,
9,B00937,2020-04-19 10:56:43,2020-04-19 11:08:01,264.0,243.0,


### Validity rules

NameError: name 'fhv_DF' is not defined

### Identifying dirty records, data repairing

## 2. Cleaning the FHVHV dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|Hvfhs_license_num|TLC license number of the HVFHS base or business|String|Not Null|
|Dispatching_base_num|License Number of the base that dispatched the trip|String|Not Null|
|Pickup_datetime|The date and time of the trip pick-up|Datetime|Not Null|
|DropOff_datetime|The date and time of the trip dropoff|Datetime|Not Null|
|PULocationID|Zone in which the trip began|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the trip ended|Integer(smallint)|Not Null|
|SR_Flag|Indicates if the trip was a part of a shared ride chain offered by a High Volume FHV company (e.g. Uber Pool, Lyft Line); share=1, nonshared=0|Boolean|Not Null| 


Hvfhs_license_num possible values:

- HV0002: Juno
- HV0003: Uber
- HV0004: Via
- HV0005: Lyft

### Validity rules

### Identifying dirty records, data repairing

## 3. Cleaning the GREEN dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|VendorID|A code indicating the LPEP provider that provided the record.|Integer(tinyint)|1 or 2, Not Null|
|lpep_pickup_datetime|The date and time when the meter was engaged|Datetime|Not Null|
|lpep_dropoff_datetime|The date and time when the meter was disengaged|Datetime|Not Null|
|Passenger_count|The number of passengers in the vehicle|Integer(tinyint)|???|
|Trip_distance|The elapsed trip distance in miles reported by the taximeter|Decimal|   	|
|PULocationID|Zone in which the taximeter was engaged|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the taximeter was disengaged|Integer(smallint)|Not Null|
|RateCodeID|The final rate code in effect at the end of the trip|Integer(tinyint)|1 to 6, Not Null|
|Store_and_fwd_flag|This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,” because the vehicle did not have a connection to the server|Boolean|Y or N, Not Null|
|Payment_type|A numeric code signifying how the passenger paid for the trip|Integer(tinyint)|1 to 6, Not Null|
|Fare_amount|The time-and-distance fare calculated by the meter|Decimal|Not Null|
|Extra|Miscellaneous extras and surcharges|Decimal|   	|
|MTA_tax|0.50 MTA tax that is automatically triggered based on the metered rate in use|Decimal|   	|
|Improvement_surcharge|0.30 improvement surcharge assessed on hailed trips at the flag drop|Decimal|   	|
|Tip_amount|Tip amount – This field is automatically populated for credit card tips. Cash tips are not included|Decimal|   	|
|Tolls_amount|Total amount of all tolls paid in trip|Decimal|   	|
|Total_amount|The total amount charged to passengers|Decimal|   	|
|Trip_type|A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver|Boolean|1 or 2, Not Null|

### Validity rules

### Identifying dirty records, data repairing

## 4. Cleaning the YELLOW dataset

### Analysis of valid values

|Column Value|Description|Data Type|Constraints|
|---	|---	|---	|---	|
|VendorID|A code indicating the TPEP provider that provided the record.|Integer(tinyint)|1 or 2, Not Null|
|tpep_pickup_datetime|The date and time when the meter was engaged|Datetime|Not Null|
|tpep_dropoff_datetime|The date and time when the meter was disengaged|Datetime|Not Null|
|Passenger_count|The number of passengers in the vehicle|Integer(tinyint)|???|
|Trip_distance|The elapsed trip distance in miles reported by the taximeter|Decimal|   	|
|PULocationID|Zone in which the taximeter was engaged|Integer(smallint)|Not Null|
|DOLocationID|Zone in which the taximeter was disengaged|Integer(smallint)|Not Null|
|RateCodeID|The final rate code in effect at the end of the trip|Integer(tinyint)|1 to 6, Not Null|
|Store_and_fwd_flag|This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka “store and forward,” because the vehicle did not have a connection to the server|Boolean|Y or N, Not Null|
|Payment_type|A numeric code signifying how the passenger paid for the trip|Integer(tinyint)|1 to 6, Not Null|
|Fare_amount|The time-and-distance fare calculated by the meter|Decimal|Not Null|
|Extra|Miscellaneous extras and surcharges|Decimal|   	|
|MTA_tax|0.50 MTA tax that is automatically triggered based on the metered rate in use|Decimal|   	|
|Improvement_surcharge|0.30 improvement surcharge assessed on hailed trips at the flag drop|Decimal|   	|
|Tip_amount|Tip amount – This field is automatically populated for credit card tips. Cash tips are not included|Decimal|   	|
|Tolls_amount|Total amount of all tolls paid in trip|Decimal|   	|
|Total_amount|The total amount charged to passengers|Decimal|   	|

### Validity rules

### Identifying dirty records, data repairing