# Sampled Dataset exploration, meta-data collection

$\color{red}{\text{ASSUMPTIONS!!!.}}$ 
<br>
$\color{red}{\text{We assume the zipped folder containing all the csv files is placed in the same location as this notebook. }}$ 
<br>
$\color{red}{\text{The current user has permission to copy and move files, and to create folders.}}$

In [2]:
# Imports go here

# Getting the necessary files ready
import os

# Provide the path to the zipped folder containing all csv files.
# This assumes the zipped file is in the same location as this jupyter notebook.
zip_folder_path = "./tlc_0.2perc.zip"


# An existing 'Files' folder is taken to mean the archive was already extracted,
# so extraction only happens when the folder is missing.
if not os.path.exists('Files'):
    import zipfile

    # Open the archive BEFORE creating the folder: if the zip is missing or
    # unreadable the exception is raised while 'Files' still does not exist,
    # so the next run retries instead of silently skipping the extraction.
    with zipfile.ZipFile(zip_folder_path) as zip_ref:
        os.mkdir('Files')
        zip_ref.extractall('Files/')


In [3]:
from pyspark.sql import SparkSession
# If a Spark session was already started in this kernel, stop it before starting
# a new one (there can be only one Spark context per Jupyter notebook).
try:
    spark
except NameError:
    # The name is not defined yet, i.e. no session was created in this kernel.
    pass
else:
    print("Spark application already started. Terminating existing application and starting new one")
    try:
        spark.stop()
    except Exception as stop_error:
        # Stopping stays best-effort, but the failure is reported instead of
        # being swallowed silently.
        print("Could not stop the existing Spark application:", stop_error)

# Create a new spark session (note, the * indicates to use all available CPU cores)
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("TLC")
    .getOrCreate()
)

sc = spark.sparkContext

In [4]:
# defining some key functions

#generator function
def file_content(main_folder):
    """Yield the name of every entry in ``main_folder``, in sorted order.

    Args:
        main_folder: Path of the folder to list.

    Yields:
        str: Entry names (not full paths), sorted lexicographically.

    Raises:
        OSError: Propagated from ``os.listdir`` (e.g. the folder does not exist).
    """
    # os.listdir() returns entries in arbitrary order; sorting gives callers a
    # reproducible sequence from one run (and one machine) to the next.
    yield from sorted(os.listdir(main_folder))
        


In [5]:

# Function to record file size and number of records

#importing necessary libraries
import numpy as np
def record_stat(filepath):
    """Measure every file in a folder: number of records and size on disk.

    Args:
        filepath: Path of the folder whose entries are measured.

    Returns:
        A pair of numpy arrays ``(records, sizes)``. ``records`` holds the row
        count Spark reports for each file when read as CSV with a header line;
        ``sizes`` holds ``os.stat(...).st_size`` for each file. Both arrays
        follow the order in which ``file_content`` yields the entries.
    """
    record_counts = []  # one row count per file
    byte_sizes = []     # one st_size value per file

    # Build full paths lazily, one entry at a time
    entries = (os.path.join(filepath, name) for name in file_content(filepath))
    for entry in entries:
        record_counts.append(spark.read.csv(entry, header=True).count())
        byte_sizes.append(os.stat(entry).st_size)

    return np.array(record_counts), np.array(byte_sizes)

In [6]:
# Function to calculate the required statistics

def cal_stat(file_rec, file_size):
    """Summarise per-file record counts and per-file sizes.

    Args:
        file_rec: Sequence with the number of records of each file.
        file_size: Sequence with the size of each file.

    Returns:
        dict: ``{'record stats': {...}, 'size stats': {...}}``. Each inner
        dictionary has the keys 'min', 'max', 'mean', '25th', '50th', '75th'
        and '90th'; the mean and the percentiles are rounded to 2 decimals.
    """
    def _summarise(values):
        # min/max are left unrounded; everything else is rounded to 2 decimals
        summary = {
            'min': min(values),
            'max': max(values),
            'mean': np.around(np.mean(values), 2),
        }
        for label, rank in (('25th', 25), ('50th', 50), ('75th', 75), ('90th', 90)):
            summary[label] = np.around(np.percentile(values, rank), 2)
        return summary

    return {
        'record stats': _summarise(file_rec),
        'size stats': _summarise(file_size),
    }

## Statistics about the dataset

Compute basic statistics about the number of files in this sub-dataset, their size, and the number of records (lines) in each file. For file size and number of records, give the min, max, mean, and the 25th, 50th, 75th and 90th percentile values.

In [7]:
# Checking the total number of files in the extracted sample
total_files = len(os.listdir('Files/tlc_0.2perc'))
print("Total number of files: ", str(total_files))

# Per-file record counts and file sizes for the whole sample
file_records, file_sizes = record_stat('./Files/tlc_0.2perc')

# View statistics
sample_stats = cal_stat(file_records, file_sizes)
print(sample_stats)

Total number of files:  281
{'record stats': {'min': 15, 'max': 47703, 'mean': 17922.07, '25th': 3037.0, '50th': 19532.0, '75th': 28560.0, '90th': 31372.0}, 'size stats': {'min': 2512, 'max': 5959352, 'mean': 2152301.64, '25th': 257388.0, '50th': 1479679.0, '75th': 4181249.0, '90th': 5188938.0}}
None


In [8]:
# Grouping the files into yellow, green, fhv, fhvhv.
import shutil

# Make sure one destination folder per trip type exists under 'Files'
for folder in ['yellow','green','FHV','FHVHV']:
    os.makedirs(os.path.join('Files', folder), exist_ok=True)

# (file-name prefix, destination folder). 'fhvhv' must be tested before 'fhv'
# because every name that starts with 'fhvhv' also starts with 'fhv'.
prefix_to_folder = [
    ('yellow', 'Files/yellow'),
    ('green', 'Files/green'),
    ('fhvhv', 'Files/FHVHV'),
    ('fhv', 'Files/FHV'),
]

# Copy each file exactly once into the folder matching its prefix
# (files matching no prefix are left alone).
for file in file_content('Files/tlc_0.2perc'):
    lowered = file.lower()
    for prefix, destination in prefix_to_folder:
        if lowered.startswith(prefix):
            shutil.copy2(os.path.join('Files/tlc_0.2perc', file), destination)
            break
            


In [9]:
# Statistics FHV
fhv_dir = 'Files/FHV'

# Checking the total number of files
fhv_file_count = len(os.listdir(fhv_dir))
print("Total number of files: ", str(fhv_file_count))

# Retrieving per-file record counts and sizes
file_records, file_sizes = record_stat(fhv_dir)

# View statistics
fhv_stats = cal_stat(file_records, file_sizes)
print(fhv_stats)

Total number of files:  64
{'record stats': {'min': 959, 'max': 47672, 'mean': 21712.62, '25th': 6030.5, '50th': 21692.0, '75th': 33844.75, '90th': 43708.9}, 'size stats': {'min': 52060, 'max': 3339455, 'mean': 1147257.84, '25th': 218111.25, '50th': 646615.0, '75th': 2257156.5, '90th': 3020144.5}}
None


In [10]:
# Statistics FHVHV
fhvhv_dir = './Files/FHVHV'

# Checking the total number of files
fhvhv_file_count = len(os.listdir(fhvhv_dir))
print("Total number of files: ", str(fhvhv_file_count))

# Retrieving per-file record counts and sizes
file_records, file_sizes = record_stat(fhvhv_dir)

# View statistics
fhvhv_stats = cal_stat(file_records, file_sizes)
print(fhvhv_stats)

Total number of files:  10
{'record stats': {'min': 8625, 'max': 47703, 'mean': 32181.9, '25th': 18022.0, '50th': 40714.0, '75th': 43056.25, '90th': 44922.0}, 'size stats': {'min': 535789, 'max': 2978931, 'mean': 2007750.7, '25th': 1121047.25, '50th': 2542280.0, '75th': 2687857.25, '90th': 2804332.8}}
None


In [11]:
# Statistics green
green_dir = './Files/green'

# Checking the total number of files
green_file_count = len(os.listdir(green_dir))
print("Total number of files: ", str(green_file_count))

# Retrieving per-file record counts and sizes
file_records, file_sizes = record_stat(green_dir)

# View statistics
green_stats = cal_stat(file_records, file_sizes)
print(green_stats)

Total number of files:  76
{'record stats': {'min': 15, 'max': 3546, 'mean': 2026.5, '25th': 1358.75, '50th': 2072.5, '75th': 2892.25, '90th': 3126.0}, 'size stats': {'min': 2512, 'max': 570765, 'mean': 262437.29, '25th': 121494.75, '50th': 190194.5, '75th': 456955.0, '90th': 499751.0}}
None


In [12]:
# Statistics yellow
yellow_dir = './Files/yellow'

# Checking the total number of files
yellow_file_count = len(os.listdir(yellow_dir))
print("Total number of files: ", str(yellow_file_count))

# Retrieving per-file record counts and sizes
file_records, file_sizes = record_stat(yellow_dir)

# View statistics
yellow_stats = cal_stat(file_records, file_sizes)
print(yellow_stats)

Total number of files:  131
{'record stats': {'min': 476, 'max': 32300, 'mean': 24203.51, '25th': 19989.0, '50th': 26294.0, '75th': 29080.0, '90th': 30202.0}, 'size stats': {'min': 43103, 'max': 5959352, 'mean': 3750759.69, '25th': 1756967.0, '50th': 4442047.0, '75th': 5123591.5, '90th': 5491438.0}}
None


## Analysis of the schema evolution.

Over time, the relational schema associated with each type of trip data (yellow, green, fhv, fhvhv) has changed. Let us analyze the changes.

## Auxiliary functions

In [13]:
# Clearing the working environment: delete the folder that held the extracted
# archive (its files were copied into the per-type folders earlier).
import shutil

# Guarded so that re-running this cell once the folder is gone is a no-op
# instead of raising FileNotFoundError.
if os.path.isdir('./Files/tlc_0.2perc'):
    shutil.rmtree('./Files/tlc_0.2perc')

In [14]:
# Code to help analyze the schema changes goes here

def schema(folder_ref):
    """Collect the distinct column layouts of the files in a folder and sort
    the files into version sub-folders.

    Every entry of ``folder_ref`` is read as a CSV with a header line and its
    column names are stripped and lower-cased. Entries are visited in the order
    given by ``file_content``; each time the column list differs from that of
    the previously visited entry, a new sub-folder ``v_<n>`` is created inside
    ``folder_ref`` and entries are moved there until the layout changes again.

    Args:
        folder_ref: Path of the folder containing the files.

    Returns:
        list: The distinct column lists, in order of first appearance.
    """
    distinct_schemas = []   # every column layout seen so far, without repeats
    previous_columns = []   # layout of the entry handled just before this one
    version_no = 0          # number of v_<n> folders created so far
    version_dir = ""        # folder the current run of entries is moved into

    for name in file_content(folder_ref):
        source = os.path.join(folder_ref, name)
        frame = spark.read.csv(source, header=True)

        # Normalise the header so case/whitespace differences are ignored
        columns = [column.strip().lower() for column in frame.columns]

        if columns not in distinct_schemas:
            distinct_schemas.append(columns)

        # A layout change relative to the previous entry opens a new version folder
        if columns != previous_columns:
            version_no += 1
            version_dir = os.path.join(folder_ref, 'v_' + str(version_no))
            os.mkdir(version_dir)
            previous_columns = columns

        shutil.move(source, version_dir)

    return distinct_schemas

### Analysis of schema changes for fhv cab data files

Analyze the schema changes for the FHV cab data files. Write down your conclusions

In [15]:
# Group the FHV files by schema version and report how many distinct schemas exist
schema_FHV = schema('./Files/FHV')
fhv_schema_count = len(schema_FHV)
print(fhv_schema_count)

4


In [16]:
# Show each distinct FHV column layout, separated by blank lines
for fhv_columns in schema_FHV:
    print(fhv_columns, '\n')

['dispatching_base_num', 'pickup_date', 'locationid'] 

['dispatching_base_num', 'pickup_datetime', 'dropoff_datetime', 'pulocationid', 'dolocationid'] 

['dispatching_base_num', 'pickup_datetime', 'dropoff_datetime', 'pulocationid', 'dolocationid', 'sr_flag'] 

['pickup_datetime', 'dropoff_datetime', 'pulocationid', 'dolocationid', 'sr_flag', 'dispatching_base_number', 'dispatching_base_num'] 



### Analysis of schema changes for fhvhv data files

Analyze the schema changes for the FHVHV data files. Write down your conclusions

In [17]:
# Group the FHVHV files by schema version and report how many distinct schemas exist
schema_FHVHV = schema('./Files/FHVHV')
fhvhv_schema_count = len(schema_FHVHV)
print(fhvhv_schema_count)

1


In [18]:
# Show each distinct FHVHV column layout, separated by blank lines
for fhvhv_columns in schema_FHVHV:
    print(fhvhv_columns, '\n')

['hvfhs_license_num', 'dispatching_base_num', 'pickup_datetime', 'dropoff_datetime', 'pulocationid', 'dolocationid', 'sr_flag'] 



### Analysis of schema changes for green cab data files

Analyze the schema changes for the green taxi data files. Write down your conclusions

In [19]:
# Group the green files by schema version and report how many distinct schemas exist
schema_green = schema('./Files/green')
green_schema_count = len(schema_green)
print(green_schema_count)

4


In [20]:
# Show each distinct green column layout, separated by blank lines
for green_columns in schema_green:
    print(green_columns, '\n')

['vendorid', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'ratecodeid', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'total_amount', 'payment_type', 'trip_type'] 

['vendorid', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'ratecodeid', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_type', 'trip_type'] 

['vendorid', 'lpep_pickup_datetime', 'lpep_dropoff_datetime', 'store_and_fwd_flag', 'ratecodeid', 'pulocationid', 'dolocationid', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge', 'total_amount', 'payment_typ

### Analysis of schema changes for yellow cab data files

Analyze the schema changes for the Yellow taxi data files. Write down your conclusions

In [21]:
# Group the yellow files by schema version and report how many distinct schemas exist
schema_yellow = schema('./Files/yellow')
yellow_schema_count = len(schema_yellow)
print(yellow_schema_count)

5


In [22]:
# Show each distinct yellow column layout, separated by blank lines
for yellow_columns in schema_yellow:
    print(yellow_columns, '\n')

['vendor_name', 'trip_pickup_datetime', 'trip_dropoff_datetime', 'passenger_count', 'trip_distance', 'start_lon', 'start_lat', 'rate_code', 'store_and_forward', 'end_lon', 'end_lat', 'payment_type', 'fare_amt', 'surcharge', 'mta_tax', 'tip_amt', 'tolls_amt', 'total_amt'] 

['vendor_id', 'pickup_datetime', 'dropoff_datetime', 'passenger_count', 'trip_distance', 'pickup_longitude', 'pickup_latitude', 'rate_code', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount', 'surcharge', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount'] 

['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 'trip_distance', 'pickup_longitude', 'pickup_latitude', 'ratecodeid', 'store_and_fwd_flag', 'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount'] 

['vendorid', 'tpep_pickup_datetime', 'tpep_dropoff_datetime', 'passenger_count', 