## Course: Computing Foundation of Data Science
## Project Title: Big Data 
## Authors: Kubam Ivo Mbi and Berdai Hasnae


In [17]:
# Getting the necessary files ready
import os
import zipfile

# Extract the archive only when the extracted data folder is missing.
# Checking for 'Files/tlc_0.2perc' (rather than just 'Files') means that a
# previous run which created 'Files' but failed to unzip is retried instead
# of failing later in os.listdir().
# This assumes the zipped file is in the same location as this jupyter notebook.
if not os.path.isdir('Files/tlc_0.2perc'):
    os.makedirs('Files', exist_ok=True)
    with zipfile.ZipFile('tlc_0.2perc.zip') as zip_ref:
        zip_ref.extractall('Files/')

# Checking the total number of files
print("Total number of files: ", str(len(os.listdir('Files/tlc_0.2perc'))))

Total number of files:  281


## Task 2.1: Collecting metadata, inspecting schema evolution

In [18]:
# Creating a new spark session
from pyspark.sql import SparkSession

# Build the session with master 'local[*]' and application name 'TLC';
# getOrCreate() hands back the session configured by this builder chain.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('TLC')
    .getOrCreate()
)


In [93]:
# defining some key functions

#importing necessary libraries
import numpy as np
import shutil

#generator function
def file_content(main_folder):
    """Lazily yield the name of every entry found in ``main_folder``.

    The names come from ``os.listdir`` and are yielded in the order that
    call returns them; only bare names are produced, not full paths.
    """
    yield from os.listdir(main_folder)

def record_stat_schema(filepath):
    """Collect per-file statistics and group the files of a folder by schema.

    Every entry of ``filepath`` is read as a CSV file with a header row using
    the notebook-level ``spark`` session. For each file the number of records
    and the size on disk are recorded, and the lower-cased column names are
    taken as the file's schema.

    Side effect: each file is copied into ``Files/schema_v_<n>`` (relative to
    the current working directory). ``n`` starts at 1 and is incremented every
    time a file's schema differs from the schema of the previously processed
    file, so a schema that re-appears after a different one gets a new folder.
    The grouping therefore depends on the order in which ``file_content``
    yields the file names.

    Parameters
    ----------
    filepath : str
        Path of the folder containing the files to inspect.

    Returns
    -------
    tuple
        ``(records, sizes, schemas)`` where ``records`` is a NumPy array with
        the record count of each file (``df.count()``), ``sizes`` is a NumPy
        array with each file's ``os.stat(...).st_size``, and ``schemas`` is
        the list of distinct lower-cased column-name lists in first-seen
        order.
    """
    file_rec = []        # number of records of each file
    file_size = []       # size on disk of each file
    schema_list = []     # every distinct schema, in first-seen order
    current_schema = []  # schema of the previously processed file
    version = 0          # index of the schema folder currently being filled

    for file in file_content(filepath):
        # Build the path from the argument (not a hard-coded folder) so the
        # function works for whichever folder the caller passes in.
        file_path = os.path.join(filepath, file)
        df = spark.read.csv(file_path, header=True)

        # File statistics
        file_rec.append(df.count())
        file_size.append(os.stat(file_path).st_size)

        # Column names are compared case-insensitively
        schema_new = [column.lower() for column in df.columns]
        if schema_new not in schema_list:
            schema_list.append(schema_new)

        # A change of schema relative to the previous file opens a new version
        if schema_new != current_schema:
            version += 1
            current_schema = schema_new

        # Always copy the file, including when the version folder already
        # exists from an earlier run (previously that first file was skipped).
        target_dir = 'Files/schema_v_' + str(version)
        os.makedirs(target_dir, exist_ok=True)
        shutil.copy2(file_path, target_dir)

    return np.array(file_rec), np.array(file_size), schema_list
    


In [None]:
def cal_stat(file_rec, file_size):
    """Print summary statistics for record counts and file sizes.

    ``file_rec`` holds the number of records per file and ``file_size`` the
    size of each file. For both series the minimum, maximum, mean and the
    25th/50th/75th/90th percentiles are computed; the mean and percentiles
    are rounded to two decimals. The result is printed as one dictionary
    with the keys 'record stats' and 'size stats'. Nothing is returned.
    """
    percentile_points = (('25th', 25), ('50th', 50), ('75th', 75), ('90th', 90))

    def summarise(values):
        # Keys are inserted in the order they appear in the printed output.
        summary = {
            'min': min(values),
            'max': max(values),
            'mean': np.around(np.mean(values), 2),
        }
        for label, point in percentile_points:
            summary[label] = np.around(np.percentile(values, point), 2)
        return summary

    stat = {
        'record stats': summarise(file_rec),
        'size stats': summarise(file_size),
    }
    print(stat)

In [94]:
# Scan the extracted data folder. Following record_stat_schema's return order:
# a = number of records per file, b = size of each file, c = list of distinct schemas.
a,b,c = record_stat_schema('./Files/tlc_0.2perc')


In [18]:
# Print the summary statistics; a is passed as file_rec and b as file_size.
cal_stat(a,b)

{'record stats': {'min': 2512, 'max': 5959352, 'mean': 2152301.64, '25th': 257388.0, '50th': 1479679.0, '75th': 4181249.0, '90th': 5188938.0}, 'size stats': {'min': 15, 'max': 47703, 'mean': 17922.07, '25th': 3037.0, '50th': 19532.0, '75th': 28560.0, '90th': 31372.0}}


In [102]:
# Schema analysis
# Clean the column names in place (leading and trailing spaces removed) so
# the comparisons below, and any later use of c, see the stripped names.
for schema in c:
    for idx, column in enumerate(schema):
        schema[idx] = column.strip()

# Columns in schema 1 to 14 not in schema 15 (c[14] is the reference schema)
for position in range(14):
    ls_diff = [name for name in c[position] if name not in c[14]]
    print('schema :', position + 1)
    print(ls_diff)

schema : 1
['hvfhs_license_num', 'dispatching_base_num', 'pickup_datetime', 'dropoff_datetime', 'sr_flag']
schema : 2
['dispatching_base_num', 'pickup_date', 'locationid']
schema : 3
['dispatching_base_num', 'pickup_datetime', 'dropoff_datetime']
schema : 4
['dispatching_base_num', 'pickup_datetime', 'dropoff_datetime', 'sr_flag']
schema : 5
['pickup_datetime', 'dropoff_datetime', 'sr_flag', 'dispatching_base_number', 'dispatching_base_num']
schema : 6
['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'ehail_fee', 'trip_type']
schema : 7
['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'ehail_fee', 'trip_type']
schema : 8
['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'ehail_fee', 'trip_type']
schema : 9
['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'ehail_fee', 'trip_type']
schema : 10
['vendor_name', 'trip_pickup_datet

In [103]:
# Columns in schema 15 not in 1-14
# For each of the first 14 schemas, print the columns of the reference
# schema (c[14]) that the schema does not contain.
for position in range(14):
    print('schema', position + 1)
    missing = set(c[14]) - set(c[position])
    print(missing)

schema 1
{'fare_amount', 'improvement_surcharge', 'congestion_surcharge', 'passenger_count', 'tolls_amount', 'extra', 'payment_type', 'store_and_fwd_flag', 'total_amount', 'ratecodeid', 'mta_tax', 'trip_distance', 'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'tip_amount', 'vendorid'}
schema 2
{'fare_amount', 'improvement_surcharge', 'pulocationid', 'passenger_count', 'payment_type', 'total_amount', 'ratecodeid', 'dolocationid', 'tpep_pickup_datetime', 'vendorid', 'congestion_surcharge', 'tolls_amount', 'extra', 'store_and_fwd_flag', 'mta_tax', 'trip_distance', 'tpep_dropoff_datetime', 'tip_amount'}
schema 3
{'fare_amount', 'improvement_surcharge', 'congestion_surcharge', 'passenger_count', 'tolls_amount', 'extra', 'payment_type', 'store_and_fwd_flag', 'total_amount', 'ratecodeid', 'mta_tax', 'trip_distance', 'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'tip_amount', 'vendorid'}
schema 4
{'fare_amount', 'improvement_surcharge', 'congestion_surcharge', 'passenger_count', 'tolls_a

In [104]:
# Common columns in schema 15 and in 1-14
# For each of the first 14 schemas, list (in that schema's own column order)
# the columns that also appear in the reference schema c[14].
for position in range(14):
    ls_common = [name for name in c[position] if name in c[14]]
    print('schema :', position + 1)
    print(ls_common)

schema : 1
['pulocationid', 'dolocationid']
schema : 2
[]
schema : 3
['pulocationid', 'dolocationid']
schema : 4
['pulocationid', 'dolocationid']
schema : 5
['pulocationid', 'dolocationid']
schema : 6
['vendorid', 'store_and_fwd_flag', 'ratecodeid', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'total_amount', 'payment_type']
schema : 7
['vendorid', 'store_and_fwd_flag', 'ratecodeid', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'payment_type']
schema : 8
['vendorid', 'store_and_fwd_flag', 'ratecodeid', 'pulocationid', 'dolocationid', 'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge', 'total_amount', 'payment_type']
schema : 9
['vendorid', 'store_and_fwd_flag', 'ratecodeid', 'pulocationid', 'dolocationid', 'passenger_count', 'trip_distance', 'fare_amount', 'extra

In [100]:
# Merging files of Schema 4 and 6
source_dir = './Files/schema_v_6'
target_dir = './Files/schema_v_4'

# Only merge while the source folder still exists, so that re-running this
# cell after a successful merge is a no-op instead of a FileNotFoundError.
if os.path.isdir(source_dir):
    # Make sure the destination is a real folder: if it were missing,
    # shutil.move() would rename each file to the path 'schema_v_4' itself
    # instead of moving it into a directory.
    os.makedirs(target_dir, exist_ok=True)
    for file in os.listdir(source_dir):
        shutil.move(os.path.join(source_dir, file), target_dir)

    # delete the (now empty) schema v 6 folder
    os.rmdir(source_dir)

## 2.2 Data integration