## Course: Computing Foundation of Data Science
## Project Title: Big Data 
## Authors: Kubam Ivo Mbi and Berdai Hasnae
## Date: 30/11/2020


In [3]:
# Getting the necessary files ready
import os
# Check wether folder exist if not create
if os.path.exists('Files'):
    pass
else:
    os.mkdir('Files')
    # unzip the files
    # This line of code assumes the zipped file is in the same location as this jupyter notebook
    import zipfile
    
    with zipfile.ZipFile('tlc_0.2perc.zip') as zip_ref:
        zip_ref.extractall('Files/')
#Checking the total number of files
print("Total number of files: ", str(len(os.listdir('Files/tlc_0.2perc'))))

Total number of files:  281


## Task 2.1: Collecting metadata, inspecting schema evolution

In [4]:
# Creating a new spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('TLC') \
    .getOrCreate()


In [5]:
# defining some key functions

#importing necessary libraries
import numpy as np
import shutil

#generator function
def file_content(main_folder):
    file_list = os.listdir(main_folder)
    for file_name in file_list:
        yield file_name

def record_stat_schema(filepath):
    """Function receives a file path of a folder containing files as its argument. For each file,\
    the number of records and files size is calculated and two lists are return containing all these values.\
    This function also checks the schema of these files and group them by their schema"""
    
    file_size = [] #list to hold file size 
    file_rec = [] #list to hold file records
    schema_list = [] # list to hold all unique schema
    current_schema = [] #
    schema_list = []
    version = 0
    
    for file in file_content(filepath):
        file_path = os.path.join('Files/tlc_0.2perc', file)
        df = spark.read.csv(file_path, header=True)
        
        #Checking file size and number of records
        file_rec.append(df.count()) # append number of records
       # x = os.stat(file_path).st_size
        file_size.append(os.stat(file_path).st_size) # append each number of records
        
        #Checking the schema
        
        schema_new = list(df.columns)
                          
        for x in range(len(schema_new)):
            schema_new[x] = schema_new[x].lower() # set the columns names to lowercase()
           
        if schema_new not in schema_list:
            schema_list.append(schema_new)
           
                      
        if schema_new == current_schema:
            shutil.copy2(file_path,'Files/schema_v_'+ str(version))
        elif schema_new != current_schema:
            version += 1
            if os.path.exists('Files/schema_v_'+ str(version)):
                pass
            else:
                os.mkdir('Files/schema_v_'+ str(version))
                shutil.copy2(file_path,'Files/schema_v_'+ str(version))
            current_schema = schema_new
            
    return  np.array(file_rec) , np.array(file_size), schema_list
    


In [6]:
def cal_stat(file_rec, file_size):
    ''' Function receivs two lists containing all file sizes and number of records respectively./
    It then calculates all the necessary statistics.'''
    
    stat = {} # Dictionary to hold statistics for number of records and file sizes
    records = {} # Dictionary to hold statistics for number of records
    size = {} # Dictionary to hold statistics for file sizes
    
    for x in ['min','max','mean','25th','50th','75th','90th']:
        if x == 'min':
            records[x] = min(file_rec)
            size[x] = min(file_size)
        elif x == 'max':
            records[x] = max(file_rec)
            size[x] = max(file_size)
        elif x == 'mean':
            records[x] = np.around(np.mean(file_rec),2)
            size[x] = np.around(np.mean(file_size),2)
        elif x == '25th':
            records[x] = np.around(np.percentile(file_rec,25),2)
            size[x] = np.around(np.percentile(file_size,25),2)
        elif x == '50th':
            records[x] = np.around(np.percentile(file_rec,50),2)
            size[x] = np.around(np.percentile(file_size,50),2)
        elif x == '75th':
            records[x] = np.around(np.percentile(file_rec,75),2)
            size[x] = np.around(np.percentile(file_size,75),2)
        elif x == '90th':
            records[x] = np.around(np.percentile(file_rec,90),2)
            size[x] = np.around(np.percentile(file_size,90),2)
        stat['record stats'] = records
        stat['size stats'] = size
    return print(stat)

In [7]:
# Retrieving values
file_records, file_sizes, schema_list = record_stat_schema('./Files/tlc_0.2perc')


In [9]:
#View statistis
print(cal_stat(file_records,file_sizes))

{'record stats': {'min': 15, 'max': 47703, 'mean': 17922.07, '25th': 3037.0, '50th': 19532.0, '75th': 28560.0, '90th': 31372.0}, 'size stats': {'min': 2512, 'max': 5959352, 'mean': 2152301.64, '25th': 257388.0, '50th': 1479679.0, '75th': 4181249.0, '90th': 5188938.0}}
None


In [10]:
#Schema analysis
for x in range(len(schema_list)):
    for y in range(len(schema_list[x])):
        schema_list[x][y] = schema_list[x][y].strip() #removing leading and trailing spaces in column names

# Columns in schema 1 to 14 not in schema 15
with open('./Files/schema_diff_others_vs_15.txt', 'w') as f:
    for x in range(14):
        ls_diff = []
        for elem in schema_list[x]:
            if elem not in schema_list[14]:
                ls_diff.append(elem)
         
        
        f.write('./Files/schema: '+str(x +1)+'\n')
        f.write(''.join(ls_diff)+'\n')
f.close()    

In [11]:
#Columns in schema 15 not in 1-14

with open('./Files/schema_diff_15_vs_others.txt', 'w') as f:
    for x in range(14):
        ls_diff2 = []
        ls_diff2.append(set(schema_list[14])-set(schema_list[x]))
        #print(ls_diff2)
        ls_diff2 = ''.join(str(e) for e in ls_diff2)
        f.write('schema: '+str(x +1)+'\n')
        f.write(ls_diff2+'\n')
f.close()

In [13]:
# Common columns in schema 15 and in 1-14
with open('./Files/schema_common.txt', 'w') as f:
    for x in range(14):
        ls_common = []
        for elem in schema_list[x]:
            if elem in schema_list[14]:
                ls_common.append(elem)
                
        f.write('schema: '+str(x +1)+'\n')
        f.write(''.join(ls_common)+'\n')
f.close()  

Files in schema 4 and 6 were found to be of the same schema. So a merging of the two folders was done

In [39]:
# Merging files of Schema 4 and 6
for file in os.listdir('./Files/schema_v_6'):
    shutil.move(os.path.join('Files/schema_v_6', file),'./Files/schema_v_4')
    
#delete the schema v 6 folder  
os.rmdir('./Files/schema_v_6')

In [37]:
# The folder created to hold zipped is deleted.
# Clearing working environment
import shutil
shutil.rmtree('./Files/tlc_0.2perc')

## 2.2 Data integration

In [370]:
from pyspark.sql import *
from pyspark.sql.functions import *
# Function to fix column names

def column_name_fix(df):
    '''Function converts column names to lowercase'''
    colnames = df.columns
    for x in range(len(colnames)):
        colnames[x] =  colnames[x].lower()
    df = df.toDF(*colnames)
    return df

In [348]:
# Function to drop column
def column_drop(df,drop_list):
    '''Function receives a spark dataframe and list of columns to be dropped.
    It returns a dataframe less the columns specified to be dropped'''
    y = []
    for x in drop_list:
        y.append(df.columns[x])
    return df.drop(*y)
     

In [343]:
# Function to rename column
def column_rename(df,col_list1,col_list2):
    '''Function renames the column name of a dataframe with a provided list of column names.
        The two lists must be of the order i.e. value one in list one replaces value one in list two and so on'''
    if len(col_list1) == len(col_list2):
        for x in range(len(col_list1)):
            df = df.select('*', df[col_list1[x]].alias(col_list2[x]))
        df = column_drop(df,col_list1)
        return df
    else:
        print('length of two list must be thesame.')
        pass
     


In [46]:
# Add columns to a dataframe
from pyspark.sql.functions import lit
def column_add(df,col_list):
    
    for x in col_list:
        df = df.withColumn(x, lit(""))
    return df

df = spark.read.csv('fhvhv_tripdata_2019-02.csv', header=True)
col_list = ['fare_amount', 'improvement_surcharge', 'congestion_surcharge', 'passenger_count']


In [47]:
# Function to convert latitude and longitude to location ID

Fixing the files Based on the schema differences identified above. The schema of folder 16 will be used as the standard or
baseline schema because it is the latest. So changess are made to files based on their current schema differences with the baseline schema


In [3]:
# Creating a folder to hold integrated files
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files'):
    pass
else:
    os.mkdir('Files/integrated_files')

In [4]:
# Schema One
import os
if os.path.exists('Files/integrated_files/FHV/Schema_v_1'):
    pass
else:
    os.mkdir('Files/integrated_files/FHV/Schema_v_1')

# Columns to be renamed
col_list1 = [2, 3]
col_list2 = ['tpep_pickup_datetime','tpep_dropoff_datetime']
# columns to be added
col_add = ['fare_amount', 'improvement_surcharge', 'congestion_surcharge', 'passenger_count', 'tolls_amount', 'extra', 'payment_type', 'store_and_fwd_flag', 'total_amount', 'ratecodeid', 'mta_tax', 'trip_distance', 'tpep_dropoff_datetime', 'tpep_pickup_datetime', 'tip_amount', 'vendorid']
folder_path = 'Files/schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join('Files/schema_v_1', file)
        df = column_name_fix(spark.read.csv(file_path, header=True))# The read process pass via column_name_fix function
        df = column_rename(df,col_list1,col_list2)
        df = column_add(df, col_add)
        df.toPandas().to_csv(os.path.join('Files/integrated_files/FHV/Schema_v_1', file))
        # write code to export to csv        
       

FileNotFoundError: [Errno 2] No such file or directory: 'Files/integrated_files/FHV/Schema_v_1'

$\color{red}{\text{WARNING!!!.}}$ $\color{red}{\text{" This action will clear all folders and their contents created during project."}}$

In [3]:
# Clearing working environment
import shutil
shutil.rmtree('./Files')