## Course: Computing Foundation of Data Science
## Project Title: Big Data 
## Authors: Kubam Ivo Mbi and Berdai Hasnae
## Date: 30/11/2020


In [4]:
# Getting the necessary files ready
import os
# Check wether folder exist if not create
if os.path.exists('Files'):
    pass
else:
    os.mkdir('Files')
    # unzip the files
    # This line of code assumes the zipped file is in the same location as this jupyter notebook
    import zipfile
    
    with zipfile.ZipFile('tlc_0.2perc.zip') as zip_ref:
        zip_ref.extractall('Files/')
#Checking the total number of files
print("Total number of files: ", str(len(os.listdir('Files/tlc_0.2perc'))))

Total number of files:  281


## Task 2.1: Collecting metadata, inspecting schema evolution

In [5]:
# Creating a new spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .master('local[*]') \
    .appName('TLC') \
    .getOrCreate()


In [6]:
# defining some key functions

#importing necessary libraries
import numpy as np
import shutil

#generator function
def file_content(main_folder):
    file_list = os.listdir(main_folder)
    for file_name in file_list:
        yield file_name

def record_stat_schema(filepath):
    """Function receives a file path of a folder containing files as its argument. For each file,\
    the number of records and files size is calculated and two lists are return containing all these values.\
    This function also checks the schema of these files and group them by their schema"""
    
    file_size = [] #list to hold file size 
    file_rec = [] #list to hold file records
    schema_list = [] # list to hold all unique schema
    current_schema = [] #
    schema_list = []
    version = 0
    
    for file in file_content(filepath):
        file_path = os.path.join('Files/tlc_0.2perc', file)
        df = spark.read.csv(file_path, header=True)
        
        #Checking file size and number of records
        file_rec.append(df.count()) # append number of records
       # x = os.stat(file_path).st_size
        file_size.append(os.stat(file_path).st_size) # append each number of records
        
        #Checking the schema
        
        schema_new = list(df.columns)
                          
        for x in range(len(schema_new)):
            schema_new[x] = schema_new[x].lower() # set the columns names to lowercase()
           
        if schema_new not in schema_list:
            schema_list.append(schema_new)
           
                      
        if schema_new == current_schema:
            shutil.copy2(file_path,'Files/schema_v_'+ str(version))
        elif schema_new != current_schema:
            version += 1
            if os.path.exists('Files/schema_v_'+ str(version)):
                pass
            else:
                os.mkdir('Files/schema_v_'+ str(version))
                shutil.copy2(file_path,'Files/schema_v_'+ str(version))
            current_schema = schema_new
            
    return  np.array(file_rec) , np.array(file_size), schema_list
    


In [7]:
def cal_stat(file_rec, file_size):
    ''' Function receivs two lists containing all file sizes and number of records respectively./
    It then calculates all the necessary statistics.'''
    
    stat = {} # Dictionary to hold statistics for number of records and file sizes
    records = {} # Dictionary to hold statistics for number of records
    size = {} # Dictionary to hold statistics for file sizes
    
    for x in ['min','max','mean','25th','50th','75th','90th']:
        if x == 'min':
            records[x] = min(file_rec)
            size[x] = min(file_size)
        elif x == 'max':
            records[x] = max(file_rec)
            size[x] = max(file_size)
        elif x == 'mean':
            records[x] = np.around(np.mean(file_rec),2)
            size[x] = np.around(np.mean(file_size),2)
        elif x == '25th':
            records[x] = np.around(np.percentile(file_rec,25),2)
            size[x] = np.around(np.percentile(file_size,25),2)
        elif x == '50th':
            records[x] = np.around(np.percentile(file_rec,50),2)
            size[x] = np.around(np.percentile(file_size,50),2)
        elif x == '75th':
            records[x] = np.around(np.percentile(file_rec,75),2)
            size[x] = np.around(np.percentile(file_size,75),2)
        elif x == '90th':
            records[x] = np.around(np.percentile(file_rec,90),2)
            size[x] = np.around(np.percentile(file_size,90),2)
        stat['record stats'] = records
        stat['size stats'] = size
    return print(stat)

In [8]:
a,b,c = record_stat_schema('./Files/tlc_0.2perc')


In [9]:
cal_stat(a,b)

{'record stats': {'min': 15, 'max': 47703, 'mean': 17922.07, '25th': 3037.0, '50th': 19532.0, '75th': 28560.0, '90th': 31372.0}, 'size stats': {'min': 2512, 'max': 5959352, 'mean': 2152301.64, '25th': 257388.0, '50th': 1479679.0, '75th': 4181249.0, '90th': 5188938.0}}


In [40]:
#Schema analysis
for x in range(len(c)):
    for y in range(len(c[x])):
        c[x][y] = c[x][y].strip() #removing leading and trailing spaces

# Columns in schema 1 to 14 not in schema 15
with open('./Files/schema_diff_others_vs_15.txt', 'w') as f:
    for x in range(14):
        ls_diff = []
        for elem in c[x]:
            if elem not in c[14]:
                ls_diff.append(elem)
         
        
        f.write('./Files/schema: '+str(x +1)+'\n')
        f.write(''.join(ls_diff)+'\n')
f.close()    

In [37]:
#Columns in schema 15 not in 1-14

with open('./Files/schema_diff_15_vs_others.txt', 'w') as f:
    for x in range(14):
        ls_diff2 = []
        ls_diff2.append(set(c[14])-set(c[x]))
        #print(ls_diff2)
        ls_diff2 = ''.join(str(e) for e in ls_diff2)
        f.write('schema: '+str(x +1)+'\n')
        f.write(ls_diff2+'\n')
f.close()

In [38]:
# Common columns in schema 15 and in 1-14
with open('./Files/schema_common.txt', 'w') as f:
    for x in range(14):
        ls_common = []
        for elem in c[x]:
            if elem in c[14]:
                ls_common.append(elem)
                
        f.write('schema: '+str(x +1)+'\n')
        f.write(''.join(ls_common)+'\n')
f.close()  

In [39]:
# Merging files of Schema 4 and 6
for file in os.listdir('./Files/schema_v_6'):
    shutil.move(os.path.join('Files/schema_v_6', file),'./Files/schema_v_4')
    
#delete the schema v 6 folder  
os.rmdir('./Files/schema_v_6')

## 2.2 Data integration

In [268]:
from pyspark.sql import *
# Function to fix column names

def column_name_fix(df):
    '''Function removes whitespaces and converts column names to lowercase'''
    return [df.select('*', df[col].alias(col.strip().lower())).drop(col) for col in df.columns]
    


In [267]:
# Function to drop column
def column_drop(df,drop_list:list):
    '''Function receives a spark dataframe and list of columns to be dropped.
    It returns a dataframe less the columns specified to be dropped'''
    return df.drop(*drop_list)
     

In [265]:
# Function to rename column
def column_rename(df,col_list1,col_list2):
    '''Function renames the column name of a dataframe with a provided name'''
    return [df.select('*', df[col_list1[x]].alias(col_list2[x])).drop(col_list1[x]) for x in range(len(col_list1))]
     
            
df2 = column_name(spark.read.csv('yellow_tripdata_2014-01.csv', header=True))   
print(column_rename(df2,['pickup_datetime'],['tpep_pickup_datetime']))

[DataFrame[dropoff_datetime: string, passenger_count: string, trip_distance: string, pickup_longitude: string, pickup_latitude: string, rate_code: string, store_and_fwd_flag: string, dropoff_longitude: string, dropoff_latitude: string, payment_type: string, fare_amount: string, surcharge: string, mta_tax: string, tip_amount: string, tolls_amount: string, total_amount: string, tpep_pickup_datetime: string]]


In [283]:
# Add columns to a dataframe
def column_add(df,col_list):
    df = df.toPandas()
    for x in col_list:
        df[x] = "" ""
    return spark.createDataFrame(df)
df = column_add(df2,['x','y'])   


In [284]:
# Function to convert latitude and longitude to location ID

$\color{red}{\text{WARNING!!!.}}$ $\color{red}{\text{" This action will clear all folders and their contents created during project."}}$

In [3]:
# Clearing working environment
import shutil
shutil.rmtree('./Files')