# Data cleaning

## Auxiliary functions

In [1]:
# defining some key functions
#generator function
def file_content(main_folder):
    file_list = sorted(os.listdir(main_folder))
    for file_name in file_list:
        if os.path.isfile(os.path.join(main_folder, file_name)): # output only files and no subfolders
            yield file_name

In [2]:
# Add indicator column
def add_col(df):
    df['dirty']= 0
    return df
    

In [3]:
# Checking null values
import numpy as np
import pandas as pd
def null_value(df, col):
    n = df.shape[0]
    for x in range(n):  
        if df.loc[x,col].isna().any():
            df.loc[x, 'dirty']= 1  
        else:
            df.loc[x, 'dirty']= 0
                
                
    return df

In [19]:
# Extracting month from date field, calculating total amount, trip duration
def extract_month(df, field):
    df['trip_month'] = pd.to_datetime(df[field]).dt.month
    return df

def tot_amount(df,col_list):
    df['tot_amount'] = df[col_list].sum(axis=1)
    return df

def trip_duration(df,col_pickup,col_dropoff):
    df['trip_duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).astype('timedelta64[m]')
    return df

In [5]:
# Check location values
import numpy as np
import pandas as pd
def locationid(df,col):
    n = df.shape[0]
    for x in range(n):
        if df.loc[x, 'dirty'] != 1:
            if (1 <= df.loc[x,col[0]] <= 265) != True or (1 <= df.loc[x,col[1]] <= 265) != True:
                df.loc[x, 'dirty']= 1
    return df
                

In [6]:
# Repairing missing values for numerical dimensions
def repair_missing(df,col,value):
    df[col] = df[col].fillna(value) 
    return df
    

In [7]:
# Reparing negative numeric dimensions
def repair_absolute(df,col):
    df[col] = df[col].abs()
        
    return df

In [None]:
# Deleting rows with pickup time less than dropoff time
def check_time(df):
    index_name = df[df['trip_duration']<0].index
    df.drop(index_name,inplace=True)
    return df

In [8]:
# converting all numeric dimensions to float type
def type_conversion(df, col, dtype):
    if not int(df[col]):
        
        df[col] = df[col].astype(dtype)
    return df
    

In [9]:
# Seperating clean and bad records

def record_separation(df):
    df
    return  df.loc[df['dirty'] == 0], df.loc[df['dirty'] == 1]

## 1. Cleaning the FHV dataset

In [10]:
# Check wether folder exist if not create

import os
if os.path.exists('Files/integrated_files/FHV/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/FHV/clean')
    
if os.path.exists('Files/integrated_files/FHV/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/FHV/dirty')

### Analysis of valid values

In [31]:
## SCHEMA VERSION 1

import pandas as pd

col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']
col_order = ['pickup_datetime','dropoff_datetime','pulocationid','dolocationid','sr_flag','dispatching_base_num']
field = 'pickup_datetime'
folder_path = './Files/integrated_files/FHV/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)




In [32]:
## SCHEMA VERSION 2

import pandas as pd
col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']
col_order = ['pickup_datetime','dropoff_datetime','pulocationid','dolocationid','sr_flag','dispatching_base_num']
field = 'pickup_datetime'
folder_path = './Files/integrated_files/FHV/Schema_v_2'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)



In [33]:
## SCHEMA VERSION 3

import pandas as pd

col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']
col_abs = ['pulocationid','dolocationid']
col_order = ['pickup_datetime','dropoff_datetime','pulocationid','dolocationid','sr_flag','dispatching_base_num']
field = 'pickup_datetime'

folder_path = './Files/integrated_files/FHV/Schema_v_3'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df = repair_absolute(df,col_abs)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)



In [34]:
## SCHEMA VERSION 4

import pandas as pd

col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']
col_abs = ['pulocationid','dolocationid']
col_order = ['pickup_datetime','dropoff_datetime','pulocationid','dolocationid','sr_flag','dispatching_base_num']
field = 'pickup_datetime'

folder_path = './Files/integrated_files/FHV/Schema_v_4'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df = repair_absolute(df,col_abs)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)



In [36]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/FHV/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

667814


In [38]:
# stacking all files into one per dataset type

df_list = []
folder_path = 'Files/integrated_files/FHV/clean'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df_list.append(df)
df_fhv = pd.concat(df_list)
df_fhv.to_csv('df_fhv.csv',index=False)

### Validity rules

### Identifying dirty records, data repairing

## 2. Cleaning the FHVHV dataset

In [39]:
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files/FHVHV/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/FHVHV/clean')
    
if os.path.exists('Files/integrated_files/FHVHV/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/FHVHV/dirty')

### Analysis of valid values

In [40]:
import pandas as pd

col_check = ['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID']
col_abs = ['PULocationID','DOLocationID']
field = 'pickup_datetime'

folder_path = './Files/integrated_files/FHVHV/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_abs)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean = extract_month(df_clean,field)
            df_clean.to_csv(os.path.join('Files/integrated_files/FHVHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHVHV/dirty', file),index=False)

In [41]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/FHVHV/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

0


In [43]:
# stacking all files into one per dataset type

df_list = []
folder_path = 'Files/integrated_files/FHVHV/clean'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df_list.append(df)
df_fhvhv = pd.concat(df_list)
df_fhvhv.to_csv('df_fhvhv.csv',index=False)

### Validity rules

### Identifying dirty records, data repairing

## 3. Cleaning the GREEN dataset

In [17]:
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files/green/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/green/clean')
    
if os.path.exists('Files/integrated_files/green/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/green/dirty')

### Analysis of valid values

In [22]:
## Schema 1

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_loc = ['pulocationid','dolocationid']

col_order = ['vendorid','lpep_pickup_datetime','lpep_dropoff_datetime','store_and_fwd_flag','ratecodeid','passenger_count','trip_distance','fare_amount','extra','mta_tax','tip_amount','tolls_amount','ehail_fee','total_amount','payment_type','trip_type','pulocationid','dolocationid','improvement_surcharge','congestion_surcharge']

field = 'lpep_pickup_datetime'

col_list = ['fare_amount','extra','mta_tax','tolls_amount','ehail_fee','total_amount', 'improvement_surcharge','congestion_surcharge']

folder_path = './Files/integrated_files/green/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :                       
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [23]:
## Schema 2

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

col_order = ['vendorid','lpep_pickup_datetime','lpep_dropoff_datetime','store_and_fwd_flag','ratecodeid','passenger_count','trip_distance','fare_amount','extra','mta_tax','tip_amount','tolls_amount','ehail_fee','total_amount','payment_type','trip_type','pulocationid','dolocationid','improvement_surcharge','congestion_surcharge']

field = 'lpep_pickup_datetime'

col_list = ['fare_amount','extra','mta_tax','tolls_amount','ehail_fee','total_amount', 'improvement_surcharge','congestion_surcharge']

folder_path = './Files/integrated_files/green/Schema_v_2'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :            
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [24]:
## Schema 3

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

col_order = ['vendorid','lpep_pickup_datetime','lpep_dropoff_datetime','store_and_fwd_flag','ratecodeid','passenger_count','trip_distance','fare_amount','extra','mta_tax','tip_amount','tolls_amount','ehail_fee','total_amount','payment_type','trip_type','pulocationid','dolocationid','improvement_surcharge','congestion_surcharge']

field = 'lpep_pickup_datetime'

col_list = ['fare_amount','extra','mta_tax','tolls_amount','ehail_fee','total_amount', 'improvement_surcharge','congestion_surcharge']


folder_path = './Files/integrated_files/green/Schema_v_3'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :            
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [27]:
## Schema 4

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','PULocationID','DOLocationID','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['PULocationID','DOLocationID','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['PULocationID','DOLocationID']

col_order = ['vendorid','lpep_pickup_datetime','lpep_dropoff_datetime','store_and_fwd_flag','ratecodeid','passenger_count','trip_distance','fare_amount','extra','mta_tax','tip_amount','tolls_amount','ehail_fee','total_amount','payment_type','trip_type','pulocationid','dolocationid','improvement_surcharge','congestion_surcharge']

field = 'lpep_pickup_datetime'

col_list = ['fare_amount','extra','mta_tax','tolls_amount','ehail_fee','total_amount', 'improvement_surcharge','congestion_surcharge']

folder_path = './Files/integrated_files/green/Schema_v_4'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean = df_clean.rename(columns={'PULocationID':'pulocationid','DOLocationID':'dolocationid','VendorID':'vendorid', 'RatecodeID':'ratecodeid'})
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [28]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/green/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

526


In [45]:
# stacking all files into one per dataset type

df_list = []
folder_path = 'Files/integrated_files/green/clean'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df_list.append(df)
df_green = pd.concat(df_list)
df_green.to_csv('df_green.csv',index=False)

### Validity rules

### Identifying dirty records, data repairing

## 4. Cleaning the YELLOW dataset

In [383]:
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files/yellow/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/yellow/clean')
    
if os.path.exists('Files/integrated_files/yellow/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/yellow/dirty')

### Analysis of valid values

In [52]:
## Schema 1

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','trip_distance','fare_amt','mta_tax','pulocationid','dolocationid','tolls_amt','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amt','mta_tax','tolls_amt','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amt','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

col_list = ['fare_amount','mta_tax','tolls_amount','total_amount', 'improvement_surcharge','congestion_surcharge']

col_order = ['vendorid','tpep_pickup_datetime','tpep_dropoff_datetime','passenger_count','trip_distance','ratecodeid','store_and_fwd_flag','pulocationid','dolocationid','payment_type','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge']
field = 'tpep_pickup_datetime'


folder_path = './Files/integrated_files/yellow/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean = df_clean.rename(columns={'fare_amt':'fare_amount','total_amt':'total_amount','tolls_amt':'tolls_amount', 'tip_amt':'tip_amount'})
            df_clean['extra'] = 0
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        #if df_dirty.shape[0] > 0 :
         #   df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [58]:
## Schema 2

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

col_float = ['trip_distance','mta_tax','pulocationid','dolocationid','tolls_amount']

col_list = ['fare_amount','extra','mta_tax','tolls_amount','total_amount', 'improvement_surcharge','congestion_surcharge']

col_order = ['vendorid','tpep_pickup_datetime','tpep_dropoff_datetime','passenger_count','trip_distance','ratecodeid','store_and_fwd_flag','pulocationid','dolocationid','payment_type','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge']
field = 'tpep_pickup_datetime'


folder_path = './Files/integrated_files/yellow/Schema_v_2'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)       
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        #df.drop([152,2378,8406,11208,11351,11661,13045,13898,14575,15039,15842], inplace=True) # dropping Rows with string values
        #df = type_conversion(df, col_float, float)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean['extra'] = 0
            df_clean['store_and_fwd_flag'] = ''
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        #if df_dirty.shape[0] > 0 :
        #    df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [60]:
## Schema 3

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','extra','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','extra','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge','extra']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

col_list = ['fare_amount','extra','mta_tax','tolls_amount','total_amount', 'improvement_surcharge','congestion_surcharge']

col_order = ['vendorid','tpep_pickup_datetime','tpep_dropoff_datetime','passenger_count','trip_distance','ratecodeid','store_and_fwd_flag','pulocationid','dolocationid','payment_type','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge']
field = 'tpep_pickup_datetime'


folder_path = './Files/integrated_files/yellow/Schema_v_3'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        #if df_dirty.shape[0] > 0 :
        #    df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [62]:
## Schema 4

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','extra','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','extra','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge','extra']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

col_list = ['fare_amount','extra','mta_tax','tolls_amount','total_amount', 'improvement_surcharge','congestion_surcharge']

col_order = ['vendorid','tpep_pickup_datetime','tpep_dropoff_datetime','passenger_count','trip_distance','ratecodeid','store_and_fwd_flag','pulocationid','dolocationid','payment_type','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge']
field = 'tpep_pickup_datetime'


folder_path = './Files/integrated_files/yellow/Schema_v_4'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        #if df_dirty.shape[0] > 0 :
        #   df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [63]:
## Schema 5

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','extra','trip_distance','fare_amount','mta_tax','PULocationID','DOLocationID','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['PULocationID','DOLocationID','trip_distance','extra','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge','extra']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['PULocationID','DOLocationID']

col_list = ['fare_amount','extra','mta_tax','tolls_amount','total_amount', 'improvement_surcharge','congestion_surcharge']

col_order = ['vendorid','tpep_pickup_datetime','tpep_dropoff_datetime','passenger_count','trip_distance','ratecodeid','store_and_fwd_flag','pulocationid','dolocationid','payment_type','fare_amount','extra','mta_tax','tip_amount','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge']
field = 'tpep_pickup_datetime'


folder_path = './Files/integrated_files/yellow/Schema_v_5'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean = df_clean.rename(columns={'PULocationID':'pulocationid','DOLocationID':'dolocationid','VendorID':'vendorid', 'RatecodeID':'ratecodeid'})
            df_clean =df_clean[col_order]
            df_clean = extract_month(df_clean,field)
            df_clean = tot_amount(df_clean,col_list)
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        #if df_dirty.shape[0] > 0 :
        #    df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [64]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/yellow/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

341112


In [65]:
# stacking all files into one per dataset type

df_list = []
folder_path = 'Files/integrated_files/yellow/clean'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df_list.append(df)
df_yellow = pd.concat(df_list)
df_yellow.to_csv('df_yellow.csv',index=False)

### Validity rules

In [9]:
import shutil
shutil.make_archive('yellow', 'zip', "Files/integrated_files/yellow/clean")

'/home/epb103/yellow.zip'

### Identifying dirty records, data repairing