# Data cleaning

## Auxiliary functions

In [473]:
# defining some key functions
#generator function
def file_content(main_folder):
    file_list = sorted(os.listdir(main_folder))
    for file_name in file_list:
        if os.path.isfile(os.path.join(main_folder, file_name)): # output only files and no subfolders
            yield file_name

In [474]:
# Add indicator column
def add_col(df):
    df['dirty']= 0
    return df
    

In [475]:
# Checking null values
import numpy as np
import pandas as pd
def null_value(df, col):
    n = df.shape[0]
    for x in range(n):  
        if df.loc[x,col].isna().any():
            df.loc[x, 'dirty']= 1  
        else:
            df.loc[x, 'dirty']= 0
                
                
    return df

In [476]:
# Check location values
import numpy as np
import pandas as pd
def locationid(df,col):
    n = df.shape[0]
    for x in range(n):
        if df.loc[x, 'dirty'] != 1:
            if (1 <= df.loc[x,col[0]] <= 265) != True or (1 <= df.loc[x,col[1]] <= 265) != True:
                df.loc[x, 'dirty']= 1
    return df
                

In [477]:
# Repairing missing values for numerical dimensions
def repair_missing(df,col,value):
    df[col] = df[col].fillna(value) 
    return df
    

In [478]:
# Reparing negative numeric dimensions
def repair_absolute(df,col):
    df[col] = df[col].abs()
        
    return df

In [480]:
# converting all numeric dimensions to float type
def type_conversion(df, col, dtype):
    if not int(df[col]):
        
        df[col] = df[col].astype(dtype)
    return df
    

In [481]:
# Seperating clean and bad records

def record_separation(df):
    df
    return  df.loc[df['dirty'] == 0], df.loc[df['dirty'] == 1]

## 1. Cleaning the FHV dataset

In [318]:
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files/FHV/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/FHV/clean')
    
if os.path.exists('Files/integrated_files/FHV/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/FHV/dirty')

### Analysis of valid values

In [360]:
import pandas as pd

col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']

folder_path = './Files/integrated_files/FHV/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)




In [361]:
import pandas as pd
col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']

folder_path = './Files/integrated_files/FHV/Schema_v_2'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)



In [327]:
import pandas as pd

col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']
col_abs = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/FHV/Schema_v_3'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df = repair_absolute(df,col_abs)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)



In [359]:
import pandas as pd

col_check = ['pickup_datetime', 'dropoff_datetime', 'pulocationid','dolocationid']
col_abs = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/FHV/Schema_v_4'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df = repair_absolute(df,col_abs)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/FHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHV/dirty', file),index=False)



In [486]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/FHV/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

667814


### Validity rules

### Identifying dirty records, data repairing

## 2. Cleaning the FHVHV dataset

In [322]:
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files/FHVHV/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/FHVHV/clean')
    
if os.path.exists('Files/integrated_files/FHVHV/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/FHVHV/dirty')

### Analysis of valid values

In [350]:
import pandas as pd

col_check = ['pickup_datetime','dropoff_datetime','PULocationID','DOLocationID']
col_abs = ['PULocationID','DOLocationID']

folder_path = './Files/integrated_files/FHVHV/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = null_value(df,col_check)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_abs)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/FHVHV/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/FHVHV/dirty', file),index=False)

In [487]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/FHVHV/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

0


### Validity rules

### Identifying dirty records, data repairing

## 3. Cleaning the GREEN dataset

In [372]:
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files/green/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/green/clean')
    
if os.path.exists('Files/integrated_files/green/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/green/dirty')

### Analysis of valid values

In [376]:
## Schema 1

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_loc = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/green/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [378]:
## Schema 2

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/green/Schema_v_2'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [380]:
## Schema 3

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/green/Schema_v_3'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [382]:
## Schema 4

col_check = ['lpep_pickup_datetime','lpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','PULocationID','DOLocationID','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['PULocationID','DOLocationID','trip_distance','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['PULocationID','DOLocationID']

folder_path = './Files/integrated_files/green/Schema_v_4'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/green/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/green/dirty', file),index=False)

In [488]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/green/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

526


### Validity rules

### Identifying dirty records, data repairing

## 4. Cleaning the YELLOW dataset

In [383]:
# Check wether folder exist if not create
if os.path.exists('Files/integrated_files/yellow/clean'):
    pass
else:
    os.mkdir('Files/integrated_files/yellow/clean')
    
if os.path.exists('Files/integrated_files/yellow/dirty'):
    pass
else:
    os.mkdir('Files/integrated_files/yellow/dirty')

### Analysis of valid values

In [393]:
## Schema 1

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','trip_distance','fare_amt','mta_tax','pulocationid','dolocationid','tolls_amt','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','fare_amt','mta_tax','tolls_amt','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amt','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/yellow/Schema_v_1'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mtag,0.5)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [471]:
## Schema 2

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

col_float = ['trip_distance','mta_tax','pulocationid','dolocationid','tolls_amount']

folder_path = './Files/integrated_files/yellow/Schema_v_2'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        #df.drop([152,2378,8406,11208,11351,11661,13045,13898,14575,15039,15842], inplace=True) # dropping Rows with string values
        df = type_conversion(df, col_float, float)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [482]:
## Schema 3

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','extra','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','extra','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge','extra']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/yellow/Schema_v_3'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [483]:
## Schema 4

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','extra','trip_distance','fare_amount','mta_tax','pulocationid','dolocationid','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['pulocationid','dolocationid','trip_distance','extra','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge','extra']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['pulocationid','dolocationid']

folder_path = './Files/integrated_files/yellow/Schema_v_4'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [484]:
## Schema 5

col_check = ['tpep_pickup_datetime','tpep_dropoff_datetime','extra','trip_distance','fare_amount','mta_tax','PULocationID','DOLocationID','tolls_amount','improvement_surcharge','congestion_surcharge']

col_abs = ['PULocationID','DOLocationID','trip_distance','extra','fare_amount','mta_tax','tolls_amount','improvement_surcharge','congestion_surcharge']

col_missing = ['tolls_amount','congestion_surcharge','extra']

col_missing_mta = ['mta_tax']

col_missing_improvement = ['improvement_surcharge']

col_loc = ['PULocationID','DOLocationID']

folder_path = './Files/integrated_files/yellow/Schema_v_5'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        df = add_col(df)
        df = repair_missing(df,col_missing,0)
        df = repair_missing(df,col_missing_mta,0.5)
        df = repair_missing(df,col_missing_improvement,0.3)
        df = repair_absolute(df,col_abs)
        df = locationid(df, col_loc)        
        df = null_value(df,col_check)
        df_clean, df_dirty = record_separation(df)
        if df_clean.shape[0] > 0 :
            df_clean.to_csv(os.path.join('Files/integrated_files/yellow/clean', file),index=False)
        if df_dirty.shape[0] > 0 :
            df_dirty.to_csv(os.path.join('Files/integrated_files/yellow/dirty', file),index=False)

In [489]:
#Statistics for bad records
x = 0
folder_path = 'Files/integrated_files/yellow/dirty'
for file in file_content(folder_path):
        file_path = os.path.join(folder_path, file)
        df =pd.read_csv(file_path)
        x += df.shape[0]
print(x)  

341112


### Validity rules

### Identifying dirty records, data repairing