# Data cleaning

In [1]:
from pyspark.sql import SparkSession

#if a spark session was already started, we stop it before starting a new one
#(there can be only one spark context per jupyter notebook)
try: 
    spark
    print("Spark application already started. Terminating existing application and starting new one")
    spark.stop()
except: 
    pass

# Create a new spark session (note, the * indicates to use all available CPU cores)
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("TLC") \
    .getOrCreate()
    
sc=spark.sparkContext

## Auxiliary functions

In [25]:
# Auxiliary code to help in the data cleaning process goes here
# defining some key functions
#generator function
def file_content(main_folder):
    file_list = os.listdir(main_folder)
    for file_name in file_list:
        yield file_name
        
from pyspark.sql import *
from pyspark.sql.functions import *
# Function to fix column names

def column_name_fix(df):
    '''Function converts column names to lowercase and remove whitespaces'''
    colnames = df.columns
    for x in range(len(colnames)):
        colnames[x] =  colnames[x].strip()
        colnames[x] =  colnames[x].lower()
        
    df = df.toDF(*colnames)
    return df

# Add columns to a dataframe
from pyspark.sql.functions import lit
def column_add(df,col_list):
    
    for x in col_list:
        df = df.withColumn(x, lit(""))
    return df




In [53]:
from pyspark.sql.types import *
def type_convert(df,col_type):
    col_name = df.columns
    for x in range(len(col_name)):
        df = df.withColumn(col_name[x], df[col_name[x]].cast(col_type[x]()))
    return df

## 1. Cleaning the FHV dataset

In [11]:
# Merging files 
import shutil
import os
# moving the files to 
for file in os.listdir('./Files/integrated_files/FHV/schema_v_1'):
    shutil.move(os.path.join('Files/integrated_files/FHV/schema_v_1', file), 'Files/integrated_files/FHV')
    
for file in os.listdir('./Files/integrated_files/FHV/schema_v_2'):
    shutil.move(os.path.join('Files/integrated_files/FHV/schema_v_2', file), 'Files/integrated_files/FHV')

for file in os.listdir('./Files/integrated_files/FHV/schema_v_3'):
    shutil.move(os.path.join('Files/integrated_files/FHV/schema_v_3', file), 'Files/integrated_files/FHV')

for file in os.listdir('./Files/integrated_files/FHV/schema_v_4'):
    shutil.move(os.path.join('Files/integrated_files/FHV/schema_v_4', file), 'Files/integrated_files/FHV')

In [17]:
# Clearing working environment

if len(os.listdir('./Files/integrated_files/FHV')) == 52:
    print("All files were moved successfully")
    shutil.rmtree('./Files/integrated_files/FHV/schema_v_1')
    shutil.rmtree('./Files/integrated_files/FHV/schema_v_2')
    shutil.rmtree('./Files/integrated_files/FHV/schema_v_3')
    shutil.rmtree('./Files/integrated_files/FHV/schema_v_4')
else:
    print("Files were not successfully moved. Check each schema folder")
    

All files were moved successfully


### Analysis of valid values

In [58]:
col_types = [StringType, TimestampType, TimestampType,IntegerType, IntegerType, BinaryType,StringType, BooleanType]

### Validity rules

In [137]:
def validation_rules(df):
    col_name = df.columns
    for x in range(len(df)):
        # dispatching_base_num rules
        if type(df[col_name[0]][x]) == 'str':
            pass
        else:
            df.dirty[x] = True
    return df
       
    
    

In [138]:
df = column_name_fix(spark.read.csv('./Files/integrated_files/FHV/fhv_tripdata_2015-01.csv', header=True))
col_add = ['dirty']
df = column_add(df,col_add)
df = type_convert(df,col_types)
df = df.toPandas()
df = validation_rules(df)
#df.schema
print(df.dirty)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dirty[x] = True


0       True
1       True
2       True
3       True
4       True
        ... 
5457    True
5458    True
5459    True
5460    True
5461    True
Name: dirty, Length: 5462, dtype: object


In [103]:
type('a')

str

### Identifying dirty records, data repairing

## 2. Cleaning the FHVHV dataset

### Analysis of valid values

### Validity rules

### Identifying dirty records, data repairing

## 3. Cleaning the GREEN dataset

### Analysis of valid values

### Validity rules

### Identifying dirty records, data repairing

## 4. Cleaning the YELLOW dataset

### Analysis of valid values

### Validity rules

### Identifying dirty records, data repairing