# Final Cleaning

In this final cleaning pass, every CSV file is already much cleaner to work with, so we can remove the remaining bad values to clean our data further.
Once those final values are removed, our CSVs will be SQL-ready!

In [4]:
import pandas as pd

In [1]:
# Port of lading / unlading lookup tables, listed 2019 first and then 2020.
# Purely numeric entries in these tables are unneeded and will be removed.
port_lading_fps = [
    f"./data/cleaned_/{year}/lookup_table_files/{table}.csv"
    for year in (2019, 2020)
    for table in ("port_of_lading_lookup", "port_of_unlading")
]


In [2]:
# The 2019 header table is split across four shards: header_0.csv .. header_3.csv
header_2019_fps = [
    f"./data/cleaned_/2019/header_table_files/header_{shard}.csv"
    for shard in range(4)
]

In [3]:
# The 2020 header table is split across three shards: header_0.csv .. header_2.csv
header_2020_fps = [
    f"./data/cleaned_/2020/header_table_files/header_{shard}.csv"
    for shard in range(3)
]

In [4]:
# Date lookup tables (estimated arrival, then actual arrival) for 2019 and 2020.
date_fps = [
    f"./data/cleaned_/{year}/lookup_table_files/{table}.csv"
    for year in (2019, 2020)
    for table in ("estimated_arrival_lookup", "arrival_date_lookup")
]

In [5]:
# Load one lookup table (only the first path in port_lading_fps is read here)
# and collect the ids of the rows whose port name is purely numeric.
df = pd.read_csv(port_lading_fps[0])

# Values that cannot be parsed as numbers become NaN under errors='coerce',
# so notna() is True exactly for the numeric entries.
numeric_mask = pd.to_numeric(df['port_of_lading'], errors='coerce').notna()

# Ids of the numeric rows -- used afterwards to filter the header tables
removed_ids = df['port_lading_id'][numeric_mask].tolist()

# Lookup table with the numeric rows dropped
df_clean = df[~numeric_mask]

In [7]:
# Filter each 2019 header shard: drop the rows whose port_lading_id appears in
# removed_ids, then write the result to header_<idx>.csv in the working directory.
for idx, fp in enumerate(header_2019_fps):
    df = pd.read_csv(fp)
    keep_rows = ~df['port_lading_id'].isin(removed_ids)
    df_filtered = df[keep_rows]
    df_filtered.to_csv(f'header_{idx}.csv', index=False)



In [8]:
# Persist the cleaned port-of-lading lookup table to the working directory.
# NOTE(review): df_clean is rebound by a later cell (arrival-date cleaning), so
# this cell must run before that one -- confirm when re-running out of order.
df_clean.to_csv(path_or_buf='port_of_lading_lookup.csv', index=False)

In [12]:
# The arrival-date lookup contains rows dated in years we do not want;
# only rows whose date starts with "2020" are kept.
df = pd.read_csv(date_fps[3])

# True where the first four characters of arrival_date are "2020"
mask_2020 = df['arrival_date'].str.slice(stop=4) == "2020"

# arrival_id of every row that fails the year check -- these will be removed
ids_to_remove = df['arrival_id'][~mask_2020].tolist()

# Rows that pass the check (2020 only) form the updated lookup table
df_clean = df[mask_2020]

# Save the updated lookup table to the working directory
df_clean.to_csv('arrival_date_lookup.csv', index=False)

In [14]:
# Filter each 2020 header shard: drop the rows whose arrival_id appears in
# ids_to_remove, then write the result to header_<idx>.csv in the working directory.
# NOTE(review): the 2019 header loop writes to the same header_<idx>.csv names in
# the same directory, so these files replace the 2019 outputs unless those were
# moved away beforehand -- confirm the intended output layout.
for idx, fp in enumerate(header_2020_fps):
    df = pd.read_csv(fp)
    keep_rows = ~df['arrival_id'].isin(ids_to_remove)
    df_filtered = df[keep_rows]
    df_filtered.to_csv(f'header_{idx}.csv', index=False)