# Loading and Cleaning data in chunks
To be able to work with all the files, they must first be cleaned and saved as Parquet files.
The raw CSV files are read and saved in chunks of 400,000 rows (the `chunksize` value set below).

In [6]:
import pandas as pd
from glob import glob
import dask.dataframe as dd
# Number of CSV rows per chunk; passed as `chunksize` to pd.read_csv in the cleaning cells below.
chunksize = 400_000

In [2]:
# File paths for the large raw CSV files.
#
# Each entry below is (year, table, export timestamp, number of parts). The
# paths are generated in this exact order because later cells pick a file by
# its position in `file_paths`.
_raw_exports = [
    ("2019", "cargodesc", "202001080000", 5),  # file_paths[0]  .. file_paths[4]
    ("2019", "container", "202001080000", 4),  # file_paths[5]  .. file_paths[8]
    ("2019", "header", "202001080000", 4),     # file_paths[9]  .. file_paths[12]
    ("2020", "cargodesc", "202009291500", 4),  # file_paths[13] .. file_paths[16]
    ("2020", "container", "202009291500", 3),  # file_paths[17] .. file_paths[19]
    ("2020", "header", "202009291500", 3),     # file_paths[20] .. file_paths[22]
]

file_paths = [
    f"./data/raw/{year}/{table}_files/ams__{table}_{year}__{stamp}_part_{part}.csv"
    for year, table, stamp, n_parts in _raw_exports
    for part in range(n_parts)
]

In [None]:
# Cargo desc files
# Stream one raw cargodesc CSV in `chunksize`-row chunks, clean each chunk and
# write it to its own parquet file under ./data/cleaned/2020/cargodesc_files/.
#
# This cell cleans cargodesc columns and writes to the 2020 cargodesc folder,
# so it has to read a 2020 cargodesc file. file_paths[-1] is a *header* file.
# file_paths[13] is 2020 cargodesc part_0; use 14, 15 or 16 for the other parts
# (chunk_<i>.parquet names restart at 0 for every part, so combine or move the
# chunks before processing the next part).
cargodesc_csv = file_paths[13]

for i, chunk in enumerate(pd.read_csv(cargodesc_csv, chunksize=chunksize)):
    chunk = chunk.drop_duplicates().drop(
        columns=['description_sequence_number', 'piece_count']
    )

    raw_text = chunk['description_text']
    cleaned_text = (
        raw_text.astype(str)
        .str.lower()
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)       # collapse runs of whitespace
        .str.replace(r'\.\s*\.', '.', regex=True)   # ". ." / ".." -> "."
        .str.replace(r'[",]+', '', regex=True)      # remove double quotes and commas
        .str.replace(r'\s*\.\s*$', '', regex=True)  # remove a trailing period
    )
    # astype(str) turns a missing description into the literal text "nan";
    # put the missing value back so it is not mistaken for a real description.
    chunk = chunk.assign(description_text=cleaned_text.where(raw_text.notna()))

    chunk.to_parquet(f"./data/cleaned/2020/cargodesc_files/chunk_{i}.parquet", index=False)


In [None]:
# Container files
# Stream one raw container CSV in `chunksize`-row chunks, clean each chunk and
# write it to its own parquet file under ./data/cleaned/2020/container_files/.
#
# This cell cleans container columns and writes to the 2020 container folder,
# so it has to read a 2020 container file. file_paths[-1] is a *header* file.
# file_paths[17] is 2020 container part_0; use 18 or 19 for the other parts
# (chunk_<i>.parquet names restart at 0 for every part, so combine or move the
# chunks before processing the next part).
container_csv = file_paths[17]

# Columns that are not kept in the cleaned output (each listed once).
unused_container_columns = [
    'container_type',
    'load_status',
    'seal_number_1',
    'seal_number_2',
    'equipment_description_code',
]

for i, chunk in enumerate(pd.read_csv(container_csv, chunksize=chunksize)):
    chunk = chunk.drop_duplicates().drop(columns=unused_container_columns)

    # Rows whose length, width and height are all 0 are removed.
    no_dimensions = (
        (chunk['container_length'] == 0)
        & (chunk['container_width'] == 0)
        & (chunk['container_height'] == 0)
    )
    # Non-inplace calls: dropna(inplace=True) on a filtered frame can raise
    # SettingWithCopyWarning. thresh=5 keeps rows with at least 5 non-missing values.
    chunk = chunk.loc[~no_dimensions].dropna(thresh=5)

    chunk.to_parquet(f"./data/cleaned/2020/container_files/chunk_{i}.parquet", index=False)

In [None]:
# Header files
# Reusable pattern: read the CSV in chunks, clean every chunk, then save each
# one as a parquet file (parquet is used to save space).

# Columns that are not kept in the cleaned header output.
unused_header_columns = [
    'carrier_code',
    'vessel_country_code',
    'foreign_port_of_lading_qualifier',
    'record_status_indicator',
    'foreign_port_of_destination_qualifier',
    'place_of_receipt',
    'conveyance_id_qualifier',
    'conveyance_id',
    'in_bond_entry_type',
    'secondary_notify_party_1',
    'secondary_notify_party_2',
    'secondary_notify_party_3',
    'secondary_notify_party_4',
    'secondary_notify_party_5',
    'secondary_notify_party_6',
    'secondary_notify_party_7',
    'secondary_notify_party_8',
    'secondary_notify_party_9',
    'secondary_notify_party_10',
]

# file_paths[22] is the 2020 header part_2 CSV.
header_reader = pd.read_csv(file_paths[22], chunksize=chunksize)
for chunk_no, frame in enumerate(header_reader):
    frame = (
        frame.drop_duplicates()
        .drop(columns=unused_header_columns)
        .dropna(thresh=6)  # keep rows with at least 6 non-missing values
    )
    # Rows with a weight of exactly 0 are removed.
    frame = frame[frame['weight'] != 0]
    frame.to_parquet(f"data/cleaned/2020/header_files/chunk_{chunk_no}.parquet", index=False)




In [54]:
# The header file has now been written as many chunk_<n>.parquet files.
# Combine them into one single parquet file for clarity.
#
# glob() returns paths in arbitrary order, so sort by the numeric chunk index
# to keep the rows in the order they were read from the CSV. A plain string
# sort is not enough: it would place chunk_10 before chunk_2.
files = sorted(
    glob("data/cleaned/2020/header_files/chunk_*.parquet"),
    key=lambda path: int(path.rsplit("_", 1)[1].partition(".")[0]),
)
df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)

df.to_parquet("data/cleaned/2020/header_files/ams_header_2020_part_2.parquet",index=False)

In [None]:
# Load every cleaned 2019 cargodesc parquet file into a list of DataFrames so
# they can be combined into one frame for merging with the header data.
dfs_cargodesc_list = [
    pd.read_parquet(parquet_path) for parquet_path in cleaned_fps_2019_cargodesc
]

In [None]:
# Stack all cargodesc frames into one DataFrame with a fresh 0..n-1 index.
df_cargodesc_combined = pd.concat(
    objs=dfs_cargodesc_list,
    ignore_index=True,
)

In [None]:
# Left-join the cargo description text onto every header frame by 'identifier'.
# NOTE(review): if an identifier has more than one description row, a left
# merge repeats the header row once per description - confirm this is intended.
description_lookup = df_cargodesc_combined[['identifier', 'description_text']]
merged = [
    header_df.merge(description_lookup, on='identifier', how='left')
    for header_df in dfs_header_list
]