In [1]:
#CLEAN

import pandas as pd
import os
import numpy as np

In [2]:
directory_path = "C:/Users/james/OneDrive/Documents/ADA/Wedge_Project/Wedge-Project/wedge-data/wedge-raw-trans-data/raw_data"

In [48]:
schema = {
    "datetime": "datetime64[ns]",
    "register_no": "object",
    "emp_no": "object",
    "trans_no": "object",
    "upc": "object",  # Equivalent to STRING in BigQuery
    "description": "object",
    "trans_type": "object",
    "trans_subtype": "object",
    "trans_status": "object",
    "department": "object",
    "quantity": "object",
    "Scale": "object",
    "cost": "object",
    "unitPrice": "object",
    "total": "object",
    "regPrice": "object",
    "altPrice": "object",
    "tax": "object",
    "taxexempt": "object",
    "foodstamp": "object",
    "wicable": "object",
    "discount": "object",
    "memDiscount": "object",
    "discountable": "object",
    "discounttype": "object",
    "voided": "object",
    "percentDiscount": "object",
    "ItemQtty": "object",
    "volDiscType": "object",
    "volume": "object",
    "VolSpecial": "object",
    "mixMatch": "object",
    "matched": "object",
    "memType": "object",
    "staff": "object",
    "numflag": "object",
    "itemstatus": "object",
    "tenderstatus": "object",
    "charflag": "object",
    "varflag": "object",
    "batchHeaderID": "object",
    "local": "object",
    "organic": "object",
    "display": "object",
    "receipt": "object",
    "card_no": "object",
    "store": "object",
    "branch": "object",
    "match_id": "object",
    "trans_id": "object"
}

In [52]:
def clean_data(file_path, delimiter=',', schema=None):
    r"""
    Cleans the provided CSV file based on specified transformations.
    Specifically, it converts "\N", "\\N", and "NULL" to the string "null".
    It also formats 'datetime' column to the specified format.
    
    :param file_path: Path to the CSV file.
    :param delimiter: Delimiter used in the CSV file. Defaults to comma.
    :param schema: Optional schema to apply to DataFrame columns.
    :return: Cleaned DataFrame.
    """

    # Read the CSV file into a DataFrame
    df = pd.read_csv(file_path, delimiter=delimiter, encoding='utf-8', low_memory=False)

    # Apply replacements for string columns based on schema
    if schema:
        for column, dtype in schema.items():
            if dtype == 'object':
                df[column].replace({r"\\N": "null", r"\\\\N":"null", "NULL": "null", r'^\s*$': "null", "": "null"}, regex=True, inplace=True)

    # Format datetime column if it exists
    if 'datetime' in df.columns:
        df['datetime'] = pd.to_datetime(df['datetime'], format='%m/%d/%Y %H:%M', errors='coerce')
        df['datetime'] = df['datetime'].dt.strftime('%Y-%m-%d %H:%M:%S')

    return df

In [55]:

def process_directory(directory_path):
    """
    Processes all CSV files in a given directory, cleans them and saves the cleaned files.
    
    :param directory_path: Path to the directory containing CSV files.
    """
    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory_path, filename)
            
            # Detecting the delimiter (either comma or semicolon)
            with open(file_path, 'r') as file:
                first_line = file.readline()
                delimiter = ';' if ';' in first_line else ','

            # Clean the data
            cleaned_df = clean_data(file_path, delimiter)

            # Saving the cleaned data
            cleaned_filename = filename.replace('.csv', '_clean.csv')
            cleaned_df.to_csv(os.path.join(directory_path, cleaned_filename), index=False)
            print(f"Cleaned and saved: {cleaned_filename}")


# Remember to specify the path to the directory containing your CSV files.
# The script will process each file, clean it according to the rules, and save the cleaned version.

In [56]:
process_directory(directory_path)

Cleaned and saved: transArchive_201510_clean.csv
Cleaned and saved: transArchive_201511_clean.csv
Cleaned and saved: transArchive_201512_clean.csv
Cleaned and saved: transArchive_201601_clean.csv
Cleaned and saved: transArchive_201601_clean_clean.csv


KeyboardInterrupt: 