In [None]:
import glob
# The "glob" module returns the file or folder names that match a
# shell-style wildcard pattern.
#
# Wildcards:
#   *      matches any string of characters (including the empty string).
#   ?      matches a single character.
#   [...]  matches any one character listed inside the square brackets.
#   **     matches any number of folders and subfolders when used with
#          recursive=True.

# Recursive listing of everything under retail_db. Only the last expression
# of a cell is displayed, so this result is computed but not shown.
glob.glob('D:/DE projects/project/data/retail_db/**', recursive=True)

# The data files sit exactly two levels down: <table folder>/<data file>.
file_names = glob.glob('D:/DE projects/project/data/retail_db/*/*')
file_names

In [None]:
import pandas as pd
import json

# Sanity check: read every data file without a header row and report its
# (rows, columns) shape.
for file_name in file_names:
    shape = pd.read_csv(file_name, header=None).shape
    print(f'Shape of {file_name} is {shape}')

In [None]:
def get_column_names(schemas, table_name, sorting_key='column_position'):
    """Return the column names of ``table_name`` in schema order.

    Args:
        schemas: Mapping of table name to a list of column descriptors. Each
            descriptor is a mapping holding at least ``'column_name'`` and
            the ``sorting_key`` entry.
        table_name: Key of the table to look up in ``schemas``.
        sorting_key: Descriptor entry used to order the columns.

    Returns:
        A list with every descriptor's ``'column_name'``, ordered ascending
        by the descriptor's ``sorting_key`` value.

    Raises:
        KeyError: If ``table_name`` is not in ``schemas`` or a descriptor
            lacks ``sorting_key`` or ``'column_name'``.
    """
    def sort_value(column):
        return column[sorting_key]

    # Sort a copy so the caller's schema list is left untouched.
    ordered = list(schemas[table_name])
    ordered.sort(key=sort_value)
    return [column['column_name'] for column in ordered]

# Load the table schemas. The context manager closes the file as soon as the
# JSON has been parsed instead of leaving the handle open in the kernel.
with open('D:/DE projects/project/data/retail_db/schemas.json') as schemas_file:
    schemas = json.load(schemas_file)
orders_columns = get_column_names(schemas, 'orders')
orders_columns

In [None]:
import re # Regular expression module.
# Raw string (r'...') keeps the backslashes literal for the regex engine;
# "|" is regex alternation, so the pattern matches either "/" or "\".
# The pattern never changes, so it is built once outside the loop.
patterns = r'/|\\'
for file in file_names:
    # Show each path broken into its components.
    print(re.split(patterns, file))

In [None]:
# Read every data file again, this time naming the columns from the schema
# of the table the file belongs to.
separators = r'/|\\'
for file in file_names:
    print(f'Processing {file}')
    # The parent folder of a data file is its table name.
    table_name = re.split(separators, file)[-2]
    df = pd.read_csv(file, names=get_column_names(schemas, table_name))
    print(f'Shape of {table_name} is {df.shape}')

# Display the frame of the last file processed.
df

In [18]:
# Build the target JSON path of every source file: the same table folder and
# file name, placed under a different base directory.
tgt_base_dir = 'D:/DE projects/project/data/retail_db_json'

for file in file_names:
    parts = re.split(r'/|\\', file)
    # The last two components are the table folder and the file name.
    table_name, file_name = parts[-2], parts[-1]
    print(f'{tgt_base_dir}/{table_name}/{file_name}')

D:/DE projects/project/data/retail_db_json/categories/part-00000
D:/DE projects/project/data/retail_db_json/customers/part-00000
D:/DE projects/project/data/retail_db_json/departments/part-00000
D:/DE projects/project/data/retail_db_json/orders/part-00000
D:/DE projects/project/data/retail_db_json/order_items/part-00000
D:/DE projects/project/data/retail_db_json/products/part-00000


In [19]:
# Convert every source data file to line-delimited JSON under tgt_base_dir,
# keeping the <table folder>/<file name> layout of the source.
import os
tgt_base_dir = 'D:/DE projects/project/data/retail_db_json'
for file in file_names:
    print(f'Processing {file}')
    parts = re.split(r'/|\\', file)
    table_name, file_name = parts[-2], parts[-1]
    tgt_dir = f'{tgt_base_dir}/{table_name}'
    df = pd.read_csv(file, names=get_column_names(schemas, table_name))
    # Create the table folder on first use; harmless when it already exists.
    os.makedirs(tgt_dir, exist_ok=True)
    df.to_json(f'{tgt_dir}/{file_name}', orient='records', lines=True)

Processing D:/DE projects/project/data/retail_db\categories\part-00000
Processing D:/DE projects/project/data/retail_db\customers\part-00000
Processing D:/DE projects/project/data/retail_db\departments\part-00000
Processing D:/DE projects/project/data/retail_db\orders\part-00000
Processing D:/DE projects/project/data/retail_db\order_items\part-00000
Processing D:/DE projects/project/data/retail_db\products\part-00000


In [29]:
# Modularize file format converter for dataset.
import glob
import os
import json
import re
import pandas as pd

def get_column_names(schemas, table_name, sorting_key='column_position'):
    """Return the column names of ``table_name`` in schema order.

    Args:
        schemas: Mapping of table name to a list of column descriptors. Each
            descriptor is a mapping holding at least ``'column_name'`` and
            the ``sorting_key`` entry.
        table_name: Key of the table to look up in ``schemas``.
        sorting_key: Descriptor entry used to order the columns.

    Returns:
        A list with every descriptor's ``'column_name'``, ordered ascending
        by the descriptor's ``sorting_key`` value.

    Raises:
        KeyError: If ``table_name`` is not in ``schemas`` or a descriptor
            lacks ``sorting_key`` or ``'column_name'``.
    """
    def sort_value(column):
        return column[sorting_key]

    # Sort a copy so the caller's schema list is left untouched.
    ordered = list(schemas[table_name])
    ordered.sort(key=sort_value)
    return [column['column_name'] for column in ordered]

def read_csv(file, schemas):
    """Read one headerless data file into a DataFrame with named columns.

    The table name is the parent folder of ``file`` (its second-to-last path
    component); the column names for that table come from ``schemas`` via
    ``get_column_names``.

    Args:
        file: Path of the data file. Components may be separated by forward
            slashes, backslashes, or a mix of both.
        schemas: Mapping of table name to column descriptors, as accepted by
            ``get_column_names``.

    Returns:
        DataFrame whose columns are the table's schema column names.

    Raises:
        IndexError: If ``file`` has no parent folder component.
        KeyError: If the parent folder is not a table in ``schemas``.
    """
    # Split on both separators: the globbed paths mix '/' and '\\'.
    table_name = re.split(r'/|\\', file)[-2]
    columns = get_column_names(schemas, table_name)
    return pd.read_csv(file, names=columns)

def to_json(df, tgt_base_dir, table_name, file_name):
    """Write ``df`` as line-delimited JSON, one record per line.

    The output goes to ``{tgt_base_dir}/{table_name}/{file_name}``; the
    table folder is created first when it does not exist yet.

    Args:
        df: DataFrame to serialize.
        tgt_base_dir: Base folder of the converted dataset.
        table_name: Sub-folder of ``tgt_base_dir`` that receives the file.
        file_name: Name of the output file inside the table folder.
    """
    table_dir = f'{tgt_base_dir}/{table_name}'
    os.makedirs(table_dir, exist_ok=True)
    df.to_json(f'{table_dir}/{file_name}', orient='records', lines=True)

def file_converter(src_base_dir, tgt_base_dir, table_name):
    """Convert every data file of one table from CSV to line-delimited JSON.

    Reads ``{src_base_dir}/schemas.json`` for the column names, then loads
    each file matching ``{src_base_dir}/{table_name}/*`` with ``read_csv``
    and writes it with ``to_json`` under the same file name in
    ``{tgt_base_dir}/{table_name}``.

    Args:
        src_base_dir: Folder holding ``schemas.json`` and the table folders.
        tgt_base_dir: Folder under which the converted files are written.
        table_name: Name of the table folder to convert.
    """
    # Context manager so the schema file handle is closed right after parsing.
    with open(f'{src_base_dir}/schemas.json') as schemas_file:
        schemas = json.load(schemas_file)
    files = glob.glob(f'{src_base_dir}/{table_name}/*')

    # Loop-invariant: matches either path separator.
    patterns = r'/|\\'
    for file in files:
        df = read_csv(file, schemas)
        # Keep the source file name for the converted file.
        file_name = re.split(patterns, file)[-1]
        to_json(df, tgt_base_dir, table_name, file_name)

def process_files(table_names=None,
                  src_base_dir='D:/DE projects/project/data/retail_db',
                  tgt_base_dir='D:/DE projects/project/data/retail_db_json'):
    """Convert the data files of the requested tables from CSV to JSON.

    Args:
        table_names: Iterable of table names to convert. When ``None`` or
            empty, every table listed in ``schemas.json`` is converted.
        src_base_dir: Folder holding ``schemas.json`` and one sub-folder of
            data files per table. Defaults to the original dataset location.
        tgt_base_dir: Folder under which the converted files are written.
            Defaults to the original target location.
    """
    # Context manager so the schema file handle is closed right after parsing.
    with open(f'{src_base_dir}/schemas.json') as schemas_file:
        schemas = json.load(schemas_file)

    if not table_names:
        table_names = schemas.keys()

    for table_name in table_names:
        print(f'Processing {table_name}')
        file_converter(src_base_dir, tgt_base_dir, table_name)

# Run the conversion with no table names given, so process_files falls back
# to every table listed in schemas.json.
process_files()

Processing departments
Processing categories
Processing orders
Processing products
Processing customers
Processing order_items


**THE END!** (8 cells total)