In [24]:
import glob

In [25]:
src_file_names = glob.glob('data/retail_db/*/part*')
for each_file in src_file_names:
    print(each_file.split('/'))

['data', 'retail_db', 'customers', 'part-00000']
['data', 'retail_db', 'products', 'part-00000']
['data', 'retail_db', 'departments', 'part-00000']
['data', 'retail_db', 'order_items', 'part-00000']
['data', 'retail_db', 'orders', 'part-00000']
['data', 'retail_db', 'categories', 'part-00000']


In [26]:
base_dir = 'data/retail_db_json'

In [27]:
for each_file_name in src_file_names:
    file_list= each_file_name.split('/')
    file_part=file_list[-1]
    ds_name=file_list[-2]
    json_path= f'{base_dir}{ds_name}/{file_part}'
    print(json_path)

data/retail_db_jsoncustomers/part-00000
data/retail_db_jsonproducts/part-00000
data/retail_db_jsondepartments/part-00000
data/retail_db_jsonorder_items/part-00000
data/retail_db_jsonorders/part-00000
data/retail_db_jsoncategories/part-00000


In [28]:
import json

In [29]:
def get_column_names(schemas, ds_name, sorting_key='column_position'):
    column_details = schemas[ds_name]
    columns = sorted(column_details, key=lambda col: col[sorting_key])
    return [col['column_name'] for col in columns]

In [30]:
schemas = json.load(open('data/retail_db/schemas.json'))

In [31]:
import pandas as pd

In [32]:
for each_file_name in src_file_names:
    print(f'Processing {each_file_name}')
    file_list= each_file_name.split('/')
    ds_name=file_list[-2]
    columns=get_column_names(schemas,ds_name)
    df=pd.read_csv(each_file_name, names=columns)
    print(f'Shape of {each_file_name} is {df.shape}')

Processing data/retail_db/customers/part-00000
Shape of data/retail_db/customers/part-00000 is (12435, 9)
Processing data/retail_db/products/part-00000
Shape of data/retail_db/products/part-00000 is (1345, 6)
Processing data/retail_db/departments/part-00000
Shape of data/retail_db/departments/part-00000 is (6, 2)
Processing data/retail_db/order_items/part-00000
Shape of data/retail_db/order_items/part-00000 is (172198, 6)
Processing data/retail_db/orders/part-00000
Shape of data/retail_db/orders/part-00000 is (68883, 4)
Processing data/retail_db/categories/part-00000
Shape of data/retail_db/categories/part-00000 is (58, 3)


In [33]:
import os

In [34]:
tgt_base_dir = 'data/retail_db_json'
for file in src_file_names:
    print(f'Processing {file}')
    file_path_list = file.split('/')
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    json_file_path = f'{base_dir}/{ds_name}/{file_name}'
    columns = get_column_names(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    os.makedirs(f'{tgt_base_dir}/{ds_name}', exist_ok=True)
    df.to_json(
        json_file_path,
        orient='records',
        lines=True
    )

Processing data/retail_db/customers/part-00000
Processing data/retail_db/products/part-00000
Processing data/retail_db/departments/part-00000
Processing data/retail_db/order_items/part-00000
Processing data/retail_db/orders/part-00000
Processing data/retail_db/categories/part-00000
