In [1]:
import glob

In [2]:
glob?

In [3]:
help(glob)

Help on module glob:

NAME
    glob - Filename globbing utility.

MODULE REFERENCE
    https://docs.python.org/3.10/library/glob.html
    
    The following documentation is automatically generated from the Python
    source files.  It may be incomplete, incorrect or include features that
    are considered implementation detail and may vary between Python
    implementations.  When in doubt, consult the module reference at the
    location listed above.

FUNCTIONS
    escape(pathname)
        Escape all special characters.
    
    glob(pathname, *, root_dir=None, dir_fd=None, recursive=False)
        Return a list of paths matching a pathname pattern.
        
        The pattern may contain simple shell-style wildcards a la
        fnmatch. However, unlike fnmatch, filenames starting with a
        dot are special cases that are not matched by '*' and '?'
        patterns.
        
        If recursive is true, the pattern '**' will match any files and
        zero or more directori

In [5]:
pwd

'F:\\Data ng\\File Format Convertor'

In [6]:
glob.glob('../data/retail_db/**',recursive = True)

['../data/retail_db\\',
 '../data/retail_db\\categories',
 '../data/retail_db\\categories\\part-00000',
 '../data/retail_db\\create_db.sql',
 '../data/retail_db\\create_db_tables_pg.sql',
 '../data/retail_db\\customers',
 '../data/retail_db\\customers\\part-00000',
 '../data/retail_db\\departments',
 '../data/retail_db\\departments\\part-00000',
 '../data/retail_db\\load_db_tables_pg.sql',
 '../data/retail_db\\orders',
 '../data/retail_db\\orders\\part-00000',
 '../data/retail_db\\order_items',
 '../data/retail_db\\order_items\\part-00000',
 '../data/retail_db\\products',
 '../data/retail_db\\products\\part-00000',
 '../data/retail_db\\README.md',
 '../data/retail_db\\schemas.json']

In [7]:
glob.glob('../data/retail_db/*/*')

['../data/retail_db\\categories\\part-00000',
 '../data/retail_db\\customers\\part-00000',
 '../data/retail_db\\departments\\part-00000',
 '../data/retail_db\\orders\\part-00000',
 '../data/retail_db\\order_items\\part-00000',
 '../data/retail_db\\products\\part-00000']

In [105]:
src_file_names = glob.glob('data/retail_db/*/*')

In [106]:
src_file_names

['data/retail_db\\categories\\part-00000',
 'data/retail_db\\customers\\part-00000',
 'data/retail_db\\departments\\part-00000',
 'data/retail_db\\orders\\part-00000',
 'data/retail_db\\order_items\\part-00000',
 'data/retail_db\\products\\part-00000']

In [10]:
import pandas as pd

In [14]:
for file_name in src_file_names:
    df = pd.read_csv(file_name,header=None)
    print(f"shape of {file_name} is {df.shape}")

shape of ../data/retail_db\categories\part-00000 is (58, 3)
shape of ../data/retail_db\customers\part-00000 is (12435, 9)
shape of ../data/retail_db\departments\part-00000 is (6, 2)
shape of ../data/retail_db\orders\part-00000 is (68883, 4)
shape of ../data/retail_db\order_items\part-00000 is (172198, 6)
shape of ../data/retail_db\products\part-00000 is (1345, 6)


In [15]:
import json

In [20]:
def get_column_names(schemas,ds_name,sorting_key ='column_position'):
    column_details = schemas[ds_name]
    columns = sorted(column_details,key = lambda col : col[sorting_key])
    return [col['column_name'] for col in columns]

In [21]:
schemas = json.load(open('../data/retail_db/schemas.json'))

In [22]:
order_columns = get_column_names(schemas,'orders')

In [23]:
order_columns

['order_id', 'order_date', 'order_customer_id', 'order_status']

In [24]:
orders = pd.read_csv('../data/retail_db/orders/part-00000',names = order_columns)

In [25]:
orders

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [26]:
type(orders)

pandas.core.frame.DataFrame

In [27]:
#lets see some example below

In [28]:
s = 's/a\\b\\c'


In [29]:
s.split("/")

['s', 'a\\b\\c']

In [30]:
s.split("\\")

['s/a', 'b', 'c']

In [31]:
import re

In [32]:
re.split('[/\\\]',s)

['s', 'a', 'b', 'c']

In [36]:
for file in src_file_names:
    print(re.split('[/\\\]',file))

['..', 'data', 'retail_db', 'categories', 'part-00000']
['..', 'data', 'retail_db', 'customers', 'part-00000']
['..', 'data', 'retail_db', 'departments', 'part-00000']
['..', 'data', 'retail_db', 'orders', 'part-00000']
['..', 'data', 'retail_db', 'order_items', 'part-00000']
['..', 'data', 'retail_db', 'products', 'part-00000']


In [37]:
file = src_file_names[0]

In [38]:
file_details = re.split('[/\\\]',file)

In [41]:
ds_name = file_details[-2]

In [42]:
columns = get_column_names(schemas,ds_name)

In [43]:
columns

['category_id', 'category_department_id', 'category_name']

In [44]:
df = pd.read_csv(file,names=columns)

In [45]:
df

Unnamed: 0,category_id,category_department_id,category_name
0,1,2,Football
1,2,2,Soccer
2,3,2,Baseball & Softball
3,4,2,Basketball
4,5,2,Lacrosse
5,6,2,Tennis & Racquet
6,7,2,Hockey
7,8,2,More Sports
8,9,3,Cardio Equipment
9,10,3,Strength Training


In [46]:
df.shape

(58, 3)

In [49]:
for file in src_file_names:
    print(f"processing file {file}")
    file_path_list = re.split('[/\\\]',file)
#     print(file_path_list)
    ds_name = file_path_list[-2]
    columns = get_column_names(schemas,ds_name)
    df = pd.read_csv(file,names = columns)
    print(f"shape of {ds_name} is {df.shape}")

processing file ../data/retail_db\categories\part-00000
shape of categories is (58, 3)
processing file ../data/retail_db\customers\part-00000
shape of customers is (12435, 9)
processing file ../data/retail_db\departments\part-00000
shape of departments is (6, 2)
processing file ../data/retail_db\orders\part-00000
shape of orders is (68883, 4)
processing file ../data/retail_db\order_items\part-00000
shape of order_items is (172198, 6)
processing file ../data/retail_db\products\part-00000
shape of products is (1345, 6)


In [50]:
df

Unnamed: 0,product_id,product_cateogry_id,product_name,product_description,product_price,product_image
0,1,2,Quest Q64 10 FT. x 10 FT. Slant Leg Instant U,,59.98,http://images.acmesports.sports/Quest+Q64+10+F...
1,2,2,Under Armour Men's Highlight MC Football Clea,,129.99,http://images.acmesports.sports/Under+Armour+M...
2,3,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+M...
3,4,2,Under Armour Men's Renegade D Mid Football Cl,,89.99,http://images.acmesports.sports/Under+Armour+M...
4,5,2,Riddell Youth Revolution Speed Custom Footbal,,199.99,http://images.acmesports.sports/Riddell+Youth+...
...,...,...,...,...,...,...
1340,1341,59,Nike Women's Cleveland Browns Johnny Football,,34.00,http://images.acmesports.sports/Nike+Women%27s...
1341,1342,59,Nike Men's St. Louis Rams Michael Sam #96 Nam,,32.00,http://images.acmesports.sports/Nike+Men%27s+S...
1342,1343,59,Nike Men's Home Game Jersey St. Louis Rams Mi,,100.00,http://images.acmesports.sports/Nike+Men%27s+H...
1343,1344,59,Nike Men's Home Game Jersey St. Louis Rams Aa,,100.00,http://images.acmesports.sports/Nike+Men%27s+H...


In [56]:
tgt_base_dir = 'data/retail_db_json'

In [57]:
file = src_file_names[0]

In [58]:
file_path_list = re.split('[/\\\]',file)

In [60]:
ds_name = file_path_list[-2]

In [61]:
file_name = file_path_list[-1]

In [62]:
f'{tgt_base_dir}/{ds_name}/{file_name}'

'../data/retail_db_json/categories/part-00000'

In [63]:
for file in src_file_names:
    file_path_list = re.split("[/\\\]",file)
    ds_name =file_path_list[-2]
    file_name = file_path_list[-1]
    json_file_path = f'{tgt_base_dir}/{ds_name}/{file_name}'
    print(json_file_path)

../data/retail_db_json/categories/part-00000
../data/retail_db_json/customers/part-00000
../data/retail_db_json/departments/part-00000
../data/retail_db_json/orders/part-00000
../data/retail_db_json/order_items/part-00000
../data/retail_db_json/products/part-00000


In [64]:
#to generate the json format along with files


In [65]:
columns = [
    'order_id','order_date','order_customer_id','order_status'
]

In [67]:
df = pd.read_csv('../data/retail_db/orders/part-00000',names = columns)

In [68]:
import os
os.makedirs('data/retail_db_json/orders',exist_ok=True)

In [69]:
 df

Unnamed: 0,order_id,order_date,order_customer_id,order_status
0,1,2013-07-25 00:00:00.0,11599,CLOSED
1,2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
2,3,2013-07-25 00:00:00.0,12111,COMPLETE
3,4,2013-07-25 00:00:00.0,8827,CLOSED
4,5,2013-07-25 00:00:00.0,11318,COMPLETE
...,...,...,...,...
68878,68879,2014-07-09 00:00:00.0,778,COMPLETE
68879,68880,2014-07-13 00:00:00.0,1117,COMPLETE
68880,68881,2014-07-19 00:00:00.0,2518,PENDING_PAYMENT
68881,68882,2014-07-22 00:00:00.0,10000,ON_HOLD


In [101]:
df.to_json(
'data/retail_db_json/orders/part-00000',
    orient='records',
    lines = True
)

OSError: Cannot save file into a non-existent directory: 'data\retail_db_json\orders'

In [102]:
base_dir

'../data/retail_db'

In [74]:
# Now we can convert all files data into json format

In [111]:
tgrt_base_dir = 'data/retail_db_json'
for file in src_file_names:
    print(f"Processing file {file}")
    file_path_list = re.split('[/\\\]',file)
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    json_file_path = f"{tgrt_base_dir}/{ds_name}/{file_name}"
    columns = get_column_names(schemas,ds_name)
#     print(f"file_path_list :{file_path_list},file_name:{file_name},json_file_path:{json_file_path}")
    df = pd.read_csv(file,names=columns)
    
    os.makedirs(f'{tgrt_base_dir}/{ds_name}/{file_name}',exist_ok=True)
    print(str(json_file_path))
    df.to_json(
    json_file_path,
        orient='records',
        lines=True
    )

Processing file data/retail_db\categories\part-00000
data/retail_db_json/categories/part-00000


PermissionError: [Errno 13] Permission denied: 'data/retail_db_json/categories/part-00000'

In [109]:
base_dir="data/retail_db"

In [113]:
tgt_base_dir = 'data/retail_db_json'
for file in src_file_names:
    print(f'Processing {file}')
    file_path_list = re.split('[/\\\]', file)
    ds_name = file_path_list[-2]
    file_name = file_path_list[-1]
    json_file_path = f'{base_dir}/{ds_name}/{file_name}'
    columns = get_column_names(schemas, ds_name)
    df = pd.read_csv(file, names=columns)
    os.makedirs(f'{tgt_base_dir}/{ds_name}', exist_ok=True)
    df.to_json(
        json_file_path,
        orient='records',
        lines=True
    )

Processing data/retail_db\categories\part-00000
Processing data/retail_db\customers\part-00000
Processing data/retail_db\departments\part-00000
Processing data/retail_db\orders\part-00000
Processing data/retail_db\order_items\part-00000
Processing data/retail_db\products\part-00000


ParserError: Error tokenizing data. C error: Expected 6 fields in line 685, saw 7


In [104]:
pwd

'F:\\Data ng\\File Format Convertor'

In [1]:
pwd

'F:\\Data ng\\File Format Convertor'

In [2]:
ls

 Volume in drive F is New Volume
 Volume Serial Number is EAF4-BB2C

 Directory of F:\Data ng\File Format Convertor

18-08-2024  15:53    <DIR>          .
31-08-2024  13:16    <DIR>          ..
15-08-2024  21:37             3,312 .gitignore
11-08-2024  21:20    <DIR>          .ipynb_checkpoints
18-08-2024  15:01    <DIR>          app-file-format-convertor-db-file-loader
11-08-2024  21:21    <DIR>          data
11-08-2024  21:21            80,193 FileFormatConvertor.ipynb
               2 File(s)         83,505 bytes
               5 Dir(s)  448,857,923,584 bytes free


In [4]:
cd app-file-format-convertor-db-file-loader


F:\Data ng\File Format Convertor\app-file-format-convertor-db-file-loader


In [6]:
%ls

 Volume in drive F is New Volume
 Volume Serial Number is EAF4-BB2C

 Directory of F:\Data ng\File Format Convertor\app-file-format-convertor-db-file-loader

18-08-2024  15:01    <DIR>          .
31-08-2024  16:08    <DIR>          ..
18-08-2024  15:21               132 .env
18-08-2024  14:24    <DIR>          data
18-08-2024  15:51            89,047 DatabaseLoader.ipynb
31-08-2024  15:49             3,062 databaseloader.py
15-08-2024  21:27             2,064 ffc-app.py
17-08-2024  18:25    <DIR>          ffc-venv
15-08-2024  21:38               189 hw.py
18-08-2024  15:24                50 requirements.txt
               6 File(s)         94,544 bytes
               4 Dir(s)  448,857,923,584 bytes free


In [8]:
from databaseloader import process_files

In [10]:
process_files()

src dir data/retail_db , connect uri postgresql://postgres:1234567@localhost:5432/local_retail_db
