In [5]:
import json 
import os
import pandas as pd
from pprint import pprint
SRC = '../sources'

In [7]:
schemas = []
for filename in os.listdir(SRC):
    file_path = os.path.join(SRC, filename)
    if file_path.endswith('.csv'):
        try:
            df = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='ISO-8859-1')
    elif file_path.endswith('.json'):
        with open(file_path, 'r') as f:
            data = json.load(f)
        if isinstance(data, list):
            df = pd.json_normalize(data)
        else:
            df = pd.json_normalize([data])
    elif file_path.endswith('.xls'):
        df = pd.read_excel(file_path)
    elif file_path.endswith('.jsonl'):
        df = pd.read_json(file_path, lines=True)
    else: 
        print(f"Unsupported file type: {file_path}")
        continue    

    schema = {col: str(dtype) for col, dtype in zip(df.columns, df.dtypes)}
    schemas.append({"file": filename, "schema": schema})

pprint(schemas)


[{'file': 'wissel-activity-ariregister.rik.ee.csv',
  'schema': {'Area of Activity': 'object',
             'EMTAK Code': 'object',
             'ID azienda': 'int64',
             'NACE Code': 'float64',
             'Source': 'object'}},
 {'file': 'DDD-teamblind.com.csv',
  'schema': {'Unnamed: 0': 'int64',
             'founded': 'float64',
             'industry': 'object',
             'locations': 'object',
             'name': 'object',
             'size': 'object',
             'website': 'object'}},
 {'file': 'wissel-aziende-info-clipper.com.csv',
  'schema': {'Address Name': 'object',
             'City': 'object',
             'Country': 'object',
             'INDEX': 'int64',
             'Location type': 'object',
             'Name': 'object',
             'Postalcode': 'object',
             'State': 'object',
             'Trade Name': 'object',
             'URL': 'object',
             'Unnamed: 0': 'int64'}},
 {'file': 'output_govuk_bigsize.json',
  'schema': {'com

In [17]:
schemas = sorted(schemas, key= lambda x: x['file'])
with open(f'all_schemas.txt', 'w') as f:
    for elem in schemas:
        filename = elem['file'].split('.')[0]
        text = filename + "("
        for key, _ in elem['schema'].items():
            text += f"{key},"
            #filename(att1, att2, att3);
        text += ");"
        f.write(text + '\n')