In [49]:
import pandas as pd
import json
import os
import re

# Display configuration: show complete frames without truncating rows,
# columns, or cell text.
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Increase the width of the display
pd.set_option('display.max_colwidth', None)  # Don't truncate text in columns

# Directory scanned for the raw source files (.csv / .json / .xls / .jsonl).
DATA_SRC = "../sources"

# Load the mediated schema. The encoding is explicit so the result does not
# depend on the platform's default locale (same convention as the
# field_mapping.json load).
with open('mediated_schema.json', 'r', encoding='utf-8') as f:
    data = json.load(f)


In [18]:
# Flatten the loaded mediated-schema JSON into a DataFrame and write it
# next to the notebook as CSV (without the index column).
df = pd.json_normalize(data)
df.to_csv(path_or_buf='mediated_schema.csv', index=False)

In [50]:
# Build an empty frame whose columns are the attribute names listed under
# the "mediated_schema" entry of the loaded JSON, then show it.
mediated_schema_keys = [*data['mediated_schema'].keys()]
df = pd.DataFrame(columns=mediated_schema_keys)
display(df)

Unnamed: 0,company_id,company_name,trade_name,industry,area_of_activity,nace_code,emtak_code,sic_code,legal_form,company_status,registration_date,foundation_year,headquarters_address,city,state_or_region,country,website,number_of_employees,revenue,market_cap,ceo_or_management,source


In [45]:
# Read the schema-field -> candidate-source-column mapping from disk.
field_mapping_path = "field_mapping.json"
with open(field_mapping_path, mode="r", encoding="utf-8") as f:
    field_mapping = json.loads(f.read())

# Render the mapping as a table for visual inspection.
field_df = pd.json_normalize(field_mapping)
display(field_df)

Unnamed: 0,company_id,company_name,brand_name,aliases,company_status,company_type,legal_form,foundation_year,registration_date,industry,business_description,classification_codes,address_street,address_city,address_state,address_postal_code,address_country,headquarters,website,social_links,phone_number,email,ceo,representatives,partners,employees,employee_range,revenue,profit,market_cap,ranking,notes,source
0,"[ID azienda, ID, Unnamed: 0, INDEX, company_number, rank, Code, RANK, id]","[Name, name, BRAND NAME, company_name]",[BRAND NAME],"[Trade Name, Alternate names]","[Status, company_status]","[company_type, Ownership, type, Legal form (when used to mean type), MAIN MARKET (in brand contexts, if it represents a business type), Stage (e.g. 'Series B', 'Private', 'Public', etc.)]","[Legal form, company_type (where explicitly referencing legal structure)]","[founded, foundation year, founded_date, company_creation_date, dateJoined (if referencing the start/founding date), Registration Date (when it is clearly the founding date in some records), est_of_ownership]","[Registration Date, company_creation_date]","[industry, Industry, Sector, Area of Activity, CATEGORY, nature_of_business, company_business (when a short single-industry descriptor), SIC description (if spelled out in words), business_description (if specifically the main industry label)]","[Area of Activity, business_description, Notes (if describing company’s main business), company_business (when multiple categories are listed and we store them in text form)]","[NACE Code, EMTAK Code, SIC code, NAICS code, other industry classification codes]","[Address Name, Address, registered_office_address (street portion), HQ address (street portion), URL (if it includes address in some textual form, rarely used)]","[City, headquarters (city portion), locations (city portion), registered_office_address (city portion)]","[State, headquarters (state portion), registered_office_address (state portion)]","[Postalcode, registered_office_address (postal code portion)]","[Country, headquarters (country portion), nation]","[Headquarters, headquarter, headquarters (full string, if present as a single field)]","[website, URL, company_website, link]","[Facebook, Twitter, Instagram, Pinterest]",[telephone],[email],"[ceo, chief_executive, Name (when role = 'CEO')]","[Role, Name (when role is something like 'Management board 
member'), Start Date, Code (person's code / ID), Sole trader]","[Participation, Contribution, Name (partner), Code (partner/company ID if relevant)]","[employees, number_of_employees, size (when it gives a numeric, e.g. '10,000+ employees' can go to employees or employee_range)]",[size (if textual range like '1 to 50 employees')],"[revenue, annual_revenue_in_usd, Sales]","[Profit, annual_net_income_in_usd]","[market_cap, Market Value]","[world_rank, rank, RANK, company_number (if used as rank in a brand listing), Unnamed: 0 (in some data sets used for rank-like indexing)]","[Notes, Source (if short textual note describing the record), comment fields]","[Source, source, MAIN MARKET (if used as data source info), any direct mention of 'Electronic submission ...', 'Annual report ...', etc.]"


In [33]:
def normalize_strings(value):
    """Clean up a single cell value if it is a string.

    For ``str`` input the function:

    1. removes every literal backslash-``n`` sequence (the two characters
       ``\\`` and ``n``, i.e. an escaped newline that survived as text);
    2. collapses every run of whitespace (including real newlines and tabs)
       into a single space;
    3. strips leading and trailing whitespace.

    Non-string values (``None``, numbers, ...) are returned unchanged.

    Args:
        value: Any object; only ``str`` instances are modified.

    Returns:
        The normalized string, or ``value`` itself when it is not a string.
    """
    if isinstance(value, str):
        # Drop the literal "\n" first: doing it after the whitespace collapse
        # would leave a double space behind in inputs such as "foo \n bar".
        value = value.replace('\\n', '')
        value = re.sub(r'\s+', ' ', value.strip())
    return value

In [47]:
def map_source_to_schema(source_df, field_mapping, template_df=None, verbose=True):
    """Project a source DataFrame onto the schema described by ``field_mapping``.

    For every schema field, the candidate source column names are tried in
    order (case-insensitively); the first one present in ``source_df``
    supplies the values for that field. Fields with no matching candidate
    are filled with ``None``.

    Args:
        source_df: DataFrame holding one raw source.
        field_mapping: Mapping of schema field name -> list of candidate
            source column names. A bare string is treated as a one-element
            list.
        template_df: Optional DataFrame to start from (it is copied, never
            mutated). When omitted, the result starts empty with
            ``source_df``'s index, so its columns are exactly the keys of
            ``field_mapping`` and it does not depend on any notebook-level
            variable.
        verbose: When True (default), print a message for each candidate
            name that is tried and not found in ``source_df``.

    Returns:
        A new DataFrame with one column per schema field (plus any columns
        carried over from ``template_df``).
    """
    if template_df is None:
        # Sharing the source index keeps row alignment explicit and makes
        # unmatched columns full-length.
        mapped_df = pd.DataFrame(index=source_df.index)
    else:
        mapped_df = template_df.copy()

    # Lower-cased label -> original label; str() guards against non-string
    # column labels (e.g. integers), which have no .lower().
    source_columns = {str(col).lower(): col for col in source_df.columns}

    for schema_field, possible_source_fields in field_mapping.items():
        if isinstance(possible_source_fields, str):
            # Avoid iterating a lone string character by character.
            possible_source_fields = [possible_source_fields]

        for source_field in possible_source_fields:
            normalized_field = str(source_field).lower()
            if normalized_field in source_columns:
                mapped_df[schema_field] = source_df[source_columns[normalized_field]]
                break
            if verbose:
                print(f"Field {normalized_field} not found in source data")
        else:
            # No candidate matched: keep the column, but empty.
            mapped_df[schema_field] = None

    return mapped_df

In [48]:
# Load each supported file under DATA_SRC into a DataFrame. Only the frame
# from the last supported file survives the loop; that one is previewed and
# mapped below.
# NOTE(review): previewing/mapping only the last file looks like a debugging
# shortcut -- confirm whether every source should be mapped instead.
source_df = None  # reset so a stale frame from an earlier run is never reused
for source_file in sorted(os.listdir(DATA_SRC)):  # sorted: os.listdir order is arbitrary
    file_path = os.path.join(DATA_SRC, source_file)

    if file_path.endswith('.csv'):
        try:
            source_df = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            # Retry with Latin-1 for files that are not valid UTF-8.
            source_df = pd.read_csv(file_path, encoding='ISO-8859-1')
    elif file_path.endswith('.json'):
        with open(file_path, 'r', encoding='utf-8') as f:
            # Dedicated name: assigning to `data` here would overwrite the
            # mediated schema that the notebook loaded into `data`.
            source_records = json.load(f)
        if isinstance(source_records, list):
            source_df = pd.json_normalize(source_records)
        else:
            # A single JSON object becomes a one-row frame.
            source_df = pd.json_normalize([source_records])
    elif file_path.endswith('.xls'):
        source_df = pd.read_excel(file_path)
    elif file_path.endswith('.jsonl'):
        source_df = pd.read_json(file_path, lines=True)
    # Files with any other extension are ignored.

if source_df is None:
    raise FileNotFoundError(
        f"No supported source file (.csv, .json, .xls, .jsonl) found in {DATA_SRC!r}"
    )

display(source_df.head(1))

temp = map_source_to_schema(source_df, field_mapping)
display(temp.head(1))

Unnamed: 0,id,name,address,nation,hhid,industry,sic_code,type,est_of_ownership
0,d60823f20d864692b517a8ad6c1418ed,Enel Spa,"VIALE REGINA MARGHERITA 137, ROMA, 00198, ROMA, ITALY",IT00934061003,H-IT0031426662,"Finance, Insurance, and Real Estate",Holding company (6719),Corporation,1962


Field id azienda not found in source data
Field brand name not found in source data
Field trade name not found in source data
Field alternate names not found in source data
Field status not found in source data
Field company_status not found in source data
Field company_type not found in source data
Field ownership not found in source data
Field legal form not found in source data
Field company_type (where explicitly referencing legal structure) not found in source data
Field founded not found in source data
Field foundation year not found in source data
Field founded_date not found in source data
Field company_creation_date not found in source data
Field datejoined (if referencing the start/founding date) not found in source data
Field registration date (when it is clearly the founding date in some records) not found in source data
Field registration date not found in source data
Field company_creation_date not found in source data
Field area of activity not found in source data
Field

Unnamed: 0,company_id,company_name,brand_name,aliases,company_status,company_type,legal_form,foundation_year,registration_date,industry,business_description,classification_codes,address_street,address_city,address_state,address_postal_code,address_country,headquarters,website,social_links,phone_number,email,ceo,representatives,partners,employees,employee_range,revenue,profit,market_cap,ranking,notes,source
0,d60823f20d864692b517a8ad6c1418ed,Enel Spa,,,,Corporation,,1962,,"Finance, Insurance, and Real Estate",,,"VIALE REGINA MARGHERITA 137, ROMA, 00198, ROMA, ITALY",,,,IT00934061003,,,,,,,,,,,,,,,,
