In [14]:
import pandas as pd
import json
import os
import re

pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.width', 1000)  # Increase the width of the display
pd.set_option('display.max_colwidth', None)  # Don't truncate text in columns

DATA_SRC = "../sources"  
schema_file_path = "mediated_schema.json"  
with open(schema_file_path, "r", encoding="utf-8") as f:
    schema_data = json.load(f)

companies_keys = list(schema_data["schema"].keys())
companies_df = pd.DataFrame(columns=companies_keys)

employees_keys = list(schema_data['employees'].keys())
emp_df = pd.DataFrame(columns=employees_keys)

In [15]:
display(companies_df)

Unnamed: 0,company_id,company_name,trade_name,industry,sector,categories,company_status,company_type,headquarters,address,postal_code,city,country,state,continent,foundation_year,registration_date,website,rank,market_cap_or_valuation,number_of_employees,ceo,founder,assets,profit_or_net_income,revenue,total_funding_raised,share_price,change_1_day,change_1_year,debts,phone,sic_code,emtak_code,nace_code,facebook,twitter,pinterest,instagram,investors,region,notes_or_description


In [16]:
display(emp_df)

Unnamed: 0,company_id,nameEmployee,code,role,startDate,participation,contribution


In [17]:
schema_mapping_path = "schema_mapping.json"
with open(schema_mapping_path, "r", encoding="utf-8") as f:
    field_mapping = json.load(f)

In [18]:
def normalize_strings(value):
    if isinstance(value, str):
        value = re.sub(r'\s+', ' ', value.strip())
        value = value.replace('\\n', '').strip()
        value = value.lower()
    return value

In [19]:
def map_source_to_schema(source_df, field_mapping):
    source_columns = field_mapping.keys()
    mapped_df = source_df[list(source_columns)].copy()
    mapped_df.rename(columns=field_mapping, inplace=True)
    mediated_schema_columns = list(set(field_mapping.values()))

    for column in mediated_schema_columns:
        if column not in mapped_df.columns:
            mapped_df[column] = None
        else:
            mapped_df[column] = mapped_df[column].apply(normalize_strings)
    return mapped_df

In [20]:
def integrate_data(companies_df, transformed_df):
    for column in transformed_df.columns:
        if column not in companies_df.columns:
            companies_df[column] = None  # Add missing columns with None

    # Concatenate along rows (axis=0), ignoring index to avoid conflicts
    integrated_df = pd.concat([companies_df, transformed_df], ignore_index=True)

    return integrated_df

In [21]:
emp_df = pd.DataFrame()
companies_df = pd.DataFrame()

for source_file in os.listdir(DATA_SRC):
    file_path = os.path.join(DATA_SRC, source_file)
    
    if file_path.endswith('.csv'):
        try:
            source_df = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            source_df = pd.read_csv(file_path, encoding='ISO-8859-1')
    elif file_path.endswith('.json'):
        with open(file_path, 'r') as f:
            data = json.load(f)
        if isinstance(data, list):
            source_df = pd.json_normalize(data)
        else:
            source_df = pd.json_normalize([data])
    elif file_path.endswith('.xls'):
        source_df = pd.read_excel(file_path)
    elif file_path.endswith('.jsonl'):
        source_df = pd.read_json(file_path, lines=True)

    datasource = os.path.splitext(source_file)[0]
    if source_file.startswith('wissel-partners') or source_file.startswith('wissel-rappresentanti'):
        print(f"Processing {datasource} data, row count: {source_df.shape[0]}")
        temp = map_source_to_schema(source_df, field_mapping['employees_mapping'][datasource])
        emp_df = integrate_data(emp_df, temp)
    else:
        print(f"Processing {datasource} data, row count: {source_df.shape[0]}")
        temp = map_source_to_schema(source_df, field_mapping['companies_mapping'][datasource])
        companies_df = integrate_data(companies_df, temp)


display(companies_df.head(10))
display(emp_df.head(10))

Processing wissel-activity-ariregister.rik.ee data, row count: 1015
Processing DDD-teamblind.com data, row count: 946


  integrated_df = pd.concat([companies_df, transformed_df], ignore_index=True)
  integrated_df = pd.concat([companies_df, transformed_df], ignore_index=True)


Processing wissel-aziende-info-clipper.com data, row count: 4656
Processing output_govuk_bigsize data, row count: 7000
Processing MalPatSaj-wikipedia.org data, row count: 3111
Processing wissel-aziende-ariregister.rik.ee data, row count: 1469
Processing ft.com data, row count: 5000
Processing disfold.com data, row count: 16097
Processing campaignindia data, row count: 1000
Processing output_globaldata data, row count: 5336
Processing MalPatSaj-forbes.com data, row count: 2000
Processing company_social_urls data, row count: 848
Processing AmbitionBox data, row count: 9899
Processing valueToday_dataset data, row count: 10682
Processing wissel-rappresentanti-ariregister.rik.ee data, row count: 1005
Processing companiesMarketCap_dataset data, row count: 5897
Processing wissel-partners-ariregister.rik.ee data, row count: 1191
Processing DDD-cbinsight.com data, row count: 1185
Processing hitHorizons_dataset data, row count: 667


Unnamed: 0,company_id,industry,emtak_code,nace_code,notes_or_description,company_name,website,headquarters,number_of_employees,foundation_year,trade_name,address,postal_code,city,state,country,company_status,company_type,sector,registration_date,share_price,revenue,ceo,market_cap_or_valuation,rank,region,phone,profit_or_net_income,assets,facebook,twitter,instagram,pinterest,debts,continent,categories,founder,change_1_day,change_1_year,investors,total_funding_raised,sic_code
0,2,other activities auxiliary to financial services that are not classified elsewhere,66199 (emtak 2008),66.19,electronic submission (14.07.2021),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,3,specialised design activities,74101 (emtak 2008),74.1,annual report (07.02.2022),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,4,computer programming activities,62011 (emtak 2008),62.01,annual report (29.06.2022),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,5,construction of residential and non-residential buildings,41201 (emtak 2008),41.2,electronic submission (26.02.2022),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,6,"renting and operational leasing of other machinery, equipment and tangible assets not classified elsewhere",77399 (emtak 2008),77.39,annual report (18.04.2022),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,7,manufacture of plastic packing goods \xc2\xa0,22221 (emtak 2008),22.22,annual report (04.05.2022),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,8,"other retail sale not in stores, stalls or markets",47991 (emtak 2008),47.99,statistics estonia (01.11.2021),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,9,"activities of saunas, sunbeds and massage salons and other services related to physical well-being",96041 (emtak 2008),96.04,electronic submission (06.07.2020),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,11,business and other management consultancy activities,70221 (emtak 2008),70.22,electronic submission (21.09.2021),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,12,holiday home (chalets),55202 (emtak 2008),55.2,annual report (28.04.2022),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Unnamed: 0,company_id,nameEmployee,code,role,startDate,participation,contribution
0,2,janis jegi,38807314918,management board member,14.07.2021,,
1,3,siim tiigim\xc3\xa4gi,38807042731,management board member,15.04.2013,,
2,4,dominik gronkiewicz,39207230150,management board member,11.12.2019,,
3,5,rainis luik,37508026514,management board member,28.02.2022,,
4,6,kaido kaabel,36706040246,management board member,28.11.1996,,
5,7,marek kaabel,37008100259,management board member,22.06.2010,,
6,9,joosep tinn,38205090298,management board member,26.11.2021,,
7,11,german randla,38201180255,management board member,21.09.2021,,
8,12,sven mansberg,37104060289,management board member,02.09.2020,,
9,14,ines k\xc3\xa4\xc3\xa4rma,48303160318,management board member,12.09.2017,,


In [22]:
display(companies_df.info())
display(emp_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76808 entries, 0 to 76807
Data columns (total 42 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   company_id               31383 non-null  object 
 1   industry                 27953 non-null  object 
 2   emtak_code               1015 non-null   object 
 3   nace_code                1015 non-null   float64
 4   notes_or_description     16673 non-null  object 
 5   company_name             75793 non-null  object 
 6   website                  43903 non-null  object 
 7   headquarters             34960 non-null  object 
 8   number_of_employees      32726 non-null  object 
 9   foundation_year          27217 non-null  object 
 10  trade_name               677 non-null    object 
 11  address                  18627 non-null  object 
 12  postal_code              4656 non-null   object 
 13  city                     5825 non-null   object 
 14  state                 

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2196 entries, 0 to 2195
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   company_id     2196 non-null   object
 1   nameEmployee   2196 non-null   object
 2   code           2196 non-null   object
 3   role           1005 non-null   object
 4   startDate      1005 non-null   object
 5   participation  1191 non-null   object
 6   contribution   1191 non-null   object
dtypes: object(7)
memory usage: 120.2+ KB


None

In [24]:
emp_df = emp_df.sort_values(by=['nameEmployee'])
companies_df = companies_df.sort_values(by=['company_name'])
comp_columns_order = ["company_id","company_name","trade_name","industry","sector","categories", 
                 "company_status","company_type","headquarters", "address", "postal_code","city", 
                 "country", "state", "continent", "foundation_year", "registration_date", "website", 
                 "rank", "market_cap_or_valuation", "number_of_employees", "ceo", "founder", 
                 "assets", "profit_or_net_income", "revenue", "total_funding_raised", "share_price", 
                 "change_1_day", "change_1_year", "debts", "phone", "sic_code", "emtak_code", "nace_code", 
                 "facebook", "twitter", "pinterest", "instagram", "investors", "region", "notes_or_description"]
companies_df = companies_df[comp_columns_order]

emp_df.to_csv("impiegati.csv", index=False)
companies_df.to_csv("aziende.csv", index=False)