In [1]:
import pandas as pd
import json
import os
import re

# Notebook-wide display configuration: never truncate rows, columns or cell text.
for _option, _value in (
    ('display.max_rows', None),      # Show all rows
    ('display.max_columns', None),   # Show all columns
    ('display.width', 1000),         # Increase the width of the display
    ('display.max_colwidth', None),  # Don't truncate text in columns
):
    pd.set_option(_option, _value)

# Directory holding the raw source files, and the mediated-schema definition.
DATA_SRC = "../sources"
schema_file_path = "mediated_schema.json"
with open(schema_file_path, "r", encoding="utf-8") as f:
    schema_data = json.loads(f.read())

# Empty frames whose columns are the keys of the two schema sections.
companies_keys = [*schema_data["schema"]]
companies_df = pd.DataFrame(columns=companies_keys)

employees_keys = [*schema_data['employees']]
emp_df = pd.DataFrame(columns=employees_keys)

In [2]:
# Preview the companies frame; as built in the setup cell it carries the
# mediated-schema column headers and no rows.
display(companies_df)

Unnamed: 0,company_id,company_name,trade_name,industry,sector,categories,company_status,company_type,headquarters,address,postal_code,city,country,state,continent,foundation_year,registration_date,website,rank,market_cap_or_valuation,number_of_employees,ceo,founder,assets,profit_or_net_income,revenue,total_funding_raised,share_price,change_1_day,change_1_year,debts,phone,sic_code,emtak_code,nace_code,facebook,twitter,pinterest,instagram,investors,region,notes_or_description


In [3]:
# Preview the employees frame; as built in the setup cell it carries the
# employee-schema column headers and no rows.
display(emp_df)

Unnamed: 0,idAzienda,nameEmployee,code,role,startDate,participation,contribution


In [17]:
# Load the per-source column mappings (source column -> mediated-schema column).
schema_mapping_path = "schema_mapping.json"
with open(schema_mapping_path, "r", encoding="utf-8") as f:
    field_mapping = json.loads(f.read())

In [5]:
def normalize_strings(value):
    """Return a canonical lower-case form of ``value`` if it is a string.

    Literal backslash-n sequences (the two characters ``\\`` and ``n``) are
    removed, every run of whitespace is collapsed to a single space, the
    result is trimmed and lower-cased. Non-string values (numbers, None,
    NaN, ...) are returned unchanged.

    Args:
        value: Any cell value.

    Returns:
        The normalized string, or ``value`` itself when it is not a ``str``.
    """
    if not isinstance(value, str):
        return value
    # Drop the literal "\n" markers *before* collapsing whitespace; doing it
    # afterwards leaves a double space behind in inputs such as "a \\n b".
    value = value.replace('\\n', '')
    return re.sub(r'\s+', ' ', value).strip().lower()

In [44]:
def map_source_to_schema(source_df, field_mapping):
    """Project a source frame onto the mediated schema.

    Args:
        source_df: DataFrame read from one source file.
        field_mapping: dict mapping source column names to mediated-schema
            column names for that source.

    Returns:
        A new DataFrame containing one column per mapping target. Columns
        that exist in ``source_df`` are renamed and have their string values
        passed through ``normalize_strings``; targets whose source column is
        absent are added and filled with None. ``source_df`` is not modified.
    """
    # Select only the mapped columns the source really has. Selecting every
    # mapping key unconditionally raises KeyError on a missing column and made
    # the "fill with None" branch below unreachable.
    available = [name for name in field_mapping if name in source_df.columns]
    mapped_df = source_df[available].rename(columns=field_mapping)

    # dict.fromkeys de-duplicates the targets while keeping mapping order, so
    # the column order of the result is deterministic (a set is not).
    for column in dict.fromkeys(field_mapping.values()):
        if column not in mapped_df.columns:
            mapped_df[column] = None
        else:
            mapped_df[column] = mapped_df[column].apply(normalize_strings)
    return mapped_df

In [37]:
def integrate_data(companies_df, transformed_df):
    """Append ``transformed_df`` below ``companies_df``.

    Args:
        companies_df: The frame accumulated so far (may be empty).
        transformed_df: The newly mapped frame to append.

    Returns:
        A new DataFrame with the rows of ``companies_df`` followed by the rows
        of ``transformed_df`` and a fresh 0..n-1 index. Columns present in
        only one of the inputs are kept and left empty for the other's rows.
        Neither input frame is modified.
    """
    # Work on a shallow copy so that adding the missing columns does not leak
    # into the caller's frame.
    accumulated = companies_df.copy(deep=False)
    for column in transformed_df.columns:
        if column not in accumulated.columns:
            accumulated[column] = None  # Add missing columns with None

    # Concatenate along rows (axis=0), ignoring index to avoid conflicts
    return pd.concat([accumulated, transformed_df], ignore_index=True)

In [45]:
# Start from empty frames; columns are introduced by the mapped sources.
emp_df = pd.DataFrame()
companies_df = pd.DataFrame()

# os.listdir() returns entries in arbitrary order; sorting case-insensitively
# keeps the row order of the integrated frames reproducible between runs.
for source_file in sorted(os.listdir(DATA_SRC), key=str.lower):
    file_path = os.path.join(DATA_SRC, source_file)

    if file_path.endswith('.csv'):
        try:
            source_df = pd.read_csv(file_path, encoding='utf-8')
        except UnicodeDecodeError:
            # Retry with Latin-1 for files that are not valid UTF-8.
            source_df = pd.read_csv(file_path, encoding='ISO-8859-1')
    elif file_path.endswith('.json'):
        # Explicit encoding, consistent with how the schema files are read,
        # instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # A single top-level object is wrapped so it becomes one record.
        source_df = pd.json_normalize(data if isinstance(data, list) else [data])
    elif file_path.endswith('.xls'):
        source_df = pd.read_excel(file_path)
    elif file_path.endswith('.jsonl'):
        source_df = pd.read_json(file_path, lines=True)
    else:
        # Without this guard an unsupported entry would silently reuse the
        # previous iteration's source_df (or raise NameError on the first one).
        print(f"Skipping {source_file}: unsupported file type")
        continue

    # The mapping for a source is looked up by its file name without extension.
    datasource = os.path.splitext(source_file)[0]
    print(f"Processing {datasource} data, row count: {source_df.shape[0]}")
    if source_file.startswith(('wissel-partners', 'wissel-rappresentanti')):
        # These two sources are mapped with the employees mapping.
        temp = map_source_to_schema(source_df, field_mapping['employees_mapping'][datasource])
        emp_df = integrate_data(emp_df, temp)
    else:
        temp = map_source_to_schema(source_df, field_mapping['companies_mapping'][datasource])
        companies_df = integrate_data(companies_df, temp)


display(companies_df.head(10))
display(emp_df.head(10))

Processing AmbitionBox data, row count: 9899
result df row count: 9899
Processing campaignindia data, row count: 1000
result df row count: 10899
Processing companiesMarketCap_dataset data, row count: 5897
result df row count: 16796
Processing company_social_urls data, row count: 848
result df row count: 17644
Processing DDD-cbinsight.com data, row count: 1185
result df row count: 18829
Processing DDD-teamblind.com data, row count: 946
result df row count: 19775
Processing disfold.com data, row count: 16097
result df row count: 35872
Processing ft.com data, row count: 5000
result df row count: 40872
Processing hitHorizons_dataset data, row count: 667
result df row count: 41539
Processing MalPatSaj-forbes.com data, row count: 2000
result df row count: 43539
Processing MalPatSaj-wikipedia.org data, row count: 3111
result df row count: 46650
Processing output_globaldata data, row count: 5336
result df row count: 51986
Processing output_govuk_bigsize data, row count: 7000
result df row coun

Unnamed: 0,company_name,industry,headquarters,company_type,foundation_year,rank,region,company_id,market_cap_or_valuation,country,share_price,change_1_day,change_1_year,categories,facebook,twitter,instagram,pinterest,registration_date,city,investors,notes_or_description,total_funding_raised,website,number_of_employees,ceo,revenue,address,sic_code,profit_or_net_income,assets,sector,phone,company_status,debts,state,continent,founder,emtak_code,nace_code,trade_name,postal_code
0,tcs,it services & consulting,"mumbai, maharashtra, india",public,1968 (55 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,accenture,it services & consulting,"dublin, ireland",public,1989 (34 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,cognizant,it services & consulting,"teaneck. new jersey., united states (usa)",private,1994 (29 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,icici bank,banking,"mumbai, maharashtra, india",public,1994 (29 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,hdfc bank,banking,"mumbai, maharashtra, india",public,1994 (29 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,wipro,it services & consulting,"bangalore/bengaluru, karnataka, india",public,1945 (78 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,infosys,it services & consulting,"bengaluru/bangalore, karnataka, india",public,1981 (42 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,capgemini,it services & consulting,"paris, france",public,1967 (56 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,tech mahindra,it services & consulting,"pune, maharashtra, india",public,1986 (37 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,genpact,it services & consulting,"new york, new york, united states (usa)",public,1997 (26 yrs old),,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


Unnamed: 0,idAzienda,nameEmployee,code,participation,contribution,company_id,role,startDate
0,2,janis jegi,38807314918,70.00%,7000.00 eur,,,
1,2,kaire jegi,48812154914,30.00%,3000.00 eur,,,
2,3,siim tiigim\xc3\xa4gi,38807042731,100.00%,2500.00 eur,,,
3,4,dominik gronkiewicz,39207230150,100.00%,2500.00 eur,,,
4,5,arti haidak,38205186529,50.00%,1250.00 eur,,,
5,5,rainis luik,37508026514,50.00%,1250.00 eur,,,
6,6,kaido kaabel,36706040246,44.00%,1122.00 eur,,,
7,6,marek kaabel,37008100259,44.00%,1122.00 eur,,,
8,6,arvi kaabel,34111240226,12.00%,306.00 eur,,,
9,7,kaido kaabel,36706040246,50.00%,1278.00 eur,,,


In [43]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
companies_df.info()
emp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76808 entries, 0 to 76807
Data columns (total 42 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   company_name             75793 non-null  object 
 1   industry                 27953 non-null  object 
 2   headquarters             34961 non-null  object 
 3   company_type             17362 non-null  object 
 4   foundation_year          27258 non-null  object 
 5   rank                     17579 non-null  object 
 6   region                   1000 non-null   object 
 7   company_id               31383 non-null  object 
 8   market_cap_or_valuation  30093 non-null  object 
 9   country                  29913 non-null  object 
 10  share_price              6793 non-null   object 
 11  change_1_day             5897 non-null   object 
 12  change_1_year            5648 non-null   object 
 13  categories               15532 non-null  object 
 14  facebook              

None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2196 entries, 0 to 2195
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   idAzienda      1191 non-null   object
 1   nameEmployee   2196 non-null   object
 2   code           2196 non-null   object
 3   participation  1191 non-null   object
 4   contribution   1191 non-null   object
 5   company_id     1005 non-null   object
 6   role           1005 non-null   object
 7   startDate      1005 non-null   object
dtypes: object(8)
memory usage: 137.4+ KB


None

In [None]:
# Persist both integrated frames as CSV files, leaving out the positional index.
emp_df.to_csv(path_or_buf="employees_mediated.csv", index=False)
companies_df.to_csv(path_or_buf="companies_mediated.csv", index=False)