In [4]:
pip install xlrd>=2.0.1


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import json
import os
import numpy as np

def load_data(file_path):
    """
    Load one company source file and project it onto the unified company schema.

    Supported extensions are .csv, .xls/.xlsx, .json (a list of records or a single
    object) and .jsonl. Text formats are tried with each encoding in turn; Excel
    workbooks are read once because the text encoding does not apply to them.

    Parameters
    ----------
    file_path : str
        Path of the file to load. The extension check is case-sensitive.

    Returns
    -------
    pandas.DataFrame
        One row per input record, with these columns in this order:
        'code azienda', 'name', 'category', 'sector', 'address', 'city', 'country',
        'yearFoundation', 'founders', 'link', 'rank', 'market_cap', 'employees',
        'ceo', 'assets', 'profit', 'sales', 'revenue', 'telephone', 'partita iva',
        'sic_code', 'facebook'.
        A column with no matching source column is filled with NaN. If the file
        cannot be read or mapped, an empty DataFrame with the same columns is
        returned; errors are printed, never raised.
    """
    # Output column -> source column names to look for, in priority order
    # (the first name present in the file wins).
    column_candidates = {
        "code azienda": ["ID azienda", "ID", "id", "company_number"],
        "name": ["Name", "name", "Company", "BRAND NAME"],
        "category": ["category", "Area of Activity", "nature_of_business", "company_business",
                     "industry", "CATEGORY", "Industry"],
        "sector": ["Sector"],
        "address": ["address", "Address Name", "Address"],
        "city": ["city", "City", "headquarters_region_city", "headquarters", "location", "Headquarter"],
        "country": ["country", "Country", "headquarters_country", "headquarters", "Headquarter",
                    "Headquarters"],
        "yearFoundation": ["company_creation_date", "founded", "Foundation Year", "Founded"],
        "founders": ["founders"],
        "link": ["URL", "company_website", "website", "link"],
        "rank": ["world_rank", "rank"],
        "market_cap": ["market_cap", "Market Value"],
        "employees": ["number_of_employees", "employees", "size"],
        "ceo": ["ceo"],
        "assets": ["total_assets_usd", "Assets"],
        "profit": ["Profit"],
        "sales": ["Sales"],
        "revenue": ["annual_revenue_in_usd", "revenue"],
        "telephone": ["telephone"],
        "partita iva": ["national"],
        "sic_code": ["sic_code"],
        "facebook": ["Facebook"],
    }
    # Single source of truth for the schema, so the failure path can never drift
    # from the success path.
    output_columns = list(column_candidates)

    # 'latin1' is an alias of 'ISO-8859-1', so listing both only repeats the same
    # attempt. ISO-8859-1 maps every byte value, which makes it the last resort.
    encodings = ['utf-8', 'ISO-8859-1']

    # ---- 1. Read the raw file -------------------------------------------------
    df = None
    if file_path.endswith(('.xls', '.xlsx')):
        try:
            df = pd.read_excel(file_path)
        except Exception as e:
            print(f"Errore con il file {file_path} (Excel): {e}")
    elif file_path.endswith(('.csv', '.json', '.jsonl')):
        for encoding in encodings:
            try:
                if file_path.endswith('.csv'):
                    df = pd.read_csv(file_path, encoding=encoding)
                elif file_path.endswith('.json'):
                    with open(file_path, 'r', encoding=encoding) as f:
                        data = json.load(f)
                    if isinstance(data, list):
                        df = pd.DataFrame(data)
                    elif isinstance(data, dict):
                        df = pd.DataFrame([data])
                    else:
                        raise ValueError(f"Formato del JSON non supportato in {file_path}")
                else:
                    # Pass the encoding so that the retry is meaningful for JSONL too.
                    df = pd.read_json(file_path, lines=True, encoding=encoding)
                break
            except Exception as e:
                # Best effort: report and fall through to the next encoding.
                df = None
                print(f"Errore con il file {file_path} usando encoding {encoding}: {e}")
    else:
        print(f"Errore con il file {file_path}: Formato non supportato")

    if df is None:
        print(f"Impossibile caricare il file {file_path} con i codici di encoding disponibili.")
        return pd.DataFrame(columns=output_columns)

    # ---- 2. Map the source columns onto the unified schema --------------------
    try:
        extracted = {}
        for out_col, candidates in column_candidates.items():
            source_col = next((c for c in candidates if c in df.columns), None)
            if source_col is None:
                extracted[out_col] = np.nan
                continue

            values = df[source_col]
            if out_col in ("code azienda", "name"):
                # Identifiers and names are compared as text later on.
                values = values.astype(str).str.strip()
            elif out_col == "address" and source_col == "address":
                # Keep only the part before the first comma.
                values = values.str.split(',').str[0]
            elif out_col == "city" and source_col in ("Headquarter", "headquarters"):
                # Second comma-separated field of the combined headquarters value.
                values = values.str.split(',').str[1]
            elif out_col == "country" and source_col in ("headquarters", "Headquarter"):
                # Third comma-separated field of the combined headquarters value.
                # NOTE(review): "Headquarters" is taken as-is (not split) — confirm
                # that this source really stores only the country.
                values = values.str.split(',').str[2]
            extracted[out_col] = values

        # The explicit index lets all-NaN (scalar) columns broadcast to every row.
        return pd.DataFrame(extracted, index=range(len(df)))
    except Exception as e:
        # A mapping failure does not depend on the encoding, so it is reported once.
        print(f"Errore con il file {file_path} durante la mappatura delle colonne: {e}")
        return pd.DataFrame(columns=output_columns)

# Folder containing the source files
source_folder = "sources/"

# Base names of the Wissel files, which are excluded from this output
WISSEL_FILENAMES = {
    "wissel-rappresentanti-ariregister.rik.ee.csv",
    "wissel-partners-ariregister.rik.ee.csv",
}

# List the supported files. os.listdir returns entries in arbitrary order, so the
# names are sorted to make the row order of the output reproducible across runs.
file_list = [os.path.join(source_folder, file) for file in sorted(os.listdir(source_folder))
             if file.endswith(('.csv', '.xls', '.xlsx', '.json', '.jsonl'))]

# Split the files into the two groups
wissel_files = [file for file in file_list if os.path.basename(file) in WISSEL_FILENAMES]
other_files = [file for file in file_list if file not in wissel_files]

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual problem instead.
if not other_files:
    raise FileNotFoundError(f"Nessun file sorgente trovato in {source_folder}")

# Build the unified DataFrame
other_df = pd.concat([load_data(file) for file in other_files], ignore_index=True)

# Save the result
other_df.to_csv("output_other.csv", index=False, encoding='utf-8')

print("Schema unificato salvato in  'output_other.csv'")


Errore con il file sources/AmbitionBox.csv usando encoding utf-8: 'utf-8' codec can't decode byte 0xe9 in position 3: invalid continuation byte
Schema unificato salvato in  'output_other.csv'


In [3]:
import pandas as pd
import numpy as np
import os
import json

def load_data(file_path):
    """
    Load one file and project it onto the representative/partner schema.

    This cell uses it for the two Wissel files. Note that it reuses the name
    `load_data`, so in a shared kernel it replaces any earlier definition.

    Supported extensions are .csv, .xls/.xlsx, .json (a list of records or a single
    object) and .jsonl. Text formats are tried with each encoding in turn; Excel
    workbooks are read once because the text encoding does not apply to them.

    Parameters
    ----------
    file_path : str
        Path of the file to load. The extension check is case-sensitive.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: 'idAzienda', 'nameEmployee', 'code', 'role',
        'startDate', 'participation', 'contribution', taken from the source
        columns 'ID azienda', 'Name', 'Code', 'Role', 'Start Date',
        'Participation' and 'Contribution'. A missing source column yields NaN.
        If the file cannot be read, or contains none of the expected columns, an
        empty DataFrame with the same columns is returned; errors are printed,
        never raised.
    """
    # Output column -> source column.
    column_map = {
        "idAzienda": "ID azienda",
        "nameEmployee": "Name",
        "code": "Code",
        "role": "Role",
        "startDate": "Start Date",
        "participation": "Participation",
        "contribution": "Contribution",
    }
    # Single source of truth for the schema, so the failure path can never drift
    # from the success path.
    output_columns = list(column_map)

    # 'latin1' is an alias of 'ISO-8859-1', so listing both only repeats the same
    # attempt. ISO-8859-1 maps every byte value, which makes it the last resort.
    encodings = ['utf-8', 'ISO-8859-1']

    # ---- 1. Read the raw file -------------------------------------------------
    df = None
    if file_path.endswith(('.xls', '.xlsx')):
        try:
            df = pd.read_excel(file_path)
        except Exception as e:
            print(f"Errore con il file {file_path} (Excel): {e}")
    elif file_path.endswith(('.csv', '.json', '.jsonl')):
        for encoding in encodings:
            try:
                if file_path.endswith('.csv'):
                    df = pd.read_csv(file_path, encoding=encoding)
                elif file_path.endswith('.json'):
                    with open(file_path, 'r', encoding=encoding) as f:
                        data = json.load(f)
                    if isinstance(data, list):
                        df = pd.DataFrame(data)
                    elif isinstance(data, dict):
                        df = pd.DataFrame([data])
                    else:
                        raise ValueError(f"Formato del JSON non supportato in {file_path}")
                else:
                    # Pass the encoding so that the retry is meaningful for JSONL too.
                    df = pd.read_json(file_path, lines=True, encoding=encoding)
                break
            except Exception as e:
                # Best effort: report and fall through to the next encoding.
                df = None
                print(f"Errore con il file {file_path} usando encoding {encoding}: {e}")
    else:
        print(f"Errore con il file {file_path}: Formato non supportato")

    if df is None:
        print(f"Impossibile caricare il file {file_path} con i codici di encoding disponibili.")
        return pd.DataFrame(columns=output_columns)

    # ---- 2. Map the source columns onto the schema ----------------------------
    # With no matching column there is nothing to extract: return the empty
    # schema explicitly rather than relying on a constructor error.
    if not any(source_col in df.columns for source_col in column_map.values()):
        print(f"Errore con il file {file_path}: nessuna delle colonne attese è presente")
        return pd.DataFrame(columns=output_columns)

    try:
        extracted = {}
        for out_col, source_col in column_map.items():
            if source_col not in df.columns:
                extracted[out_col] = np.nan
                continue
            values = df[source_col]
            if out_col == "nameEmployee":
                # Names are compared as text later on.
                values = values.astype(str).str.strip()
            extracted[out_col] = values
        return pd.DataFrame(extracted)
    except Exception as e:
        # A mapping failure does not depend on the encoding, so it is reported once.
        print(f"Errore con il file {file_path} durante la mappatura delle colonne: {e}")
        return pd.DataFrame(columns=output_columns)

# Folder containing the source files
source_folder = "sources/"

# Base names of the Wissel files handled by this cell
WISSEL_FILENAMES = {
    "wissel-rappresentanti-ariregister.rik.ee.csv",
    "wissel-partners-ariregister.rik.ee.csv",
}

# List the supported files. os.listdir returns entries in arbitrary order, so the
# names are sorted to make the row order of the output reproducible across runs.
file_list = [os.path.join(source_folder, file) for file in sorted(os.listdir(source_folder))
             if file.endswith(('.csv', '.xls', '.xlsx', '.json', '.jsonl'))]

# Keep only the Wissel files
wissel_files = [file for file in file_list if os.path.basename(file) in WISSEL_FILENAMES]

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual problem instead.
if not wissel_files:
    raise FileNotFoundError(f"Nessun file Wissel trovato in {source_folder}")

# Build the unified DataFrame
wissel_df = pd.concat([load_data(file) for file in wissel_files], ignore_index=True)

# Save the result
wissel_df.to_csv("output_wissel.csv", index=False, encoding='utf-8')

print("Schema unificato salvato in 'output_wissel.csv'")


Schema unificato salvato in 'output_wissel.csv'


Analisi dei dati estratti


In [18]:
import pandas as pd

# Normalize a single cell value
def normalize_string(s):
    """
    Return `s` lower-cased, trimmed, and with inner whitespace runs collapsed.

    Missing values (as detected by `pd.isna`) and falsy values (for example the
    empty string or 0) are returned unchanged. Any other value is converted to
    `str` first, so the result is always a string in that case.
    """
    leave_untouched = pd.isna(s) or not s
    if leave_untouched:
        return s
    # split() with no argument drops leading/trailing whitespace and treats any
    # run of whitespace as one separator, so re-joining with a single space does
    # the trimming and the collapsing in one step.
    words = str(s).lower().split()
    return ' '.join(words)

import os

# Load the unified company file.
file_path = "output_other_sorted.csv"
if os.path.exists(file_path):
    other_df = pd.read_csv(file_path)
else:
    # NOTE(review): no cell visible in this notebook writes
    # 'output_other_sorted.csv'. Presumably it was 'output_other.csv' sorted by
    # 'name' — confirm. Rebuilding it here keeps Restart & Run All working.
    file_path = "output_other.csv"
    other_df = pd.read_csv(file_path).sort_values(by="name").reset_index(drop=True)

# Total number of rows in the DataFrame
total_rows = len(other_df)
print(f"Numero totale di righe nel file: {total_rows}")

# Columns to normalize. "partita iva" replaces the former "iban" entry: no column
# called "iban" exists in the unified schema, so that entry was silently skipped.
columns_to_normalize = [
    "name", "category", "address", "city", "country", "yearFoundation",
    "founders", "link", "rank", "market_cap", "employees", "ceo",
    "assets", "profit", "sales", "revenue", "telephone", "partita iva",
    "sic_code", "facebook"
]

# Apply the normalization to every listed column that is present
for col in columns_to_normalize:
    if col in other_df.columns:
        other_df[col] = other_df[col].apply(normalize_string)

# Print a sample to check the normalization
print(other_df.head())

# Count the duplicated values in each column
for col in columns_to_normalize:
    if col in other_df.columns:
        duplicates_count = other_df.duplicated(subset=[col]).sum()
        print(f"Numero di duplicati per '{col}': {duplicates_count}")

# Count the duplicates on the ('name', 'link') pair, guarded like the loop above
# so that a missing column does not raise a KeyError
if {"name", "link"}.issubset(other_df.columns):
    duplicates_combined_count = other_df.duplicated(subset=["name", "link"]).sum()
    print(f"Numero di duplicati considerando insieme 'name' e 'link': {duplicates_combined_count}")

# Save the normalized DataFrame
other_df.to_csv("output_other_normalized.csv", index=False, encoding='utf-8')
print("File normalizzato salvato come 'output_other_normalized.csv'")


Numero totale di righe nel file: 76808
                                             name  \
0                "all about me" full stop limited   
1  "q" (chester green) management company limited   
2                                          #sinob   
3                      'all aboard' shops limited   
4                'q' accountancy services limited   

                                            category address city  country  \
0  82990 - other business support service activit...     NaN  NaN      NaN   
1              98000 - residents property management     NaN  NaN      NaN   
2                                             retail     NaN  NaN  germany   
3  88990 - other social work activities without a...     NaN  NaN      NaN   
4         69201 - accounting and auditing activities     NaN  NaN      NaN   

      yearFoundation founders                   link rank market_cap  \
0     13 august 2018      NaN                    NaN  NaN        NaN   
1  27 september 2007      NaN

In [16]:
import pandas as pd

# Normalize a single cell value
def normalize_string(s):
    """
    Return `s` lower-cased, trimmed, and with inner whitespace runs collapsed.

    Missing values (as detected by `pd.isna`) are returned unchanged, so that a
    NaN read back from CSV stays missing instead of becoming the literal text
    "nan". Any other value is converted to `str` first.
    """
    if pd.isna(s):
        return s
    # split() with no argument drops leading/trailing whitespace and treats any
    # run of whitespace as one separator, so re-joining with a single space does
    # the trimming and the collapsing in one step.
    return ' '.join(str(s).lower().split())

# Read the unified representatives/partners file
file_path = "output_wissel.csv"
wissel_df = pd.read_csv(file_path)

# Report how many rows were loaded
total_rows = wissel_df.shape[0]
print(f"Numero totale di righe nel file: {total_rows}")

# Nothing can be normalized without the 'nameEmployee' column
if 'nameEmployee' not in wissel_df.columns:
    print("La colonna 'nameEmployee' non è presente nel file.")
else:
    # Normalize the names, then order the rows by the normalized name
    normalized_names = wissel_df['nameEmployee'].apply(normalize_string)
    wissel_df = wissel_df.assign(nameEmployee=normalized_names).sort_values(by="nameEmployee")

    # Rows whose normalized name already appeared earlier count as duplicates
    duplicates_nameEmployee_count = wissel_df.duplicated(subset=["nameEmployee"]).sum()
    print(f"Numero di duplicati per 'nameEmployee': {duplicates_nameEmployee_count}")

    # Persist the sorted, normalized data
    wissel_df.to_csv("output_wissel_sorted.csv", index=False, encoding='utf-8')
    print("File ordinato e normalizzato salvato come 'output_wissel_sorted.csv'")


Numero totale di righe nel file: 2196
Numero di duplicati per 'nameEmployee': 903
File ordinato e normalizzato salvato come 'output_wissel_sorted.csv'
