domani provo a fare qualcosa con ditto o bo, qualsiasi modello che riesco a installare

## Data to train DITTO

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

ground_truth_path = '../ground_truth.csv'
df = pd.read_csv(ground_truth_path)

attributes = [
    "company_id", "company_name", "industry", "sector", "categories",
    "company_status", "company_type", "address", "postal_code", "city",
    "country", "state", "foundation_year", "registration_date",
    "website", "rank", "market_cap_or_valuation", "number_of_employees",
    "ceo", "assets", "profit_or_net_income", "revenue", "share_price",
    "change_1_day", "change_1_year", "debts", "phone", "notes_or_description"
]

In [5]:
def format_company(row, prefix="c1"):
    formatted_text = []
    for attr in attributes:
        col_name = f"{prefix}.{attr}"
        if col_name in row and pd.notna(row[col_name]):
            formatted_text.append(f"COL {attr} VAL {str(row[col_name])}")
    return " ".join(formatted_text)

In [12]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=18)
train_df, valid_df = train_test_split(train_df, test_size=0.1, random_state=42)

def save_ditto_format(df, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        for _, row in df.iterrows():
            company_1 = format_company(row, "c1")
            company_2 = format_company(row, "c2")
            is_match = "1" if row["is_match"] == True else "0"  # Convert True/False to 1/0
            f.write(f"{company_1} \t {company_2} \t {is_match}\n")

In [13]:
#save_ditto_format(train_df, 'train.txt')
#save_ditto_format(test_df, 'test.txt')
save_ditto_format(valid_df, "valid.txt")


## prepare dataset to run DITTO predictions

In [17]:
import pandas as pd
import json
from itertools import combinations

df = pd.read_csv('../aziende_normalizzate.csv').fillna("")

company_groups = df.groupby("company_name")

candidate_pairs = []
for _, group in company_groups:
    if len(group) > 1:
        pairs = list(combinations(group.iterrows(), 2))  # Generate all possible pairs
        for (idx1, row1), (idx2, row2) in pairs:
            candidate_pairs.append((row1, row2))
            
print(f"Generated {len(candidate_pairs)} candidate pairs.")

Generated 39762 candidate pairs.


In [21]:
def format_company(row):
    formatted_values = []
    for col in attributes:
        if col in row and str(row[col]).strip():
            formatted_values.append(f"{col}: {str(row[col])}")

    return {" ".join(formatted_values)}


In [22]:
output_path = "company_pairs.jsonl"
with open(output_path, 'w', encoding='utf-8') as f:
    for row1, row2 in candidate_pairs:
        json_obj = {
            "company_1": format_company(row1),
            "company_2": format_company(row2)
        }
        f.write(json.dumps(json_obj) + "\n")

TypeError: Object of type set is not JSON serializable

In [None]:
import pandas as pd
import json

# Load dataset
input_file = "../aziende_normalizzate.csv"  # Update with the actual path
output_file = "input/pairs.jsonl"  # Output file for Ditto

df = pd.read_csv(input_file)

def format_record(record):
    """Formats a single record into the required JSONL format."""
    title_entries = [f'"{col}: {str(record[col]).strip()}"' for col in df.columns if pd.notna(record[col])]
    return {"title": " ".join(title_entries)}

# Group by company name (blocking by exact name)
grouped = df.groupby("company_name", dropna=False)

with open(output_file, "w", encoding="utf-8") as f:
    for _, group in grouped:
        records = [format_record(row) for _, row in group.iterrows()]
        json.dump(records, f, ensure_ascii=False)
        f.write("\n")

print(f"Formatted data saved to {output_file}")
