In [None]:
from wikidata.client import Client
from datasets import load_dataset
# Inizializza client
client = Client()

def load_entity(wikidata_id):
    try:
        entity = client.get(wikidata_id, load=True)
        return entity
    except Exception as e:
        print(f"Errore nel caricamento {wikidata_id}: {e}")
        return None

def extract_cultural_features(entity):
    properties = {
        "P495": "country of origin",
        "P2596": "culture",
        "P131": "location",
        "P136": "genre",
        "P31": "instance of",
        "P279": "subclass of",
        "P921": "main subject",
        "P361": "part of"
    }

    results = {}

    for pid, label in properties.items():
        try:
            prop = client.get(pid)
            print(f"\nProcessing property {pid} ({label})...")
            
            if prop in entity:
                # Ottieni direttamente i valori della proprietà
                targets = entity[prop]
                
                # Se è un singolo valore, mettilo in una lista
                if not isinstance(targets, list):
                    targets = [targets]

                labels = []
                
                for target in targets:
                    if hasattr(target, 'label'):
                        label_text = target.label
                        labels.append(label_text)
                    else:
                        print("Target has no label attribute")
                
                results[label] = labels
            else:
                print(f"Property {pid} not found in entity")
                results[label] = []
                
        except Exception as e:
            print(f"Error with property {label}: {e}")
            results[label] = []

    return results

if __name__ == "__main__":
    wikidata_id = "Q7802"  # Bauhaus (non Bauhaus Archive)
    entity = load_entity(wikidata_id)

    if entity:
        features = extract_cultural_features(entity)
        print(f"Feature culturali di {wikidata_id}:\n")
        for prop, values in features.items():
            print(f"- {prop}: {values}")


Processing property P495 (country of origin)...
Property P495 not found in entity

Processing property P2596 (culture)...
Property P2596 not found in entity

Processing property P131 (location)...
Property P131 not found in entity

Processing property P136 (genre)...
Property P136 not found in entity

Processing property P31 (instance of)...

Processing property P279 (subclass of)...

Processing property P921 (main subject)...
Property P921 not found in entity

Processing property P361 (part of)...
Property P361 not found in entity
Feature culturali di Q7802:

- country of origin: []
- culture: []
- location: []
- genre: []
- instance of: [m'type of food or dish']
- subclass of: [m'staple food']
- main subject: []
- part of: []


VOTING

In [10]:
from datasets import load_dataset
from collections import defaultdict, Counter
from sklearn.metrics import classification_report

# === 1. Caricamento dataset ===
dataset = load_dataset("sapienzanlp/nlp2025_hw1_cultural_dataset")
train_set = dataset["train"]
validation_set = dataset["validation"]

feature_cols = ["country_of_origin", "genre", "instance_of"]
label_col = "label"

# === 2. Costruzione delle frequenze ===
def build_feature_label_probs(dataset, feature_columns, label_column):
    counter = defaultdict(lambda: Counter())

    for example in dataset:
        label = example[label_column]
        for feat in feature_columns:
            value = example.get(feat, "").strip().lower()
            if value:
                counter[(feat, value)][label] += 1

    probs = {}
    for (feat, val), label_counts in counter.items():
        total = sum(label_counts.values())
        probs[(feat, val)] = {
            label: count / total for label, count in label_counts.items()
        }
    print("probs")
    print(probs)
    return probs

# === 3. Voting classifier ===
def predict_label_voting(sample, feature_to_label_probs):
    votes = Counter()

    for feat, val in sample.items():
        key = (feat, val.strip().lower())
        if key in feature_to_label_probs:
            best_label = max(
                feature_to_label_probs[key],
                key=feature_to_label_probs[key].get
            )
            votes[best_label] += 1

    if not votes:
        return "CR"  # fallback neutro
    return votes.most_common(1)[0][0]

# === 4. Valutazione
def evaluate(dataset, feature_columns, label_column, feature_to_label_probs):
    y_true = []
    y_pred = []

    for example in dataset:
        sample = {feat: example.get(feat, "") for feat in feature_columns}
        pred = predict_label_voting(sample, feature_to_label_probs)
        y_true.append(example[label_column])
        y_pred.append(pred)

    print(classification_report(y_true, y_pred, digits=3))

# === 5. MAIN ===
if __name__ == "__main__":
    probs = build_feature_label_probs(train_set, feature_cols, label_col)
    evaluate(validation_set, feature_cols, label_col, probs)


probs
{}
                         precision    recall  f1-score   support

                     CR      0.000     0.000     0.000       0.0
      cultural agnostic      0.000     0.000     0.000     117.0
     cultural exclusive      0.000     0.000     0.000      76.0
cultural representative      0.000     0.000     0.000     107.0

               accuracy                          0.000     300.0
              macro avg      0.000     0.000     0.000     300.0
           weighted avg      0.000     0.000     0.000     300.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
