In [7]:
# Import of all the libraries
!pip install wikidata # to download data
# !pip install wget

import pandas as pd
import numpy as np
import time
import os.path

from wikidata.client import Client
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Libraries used by the professor (kept for reference, currently commented out):
# from sklearn.decomposition import PCA
# from torch import FloatTensor as FT
# from torch import LongTensor as LT

# import matplotlib.pyplot as plt
# import torch.nn as nn
# import collections
# import torch
# import json
# import re
# import wget




In [58]:
# Colab-specific setup: mount Google Drive so the dataset file is reachable.
from google.colab import drive

drive.mount('/content/drive/', force_remount=True)

# Location of the tab-separated gold dataset on the mounted Drive.
# Alternative Drive layout: '/content/drive/MyDrive/HOMEWORK_1/gold_dataset.tsv'
data_path = '/content/drive/MyDrive/NLP_HOMEWORK_1/data/gold_dataset.tsv'

# read_table parses tab-separated files by default.
gold_dataset = pd.read_table(data_path)

Mounted at /content/drive/


In [32]:
# Derive a copy of the gold dataset whose "item" column holds only the last
# "/"-separated segment of the original value (the bare QID, e.g. "Q306").
item_qids = gold_dataset["item"].str.split("/").str[-1]
gold_dataset_nolink = gold_dataset.assign(item=item_qids)
gold_dataset_nolink.head()

Unnamed: 0,item,name,description,type,category,subcategory,label
0,Q306,Sebastián Piñera,Chilean entrepreneur and politician (1949–2024),entity,politics,politician,cultural exclusive
1,Q12735,John Amos Comenius,"Czech teacher, educator, philosopher and write...",entity,politics,politician,cultural representative
2,Q1752,Macrinus,Roman emperor from 217 to 218,entity,politics,politician,cultural representative
3,Q1639,Lamine Diack,Senegalese sports manager (1933–2021),entity,politics,politician,cultural representative
4,Q9588,Richard Nixon,President of the United States from 1969 to 1974,entity,politics,politician,cultural representative


In [None]:
# Test to extract info from wikidata. We can remove it?
def fetch_wikidata_properties(qid):
    """Return the attribute dictionary Wikidata exposes for one entity.

    Args:
        qid: Wikidata entity identifier, e.g. "Q321103".

    Returns:
        The ``attributes`` dict of the fetched entity, i.e. everything
        Wikidata holds for that item.
    """
    # A fresh client is created on every call.
    return Client().get(qid, load=True).attributes

# Smoke test on a single entity: the QID is whatever follows the last "/".
url = "http://www.wikidata.org/entity/Q321103"
qid = url.rpartition('/')[2]  # Q321103

print(fetch_wikidata_properties(qid))

{'pageid': 308428, 'ns': 0, 'title': 'Q321103', 'lastrevid': 2182280166, 'modified': '2024-06-17T16:22:21Z', 'type': 'item', 'id': 'Q321103', 'labels': {'de': {'language': 'de', 'value': 'Bühl'}, 'cs': {'language': 'cs', 'value': 'Bühl'}, 'en': {'language': 'en', 'value': 'Bühl'}, 'nl': {'language': 'nl', 'value': 'Bühl'}, 'af': {'language': 'af', 'value': 'Bühl'}, 'an': {'language': 'an', 'value': 'Bühl'}, 'ast': {'language': 'ast', 'value': 'Bühl'}, 'bar': {'language': 'bar', 'value': 'Bühl'}, 'br': {'language': 'br', 'value': 'Bühl'}, 'ca': {'language': 'ca', 'value': 'Bühl'}, 'co': {'language': 'co', 'value': 'Bühl'}, 'cy': {'language': 'cy', 'value': 'Bühl'}, 'da': {'language': 'da', 'value': 'Bühl'}, 'de-at': {'language': 'de-at', 'value': 'Bühl'}, 'de-ch': {'language': 'de-ch', 'value': 'Bühl'}, 'en-ca': {'language': 'en-ca', 'value': 'Bühl'}, 'en-gb': {'language': 'en-gb', 'value': 'Bühl'}, 'eo': {'language': 'eo', 'value': 'Bühl'}, 'es': {'language': 'es', 'value': 'Bühl'}, 'e

In [46]:
# Build (or load from cache) the binary item x property matrix.
# One row per Wikidata QID, one column per property id; a cell is 1 when the
# item has that property among its claims and 0 otherwise. The first column,
# "item", holds the QID.
client = Client()
all_properties = set()
items_map = {}

dataset_len = len(gold_dataset)

# Single definition of the cache location, used for the existence check,
# the write and the read.
freq_matrix_path = "/content/drive/MyDrive/NLP_HOMEWORK_1/data/item_properties_matrix.csv"

# If the matrix has not been generated yet, create it (one Wikidata request
# per item). Otherwise load it from the cached CSV.
if not os.path.isfile(freq_matrix_path):
    # Only the "item" column is needed, so iterate over it directly.
    for item_url in tqdm(gold_dataset['item'], total=dataset_len):
        qid = item_url.split('/')[-1]

        entity = client.get(qid, load=True)
        claims = entity.attributes.get("claims", {})

        item_properties = set(claims.keys())
        all_properties.update(item_properties)
        items_map[qid] = item_properties

    properties_list = sorted(all_properties)
    items_list = list(items_map.keys())  # all the QIDs, in first-seen order

    # Fill a NumPy array row by row instead of issuing one scalar
    # DataFrame.loc write per (item, property) pair.
    column_of = {prop: col for col, prop in enumerate(properties_list)}
    presence = np.zeros((len(items_list), len(properties_list)), dtype=np.int64)
    for row_pos, props_for_item in enumerate(items_map.values()):
        cols = [column_of[prop] for prop in props_for_item]
        if cols:  # items without claims keep an all-zero row
            presence[row_pos, cols] = 1

    freq_matrix = pd.DataFrame(presence, index=items_list, columns=properties_list)
    freq_matrix.index.name = 'item'
    freq_matrix = freq_matrix.reset_index()
    freq_matrix.to_csv(freq_matrix_path, index=False)

else:
    freq_matrix = pd.read_csv(freq_matrix_path)

100%|██████████| 6251/6251 [17:35<00:00,  5.92it/s]


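DATASET CLEANING. The label column of the gold dataset contains missing values, truncated labels and misspellings.
Operations:
*   Remove rows whose label is NaN
*   Remove ambiguous labels (those that cannot be mapped to a single class)
*   Map truncated labels to the correct full label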

In [48]:
# Quick sanity check of the matrix dimensions (rows, columns); the column
# count includes the leading "item" id column.
freq_matrix.shape

(6251, 4303)

In [53]:
# --- Diagnostics on the raw label column (before any cleaning) -------------
X = gold_dataset["label"]
bad_words = ['cultural ag', 'cult', 'cultural agn', 'cultural represent', 'cultural ex', 'cultural']

label_set = set(X)
# print(f"Unique labels: {label_set}")

nan_count = int(X.isna().sum())
# print(f"NaN: {nan_count}")

# Number of rows carrying each truncated/ambiguous label (exact match on the raw value).
bad_word_counts = {word: int((X == word).sum()) for word in bad_words}
# print(bad_word_counts)

# --- Cleaning ---------------------------------------------------------------
# Truncated labels that can be mapped unambiguously to a full label.
replacements = {
    "cultural agn": "cultural agnostic",
    "cultural ag": "cultural agnostic",
    "cultural ex": "cultural exclusive",
    "cultural represent": "cultural representative"
}
# Labels too short to tell which class was meant; their rows are dropped.
ambiguous_labels = ["cultural", "cult"]

# 1) drop rows without a label, 2) normalise whitespace and case,
# 3) expand truncated labels, 4) drop rows whose label stays ambiguous.
gold_dataset_clean = gold_dataset_nolink.dropna(subset=["label"]).copy()
gold_dataset_clean["label"] = (
    gold_dataset_clean["label"].str.strip().str.lower().replace(replacements)
)
gold_dataset_clean = gold_dataset_clean[~gold_dataset_clean["label"].isin(ambiguous_labels)]
assert gold_dataset_nolink.shape[0] - gold_dataset_clean.shape[0] == 13, "Dataset not getting cleaned correctly"

print(gold_dataset_clean["label"].value_counts())

# Save the cleaned gold dataset. The classifier cell reads this file back from
# Drive, so it must be (re)written here; otherwise a fresh run fails with a
# missing file or trains on a stale copy.
gold_dataset_clean.to_csv("/content/drive/MyDrive/NLP_HOMEWORK_1/data/gold_dataset_clean.csv", index=False)


label
cultural exclusive         2686
cultural agnostic          1866
cultural representative    1686
Name: count, dtype: int64


EXTRACT PROPERTY FREQUENCIES. <br>
For each label we count how many items carry each property. We then count how many properties are shared by all three labels, how many by each pair of labels, and how many appear in only one label.

Dictionary is made this way:
{ cultural_category: [property distribution] } <br>
( Ex: { cultural_representative: [0, 12, 124, 43] } )


In [52]:
# Per-label property counts: for every label, how many items with that label
# carry each property.

# One label per item; if an item id appears more than once, the first row wins.
label_lookup = (
    gold_dataset_clean
    .drop_duplicates(subset="item", keep="first")
    .set_index("item")["label"]
)

# Label of every matrix row (NaN for items removed during cleaning; those rows
# are ignored by groupby). sort=False keeps labels in first-seen order.
row_labels = freq_matrix["item"].map(label_lookup).rename("label")
label_sums = freq_matrix.drop(columns="item").groupby(row_labels, sort=False).sum()

# { 'cultural exclusive': Series(...),
#   'cultural representative': Series(...),
#   'cultural agnostic': Series(...) }
label_properties_distribution = {label: label_sums.loc[label] for label in label_sums.index}

categories = list(label_properties_distribution.keys())
# print(f"Categories inside dic: {categories}")

# Look the distributions up by label name rather than by position, so the
# variables cannot be swapped if the rows come in a different order.
cult_ex = label_properties_distribution["cultural exclusive"]
cult_rep = label_properties_distribution["cultural representative"]
cult_agn = label_properties_distribution["cultural agnostic"]

assert cult_ex.shape[0] == cult_rep.shape[0] == cult_agn.shape[0], "Shapes mismatch!"

# Properties > 0 (with at least 1 item with that label)
non_zero_ex = cult_ex[cult_ex != 0]
non_zero_rep = cult_rep[cult_rep != 0]
non_zero_agn = cult_agn[cult_agn != 0]

# Property ids seen at least once for each label, as sets for set algebra.
ex = set(non_zero_ex.index)
rep = set(non_zero_rep.index)
agn = set(non_zero_agn.index)

# Properties common to all the labels
common_properties = ex & rep & agn

# Properties shared by each pair of labels (these include the ones common to all three)
ex_rep = ex & rep
ex_agn = ex & agn
agn_rep = rep & agn

# Properties that appear in exactly one label
only_ex = ex - rep - agn
only_rep = rep - ex - agn
only_agn = agn - ex - rep

print(f"Common Properties: {len(common_properties)}")
print(f"Common Exclusive-Representative: {len(ex_rep)}")
print(f"Common Exclusive-Agnostic: {len(ex_agn)}")
print(f"Common Agnostic-Representative: {len(agn_rep)}")
print(f"Only Exclusive: {len(only_ex)}")
print(f"Only Representative: {len(only_rep)}")
print(f"Only Agnostic: {len(only_agn)}")


Common Properties: 915
Common Exclusive-Representative: 1986
Common Exclusive-Agnostic: 1031
Common Agnostic-Representative: 1100
Only Exclusive: 627
Only Representative: 929
Only Agnostic: 458


PROPERTIES CLASSIFIER<br>
We check how each property influences the assignment to each label.
We use a logistic regression model for multiclass classification.

In [59]:
# Load data: the property matrix and the cleaned gold dataset, then merge them.
freq_matrix = pd.read_csv("/content/drive/MyDrive/NLP_HOMEWORK_1/data/item_properties_matrix.csv")
gold_dataset_clean = pd.read_csv("/content/drive/MyDrive/NLP_HOMEWORK_1/data/gold_dataset_clean.csv")

# how="inner" drops the rows that have no exact "item" match in both frames,
# so items removed during cleaning do not reach the classifier.
merged_df = pd.merge(freq_matrix, gold_dataset_clean[["item", "label"]], on="item", how="inner")

# X (one 0/1 column per property) and y (label) for classification
X = merged_df.drop(["item", "label"], axis=1)
y = merged_df["label"]

# Stratified 80/20 train/test split; the fixed random_state makes it reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Logistic regression with standard values.
# `multi_class` is intentionally not passed: it is deprecated in recent
# scikit-learn releases (1.5+), and the default already uses the multinomial
# formulation for the "saga" solver on a multiclass target.
clf = LogisticRegression(
    penalty="l2",  # L2 regularisation
    C=1.0,
    solver="saga",
    max_iter=10000,  # upper bound on solver iterations
    random_state=42
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))




                         precision    recall  f1-score   support

      cultural agnostic       0.74      0.80      0.77       373
     cultural exclusive       0.73      0.81      0.77       538
cultural representative       0.66      0.50      0.57       337

               accuracy                           0.72      1248
              macro avg       0.71      0.70      0.70      1248
           weighted avg       0.72      0.72      0.72      1248



In [65]:
# Inspect the fitted model: list, for every class, the properties with the
# largest positive and the largest negative coefficients.
coeffs = clf.coef_  # indexed as [class, feature]
feature_names = X_train.columns
class_labels = clf.classes_

# Number of strongest features to list in each direction.
TOP_N = 10

for i, class_label in enumerate(class_labels):
    class_coeffs = coeffs[i]

    # Feature positions ordered from most negative to most positive weight.
    ranking = np.argsort(class_coeffs)
    strongest_neg = ranking[:TOP_N]
    strongest_pos = ranking[-TOP_N:][::-1]  # largest weight first

    print(f"\n=== Class: {class_label} ===")
    print("\nTop POSITIVE features:")
    for pos in strongest_pos:
        print(f"  {feature_names[pos]} (coeff: {class_coeffs[pos]:.3f})")

    print("\n Top NEGATIVE features:")
    for pos in strongest_neg:
        print(f"  {feature_names[pos]} (coeff: {class_coeffs[pos]:.3f})")



=== Class: cultural agnostic ===

Top POSITIVE features:
  P6366 (coeff: 1.037)
  P1640 (coeff: 0.938)
  P2388 (coeff: 0.935)
  P10890 (coeff: 0.893)
  P7512 (coeff: 0.892)
  P3858 (coeff: 0.891)
  P161 (coeff: 0.880)
  P6839 (coeff: 0.879)
  P1741 (coeff: 0.868)
  P1566 (coeff: 0.856)

 Top NEGATIVE features:
  P17 (coeff: -1.352)
  P5456 (coeff: -1.290)
  P7314 (coeff: -1.216)
  P2341 (coeff: -1.183)
  P6412 (coeff: -1.149)
  P2596 (coeff: -1.142)
  P495 (coeff: -1.061)
  P140 (coeff: -1.020)
  P9466 (coeff: -1.018)
  P10221 (coeff: -0.938)

=== Class: cultural exclusive ===

Top POSITIVE features:
  P1001 (coeff: 1.818)
  P17 (coeff: 1.189)
  P8253 (coeff: 1.120)
  P4794 (coeff: 1.039)
  P1114 (coeff: 1.023)
  P6112 (coeff: 0.938)
  P11245 (coeff: 0.928)
  P4614 (coeff: 0.918)
  P4239 (coeff: 0.881)
  P2094 (coeff: 0.880)

 Top NEGATIVE features:
  P6839 (coeff: -1.370)
  P11386 (coeff: -1.120)
  P5604 (coeff: -1.001)
  P2910 (coeff: -0.968)
  P5905 (coeff: -0.923)
  P2949 (coeff: 