In [1]:
import math
import numpy as np
import math
from datasets import load_dataset
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import re
import json
import seaborn as sns
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pandas as pd
import os
import time
import random

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
silverdataset = pd.read_csv('datasets/silver.csv')
golddataset = pd.read_csv('datasets/gold.csv')

text_silver = pd.read_csv('datasets/wikipedia_text_stats_grouped_silver_links.csv')
text_gold = pd.read_csv('datasets/wikipedia_text_stats_grouped_gold_links.csv')


In [56]:
# text_silver e text_gold preprocessing
silver_text_dataset = pd.DataFrame(text_silver)
silver_text_dataset = silver_text_dataset.drop(columns=["engtext"])
gold_text_dataset = pd.DataFrame(text_gold)
gold_text_dataset = gold_text_dataset.drop(columns=["engtext"])

compute_len = lambda x: len(x)
silver_text_dataset["len"] = silver_text_dataset["distribution"].apply(compute_len)
gold_text_dataset["len"] = gold_text_dataset["distribution"].apply(compute_len)

silver_text_dataset = silver_text_dataset.drop(columns="distribution")
gold_text_dataset = gold_text_dataset.drop(columns="distribution")

# avg, std, len are categorized based on quantili
silver_text_dataset['avg_bins'] = pd.qcut(silver_text_dataset['avg'], q=4)  # 5 intervalli uguali
silver_text_dataset['std_bins'] = pd.qcut(silver_text_dataset['std'], q=4)  # 5 intervalli uguali
silver_text_dataset['len_bins'] = pd.qcut(silver_text_dataset['len'], q=4)  # 5 intervalli uguali

gold_text_dataset['avg_bins'] = pd.qcut(gold_text_dataset['avg'], q=4)  # 5 intervalli uguali
gold_text_dataset['std_bins'] = pd.qcut(gold_text_dataset['std'], q=4)  # 5 intervalli uguali
gold_text_dataset['len_bins'] = pd.qcut(gold_text_dataset['len'], q=4)  # 5 intervalli uguali

#avg, std, len are dropped
silver_text_dataset = silver_text_dataset.drop(columns=["avg", "std", "len"])
gold_text_dataset = gold_text_dataset.drop(columns=["avg", "std", "len"])

print(silver_text_dataset.head(1))
print(gold_text_dataset.head(1))


                                      entity              avg_bins  \
0  http://www.wikidata.org/entity/Q100309406  (1856.696, 3120.483]   

             std_bins       len_bins  
0  (-0.001, 1019.487]  (4.999, 24.0]  
                                    entity                        avg_bins  \
0  http://www.wikidata.org/entity/Q1074069  (381.33200000000005, 2190.379]   

             std_bins       len_bins  
0  (-0.001, 1924.045]  (60.0, 159.5]  


In [68]:
#Now we want to combine [silverdataset, silver_text_dataset] and [goldataset, gold_text_dataset] with respect to the key <entity>
silver_merged = pd.merge(silverdataset, silver_text_dataset, left_on='item', right_on='entity')
gold_merged = pd.merge(golddataset, gold_text_dataset, left_on='item', right_on='entity')

silver_merged = silver_merged.drop(columns="entity")
gold_merged = gold_merged.drop(columns="entity")
print(silver_merged.columns)

Index(['item', 'name', 'description', 'type', 'category', 'subcategory',
       'label', 'avg_bins', 'std_bins', 'len_bins'],
      dtype='object')


In [None]:
training_data = silver_merged
evaluation_data = gold_merged

In [64]:
def heuristic_from_text(column):
    nations = set(pd.read_csv('datasets/national_adjectives.csv')["Country"])
    national_adjectives = set(pd.read_csv('datasets/national_adjectives.csv')["Adjective"])
    
    column['new_description'] = column.apply(lambda elem: set(elem.iloc[0].split()), axis=1)
    column['length_description_intersection'] = column['new_description'].apply(lambda elem:  len(elem.intersection(national_adjectives)))
    
    column['nations'] = column['new_description'].apply(lambda elem:  len(elem.intersection(nations)))

    return column['length_description_intersection'], column['nations']

In [66]:
# Rimuovo colonne non utili dal training set e dall'evaluation test
dataset = training_data.drop(columns=["item", "name", "description"])
evaluation_dataset = evaluation_data.drop(columns=["item", "name", "description"])

# Heuristic su 'description'
train_desc = pd.DataFrame(training_data["description"])
dataset['h_description'], dataset['nations'] = heuristic_from_text(train_desc)

# Stesso preprocessing per l'evaluation set
eval_desc = pd.DataFrame(evaluation_data["description"])
evaluation_dataset['h_description'], evaluation_dataset['nations'] = heuristic_from_text(eval_desc)

# Colonne categoriali da codificare
categorical_columns = ["h_description", "nations", "type", "category", "subcategory", "label", "avg_bins", "std_bins", "len_bins"]

# One-hot encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(pd.concat([dataset[categorical_columns], evaluation_dataset[categorical_columns]]))
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(categorical_columns))

# Pulizia e unione nel dataset di training
dataset = dataset.drop(columns=categorical_columns)
dataset = pd.concat([dataset, encoded_df], axis=1)

# Applico lo stesso encoder all'evaluation set
encoded_eval = encoder.transform(evaluation_dataset[categorical_columns])
encoded_eval_df = pd.DataFrame(encoded_eval, columns=encoder.get_feature_names_out(categorical_columns))
evaluation_dataset = evaluation_dataset.drop(columns=categorical_columns)
evaluation_dataset = pd.concat([evaluation_dataset, encoded_eval_df], axis=1)

# Seleziono feature e target
labels = ['label_cultural agnostic', 'label_cultural exclusive', 'label_cultural representative']
X_train = dataset.drop(columns=labels)
y_train = dataset[labels]
X_test = evaluation_dataset.drop(columns=labels)
y_test = evaluation_dataset[labels]

# Random Forest training
rf_model = RandomForestClassifier(n_estimators=1000, random_state=42)
rf_model.fit(X_train, y_train)

# Predizioni sul test set
y_pred = rf_model.predict(X_test)

# Accuratezza
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuratezza del modello: {accuracy:.2f}")

# Confusion matrix
cm = confusion_matrix(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))
plt.figure(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


KeyError: "['avg_bins', 'std_bins', 'len_bins'] not in index"