In [114]:
import math
import numpy as np
import math
from datasets import load_dataset

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix

from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd
import re
import json
import seaborn as sns

import matplotlib.pyplot as plt
import pandas as pd
import os
import time
import random

import ast

In [115]:
# Labelled entity tables: the silver split is used for training and the gold
# split for evaluation further down in this notebook.
silverdataset, golddataset = (
    pd.read_csv(path)
    for path in ('datasets/silver.csv', 'datasets/gold.csv')
)

# Per-entity statistics computed over the linked Wikipedia article texts.
text_silver, text_gold = (
    pd.read_csv(path)
    for path in (
        'datasets/wikipedia_text_stats_grouped_silver_links.csv',
        'datasets/wikipedia_text_stats_grouped_gold_links.csv',
    )
)

# Per-entity statistics computed over the linked Wikipedia article references.
ref_silver, ref_gold = (
    pd.read_csv(path)
    for path in (
        'datasets/wikipedia_references_stats_grouped_silver_links.csv',
        'datasets/wikipedia_references_stats_grouped_gold_links.csv',
    )
)

In [116]:
def heuristic_from_text(column):
    """Count nationality-related words in the first column of ``column``.

    Each cell of the first column is split on whitespace into a set of
    tokens, which is intersected with the ``Country`` and ``Adjective``
    columns of ``datasets/national_adjectives.csv``. Matching is exact and
    per token, so punctuation attached to a word or vocabulary entries that
    contain whitespace never match.

    Parameters
    ----------
    column : pandas.DataFrame
        Frame whose first column (position 0) holds the text. It is modified
        in place: the columns ``new_description`` (token sets),
        ``length_description_intersection`` and ``nations`` are added.

    Returns
    -------
    tuple of pandas.Series
        ``(column['length_description_intersection'], column['nations'])``:
        the number of distinct adjective tokens and of distinct country
        tokens per row. Rows whose cell is not a string (NaN, None, ...)
        get a missing value instead of a count.
    """
    # Read the vocabulary once; both word lists live in the same CSV file.
    vocabulary = pd.read_csv('datasets/national_adjectives.csv')
    nations = set(vocabulary["Country"])
    national_adjectives = set(vocabulary["Adjective"])

    def _tokens(text):
        # Only real strings can be tokenised; NaN/None/other cells are missing.
        return set(text.split()) if isinstance(text, str) else None

    def _hits(tokens, words):
        return None if tokens is None else len(tokens & words)

    # Capture the text and index before any helper column is added.
    index = column.index
    token_sets = [_tokens(text) for text in column.iloc[:, 0]]

    column['new_description'] = pd.Series(token_sets, index=index, dtype=object)
    column['length_description_intersection'] = pd.Series(
        [_hits(tokens, national_adjectives) for tokens in token_sets], index=index
    )
    column['nations'] = pd.Series(
        [_hits(tokens, nations) for tokens in token_sets], index=index
    )

    return column['length_description_intersection'], column['nations']

In [117]:
def gini_index(p):
    """Return ``1 - sum(v ** 2)`` over the values serialized in ``p``.

    ``p`` is a string holding a Python literal (e.g. ``"[0.5, 0.5]"``) that is
    parsed with ``ast.literal_eval``. The values are used as given, without
    normalization.
    NOTE(review): this equals the Gini impurity only if the values already
    sum to 1 -- confirm how the upstream distribution columns are built.
    """
    weights = np.array(ast.literal_eval(p))
    squared_total = (weights ** 2).sum()
    return 1 - squared_total

def entropy(p):
    """Return ``-sum(v * log2(v))`` over the positive values serialized in ``p``.

    ``p`` is a string holding a Python literal (e.g. ``"[0.5, 0.5]"``) that is
    parsed with ``ast.literal_eval``. Entries that are not strictly positive
    contribute 0 to the sum. The values are used as given, without
    normalization.
    NOTE(review): this equals the Shannon entropy (in bits) only if the values
    already sum to 1 -- confirm how the upstream distribution columns are built.
    """
    p = np.array(ast.literal_eval(p))
    # Take the logarithm only where it is defined. np.where(p > 0, p * log2(p), 0)
    # would still evaluate log2 on every element and emit RuntimeWarnings for
    # zero or negative entries; masked-out positions keep log = 0 here.
    log_values = np.zeros(p.shape, dtype=float)
    np.log2(p, out=log_values, where=p > 0)
    return -np.sum(p * log_values)

In [118]:
# Feature engineering on the per-entity text statistics (text_silver / text_gold).
# Work on explicit copies: pd.DataFrame(frame) does not copy its input, so the
# frames loaded from disk could otherwise share state with the derived ones.
silver_text_dataset = text_silver.copy()
gold_text_dataset = text_gold.copy()

# Multiplier applied to every parsed distribution list before len()/sum().
# It is pinned to 1 (a no-op) by the assertion below.
array_augmentation = 1
assert array_augmentation == 1


def compute_len(x):
    """Return the number of entries in the serialized distribution ``x``.

    Per the original author's note this is the number of Wikipedia links of
    the item (one per language). ``x`` is parsed once with
    ``ast.literal_eval``; if it parses to a bare float, None is returned.
    """
    values = ast.literal_eval(x)
    if isinstance(values, float):
        return None
    return len(array_augmentation * values)


def compute_sum(x):
    """Return the sum of the entries in the serialized distribution ``x``.

    ``x`` is parsed once with ``ast.literal_eval``; if it parses to a bare
    float, None is returned.
    """
    values = ast.literal_eval(x)
    if isinstance(values, float):
        return None
    return sum(array_augmentation * values)


# Same derived columns, added in the same order, for both splits.
for text_frame in (silver_text_dataset, gold_text_dataset):
    text_frame["len"] = text_frame["distribution"].apply(compute_len)
    text_frame["entropy_text"] = text_frame["distribution"].apply(entropy)
    text_frame["gini_text"] = text_frame["distribution"].apply(gini_index)
    text_frame["sum_over_texts"] = text_frame["distribution"].apply(compute_sum)

# NOTE: an earlier, disabled variant of this cell dropped "engtext" and
# "distribution" and replaced avg / std / len / sum_over_texts with 5-quantile
# bins whose edges were fitted on the silver split and applied to the gold
# split. The raw numeric columns are kept instead.

print(silver_text_dataset.columns)

Index(['entity', 'engtext', 'distribution', 'std', 'avg', 'len',
       'entropy_text', 'gini_text', 'sum_over_texts'],
      dtype='object')


In [119]:
# Attach the text-derived features to the labelled tables: join the 'item'
# column of silverdataset / golddataset with the 'entity' column of the text
# frames, then discard the now redundant 'entity' key.
silver_merged = silverdataset.merge(
    silver_text_dataset, left_on='item', right_on='entity'
).drop(columns="entity")
gold_merged = golddataset.merge(
    gold_text_dataset, left_on='item', right_on='entity'
).drop(columns="entity")

print(silver_merged.columns)

Index(['item', 'name', 'description', 'type', 'category', 'subcategory',
       'label', 'engtext', 'distribution', 'std', 'avg', 'len', 'entropy_text',
       'gini_text', 'sum_over_texts'],
      dtype='object')


In [120]:
# Feature engineering on the per-entity reference statistics (ref_silver / ref_gold).
# Work on explicit copies: pd.DataFrame(frame) does not copy its input, so the
# frames loaded from disk could otherwise share state with the derived ones.
silver_ref_dataset = ref_silver.copy()
gold_ref_dataset = ref_gold.copy()

# Multiplier applied to every parsed distribution list before sum().
# It is pinned to 1 (a no-op) by the assertion below.
array_augmentation = 1
assert array_augmentation == 1


def compute_sum(x):
    """Return the sum of the entries in the serialized distribution ``x``.

    ``x`` is parsed once with ``ast.literal_eval``; if it parses to a bare
    float, None is returned.
    """
    values = ast.literal_eval(x)
    if isinstance(values, float):
        return None
    return sum(array_augmentation * values)


# Same derived columns, added in the same order, for both splits.
for ref_frame in (silver_ref_dataset, gold_ref_dataset):
    ref_frame["sum_over_ref"] = ref_frame["ref_distribution"].apply(compute_sum)
    ref_frame["entropy_ref"] = ref_frame["ref_distribution"].apply(entropy)
    ref_frame["gini_ref"] = ref_frame["ref_distribution"].apply(gini_index)

# NOTE: an earlier, disabled variant of this cell dropped "ref_distribution"
# and replaced avg_ref / std_ref / sum_over_ref with 5-quantile bins whose
# edges were fitted on the silver split and applied to the gold split. The raw
# numeric columns are kept instead.

print(silver_ref_dataset.columns)

Index(['entity', 'ref_distribution', 'std_ref', 'avg_ref', 'sum_over_ref',
       'entropy_ref', 'gini_ref'],
      dtype='object')


In [121]:
# Extend the already merged tables (silver_merged / gold_merged) with the
# reference-derived features: join 'item' with the reference frames' 'entity'
# column, then discard the now redundant 'entity' key.
silver_merged = silver_merged.merge(
    silver_ref_dataset, left_on='item', right_on='entity'
).drop(columns="entity")
gold_merged = gold_merged.merge(
    gold_ref_dataset, left_on='item', right_on='entity'
).drop(columns="entity")


In [122]:
# The silver split is the training data, the gold split the evaluation data.
training_data, evaluation_data = silver_merged, gold_merged

In [123]:
# Drop the raw text columns from the training and evaluation sets; the
# heuristics below read them from training_data / evaluation_data instead.
dataset, evaluation_dataset = (
    frame.drop(columns=["description", "engtext"])
    for frame in (training_data, evaluation_data)
)

In [124]:
# Preprocessing of 'description': nationality heuristics.
# Training set: counts of national adjectives and of country names.
train_desc = training_data["description"].to_frame()
train_desc_adjectives, train_desc_nations = heuristic_from_text(train_desc)
dataset['h_adj_descr'] = train_desc_adjectives
dataset['h_nat_descr'] = train_desc_nations

# Same preprocessing for the evaluation set.
eval_desc = evaluation_data["description"].to_frame()
eval_desc_adjectives, eval_desc_nations = heuristic_from_text(eval_desc)
evaluation_dataset['h_adj_descr'] = eval_desc_adjectives
evaluation_dataset['h_nat_descr'] = eval_desc_nations

In [125]:
# Preprocessing of 'engtext': nationality heuristics.
# Training set: counts of national adjectives and of country names; rows for
# which either count is missing are removed.
train_engtext = training_data["engtext"].to_frame()
train_engtext_adjectives, train_engtext_nations = heuristic_from_text(train_engtext)
dataset['h_adj_engtext'] = train_engtext_adjectives
dataset['h_nat_engtext'] = train_engtext_nations
dataset.dropna(subset=["h_adj_engtext", "h_nat_engtext"], inplace=True)

# Same preprocessing for the evaluation set.
eval_engtext = evaluation_data["engtext"].to_frame()
eval_engtext_adjectives, eval_engtext_nations = heuristic_from_text(eval_engtext)
evaluation_dataset['h_adj_engtext'] = eval_engtext_adjectives
evaluation_dataset['h_nat_engtext'] = eval_engtext_nations
evaluation_dataset.dropna(subset=["h_adj_engtext", "h_nat_engtext"], inplace=True)


In [126]:
# (Disabled) Quantile binning of the engtext heuristic counts. Everything below is commented out, so the raw counts are kept.
# n_quantili_engtext = 4
# adj_engtext_bins_edges = pd.qcut(dataset['h_adj_engtext'], q=n_quantili_engtext, retbins=True, duplicates='drop')[1]
# nat_engtext_bins_edges = pd.qcut(dataset['h_nat_engtext'], q=n_quantili_engtext, retbins=True, duplicates='drop')[1]

# dataset['adj_engtext_bins'] = pd.cut(dataset['h_adj_engtext'], bins=adj_engtext_bins_edges, include_lowest=True)
# dataset['nat_engtext_bins'] = pd.cut(dataset['h_nat_engtext'], bins=nat_engtext_bins_edges, include_lowest=True)

# evaluation_dataset['adj_engtext_bins'] = pd.cut(evaluation_dataset['h_adj_engtext'], bins=adj_engtext_bins_edges, include_lowest=True)
# evaluation_dataset['nat_engtext_bins'] = pd.cut(evaluation_dataset['h_nat_engtext'], bins=nat_engtext_bins_edges, include_lowest=True)

# #Now drop the original columns
# dataset = dataset.drop(columns=["h_adj_engtext", "h_nat_engtext"])
# evaluation_dataset = evaluation_dataset.drop(columns=["h_adj_engtext", "h_nat_engtext"])

In [127]:
# Give the text-statistics columns explicit names in both tables.
text_column_names = {"std": "std_text", "avg": "avg_text", "distribution": "text_distribution"}
for frame in (dataset, evaluation_dataset):
    frame.rename(columns=text_column_names, inplace=True)

In [128]:
# Row counts of the final training and evaluation tables, one per line.
print(len(dataset), len(evaluation_dataset), sep="\n")

6193
299


In [129]:
# Persist the unified training (silver) and evaluation (gold) tables.
for frame, destination in (
    (dataset, "datasets/silver_unicum.csv"),
    (evaluation_dataset, "datasets/gold_unicum.csv"),
):
    frame.to_csv(destination, mode='w', index=False)