In [1]:
from italian_csv_type_prediction.dataframe_generators import SimpleDatasetGenerator
from italian_csv_type_prediction.models import TypePredictor
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [2]:
def dataset_generation(number: int):
    """Build one synthetic dataset with a freshly created generator.

    Parameters
    ----------
    number : int
        Size argument forwarded unchanged to ``SimpleDatasetGenerator.build``.

    Returns
    -------
    Whatever ``SimpleDatasetGenerator().build(number, verbose=False)``
    returns (callers in this notebook unpack it as a features/labels pair).
    """
    generator = SimpleDatasetGenerator()
    # Progress output is disabled here; callers may show their own progress bar.
    dataset = generator.build(number, verbose=False)
    return dataset

def _dataset_generation(args):
    """Call ``dataset_generation`` with a packed argument tuple.

    Single-argument adapter: ``args`` is an iterable of positional arguments
    that is unpacked into ``dataset_generation``. It is used as the task
    function for ``Pool.imap``, which passes each task as one object.
    """
    result = dataset_generation(*args)
    return result

def parallel_dataset_generation(number: int):
    """Generate a dataset of total size ``number`` using worker processes.

    The request is split into at most ``cpu_count() * 5`` tasks whose sizes
    differ by at most one and sum exactly to ``number``. A plain
    ``number // tasks`` split silently drops the remainder (for example,
    1000 over 60 tasks would only generate 960).

    Parameters
    ----------
    number : int
        Total size to generate; forwarded in pieces to ``dataset_generation``.
        Must be positive.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the per-task feature blocks stacked with
        ``np.vstack`` and ``y`` is the per-task label arrays joined with
        ``np.concatenate``, both in task order.

    Raises
    ------
    ValueError
        If ``number`` is not positive (there would be nothing to stack).
    """
    if number <= 0:
        raise ValueError(f"number must be a positive integer, got {number!r}")

    # More tasks than cores, presumably for finer progress reporting and
    # better load balancing; never more tasks than requested units.
    tasks = min(cpu_count() * 5, number)

    # Spread the remainder over the first `extra` tasks so nothing is lost.
    base, extra = divmod(number, tasks)
    chunk_sizes = [base + 1] * extra + [base] * (tasks - extra)

    # No point in starting more workers than there are tasks.
    with Pool(min(cpu_count(), tasks)) as p:
        # imap yields results in submission order, so X and y stay aligned.
        Xs, ys = list(zip(*tqdm(
            p.imap(_dataset_generation, (
                (size, )
                for size in chunk_sizes
            )),
            total=tasks,
            leave=False
        )))
    # NOTE(review): assumes the feature width returned by the generator does
    # not depend on the chunk size, so blocks of differing size stack — confirm.
    return np.vstack(Xs), np.concatenate(ys)

In [3]:
# Generate two datasets with separate calls and the same requested size:
# one used to fit the model, the other held out for evaluation.
# NOTE(review): no random seed is set in the visible cells, so the scores
# below presumably vary from run to run — confirm.
x_train, y_train = parallel_dataset_generation(1000)
x_test, y_test = parallel_dataset_generation(1000)

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))



In [4]:
# Create a fresh TypePredictor and fit it on the generated training split.
model = TypePredictor()

model.fit(x_train, y_train)

In [5]:
# Predict labels for the held-out features by calling the wrapped estimator.
# NOTE(review): this reaches into the private `_model` attribute, presumably
# because x_test is already in the feature form the inner estimator expects —
# confirm, and prefer a public TypePredictor method if one exists.
y_pred = model._model.predict(x_test)

In [6]:
# Plain and class-balanced accuracy on the held-out split, shown as a tuple.
accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)

(0.8666348846910196, 0.8256851687565119)

In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from random_csv_generator import random_csv

# Sample frame to try the fitted model on; the displayed result has 10 rows,
# so the argument is presumably the row count — confirm.
df = random_csv(10)

In [8]:
# Display the generated frame (bare last expression renders as cell output).
df

Unnamed: 0,region,province,surname,name,sex,birth_municipality,birth_province,birth_region,birth_cap,birth_province_code,birthdate,address,house_number,cap,municipality,province_code,codice_fiscale,total_debit,payed_debit
0,Friuli Venezia Giulia,Udine,Scherini,Andrea,M,Mese,Sondrio,Lombardia,23020,SO,1923-01-11,Via C. Battisti,7,33029,Villa Santina,UD,SCHNDR23A11F153R,"Eu 74.656,00","Eu 24.779,00"
1,Emilia Romagna,Bologna,Careddu,Domenico,M,Padru,Olbia Tempio,Sardegna,7020,OT,1961-10-21,Via Del Triumvirato,84 - c/o Aeroporto,40132,Bologna,BO,CRDDNC61R21M301V,"Eu 70.265,00","Eu 39.660,00"
2,Campania,Napoli,Serra,Flavio,M,Birori,Nuoro,Sardegna,8010,NU,1978-09-15,Corso Italia,3,80065,Sant'agnello,,SRRFLV78P15A880H,"Eu 17.153,00","Eu 3.411,00"
3,Basilicata,Potenza,Guglielmi,Houda,F,Bordighera,Imperia,Liguria,18012,IM,1936-06-14,Via Ariosto,4,85012,Corleto Perticara,PZ,GGLHDO36H54A984F,"Eu 62.465,00","Eu 53.979,00"
4,Sicilia,Palermo,D'Incà,Silvana,F,Arsie',Belluno,Veneto,32030,BL,1931-08-17,Via V. Amedeo,103,90018,Termini Imerese,PA,DNCSVN31M57A443E,"Eu 2.001,00","Eu 737,00"
5,Lazio,Roma,Cerrato,Alex,M,Castagnole Delle Lanze,Asti,Piemonte,14054,AT,1998-04-25,Via Benedetto Croce,149/151/153,142,Roma,RM,CRRLXA98D25C049P,"Eu 35.640,00","Eu 23.181,00"
6,Calabria,Reggio Calabria,Macchi,Giuseppe,M,Azzio,Varese,Lombardia,21030,VA,2000-02-23,Via Roma,3,89040,Portigliola,RC,MCCGPP00B23A532C,"Eu 65.924,00","Eu 37.453,00"
7,Sicilia,Messina,Concas,Fatima Zahra,F,Gonnosfanadiga,Medio Campidano,Sardegna,9035,VS,1954-12-22,Via Umberto I,420,98035,Giardini-naxos,ME,CNCFMZ54T62E085J,"Eu 64.889,00","Eu 41.495,00"
8,Campania,Napoli,Rossi,Alma,F,Rancio Valcuvia,Varese,Lombardia,21030,VA,1962-09-27,Via De Meis,106/108,80147,Napoli,,RSSLMA62P67H173B,"Eu 84.618,00","Eu 64.479,00"
9,Emilia Romagna,Rimini,Zordan,Nicolo',M,Vicenza,Vicenza,Veneto,36100,VI,1963-05-25,Via Santarcangiolese,2870,47824,Poggio Berni,RN,ZRDNCL63E25L840X,"Eu 65.307,00","Eu 64.505,00"


In [9]:
# Per-cell type predictions for df; the displayed result has the same columns
# and row index as the input frame, with a predicted type name in each cell.
model.predict_dataframe(df)

Unnamed: 0,region,province,surname,name,sex,birth_municipality,birth_province,birth_region,birth_cap,birth_province_code,birthdate,address,house_number,cap,municipality,province_code,codice_fiscale,total_debit,payed_debit
0,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,ProvinceCode,CodiceFiscale,Currency,Currency
1,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,ProvinceCode,CodiceFiscale,Currency,Currency
2,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,,CodiceFiscale,Currency,Currency
3,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,ProvinceCode,CodiceFiscale,Currency,Currency
4,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,ProvinceCode,CodiceFiscale,Currency,Currency
5,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,ProvinceCode,CodiceFiscale,Currency,Currency
6,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,ProvinceCode,CodiceFiscale,Currency,Currency
7,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,String,ProvinceCode,CodiceFiscale,Currency,Currency
8,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,,CodiceFiscale,Currency,Currency
9,Region,Municipality,Surname,Name,BiologicalSex,Municipality,String,Region,CAP,ProvinceCode,Date,Address,String,CAP,Municipality,ProvinceCode,CodiceFiscale,Currency,Currency


In [12]:
# Select the held-out samples whose predicted label differs from the truth.
mask = y_test != y_pred

# Decode both sides back to label names with the embedder's label encoder.
# NOTE(review): `_embedder` / `_encoder` are private attributes — confirm there
# is no public accessor for the label encoder.
encoder = model._embedder._encoder
true_labels = encoder.inverse_transform(y_test[mask])
predicted_labels = encoder.inverse_transform(y_pred[mask])

# Print each distinct (true, predicted) confusion once, in first-seen order.
labels = set()
for pair in zip(true_labels, predicted_labels):
    if pair in labels:
        continue
    print(pair)
    labels.add(pair)

('Document', 'String')
('CodiceFiscale', 'String')
('IVA', 'CodiceFiscale')
('Address', 'String')
('NaN', 'Boolean')
('CountryCode', 'NaN')
('Name', 'String')
('Surname', 'String')
('CodiceFiscale', 'Document')
('CountryCode', 'ProvinceCode')
