In [1]:
from collections import Counter
from multiprocessing import Pool, cpu_count

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from tqdm.auto import tqdm

from italian_csv_type_prediction.dataframe_generators import SimpleDatasetGenerator
from italian_csv_type_prediction.models import TypePredictor

In [2]:
def dataset_generation(number:int):
    """Build one dataset chunk with a freshly created ``SimpleDatasetGenerator``.

    ``number`` is forwarded unchanged to ``SimpleDatasetGenerator.build``
    together with ``verbose=False``; the value returned by ``build`` is
    handed back as-is.
    """
    generator = SimpleDatasetGenerator()
    return generator.build(number, verbose=False)

def _dataset_generation(args):
    """Expand a packed argument sequence into a ``dataset_generation`` call.

    ``Pool.imap`` passes exactly one object to its callable, so the
    positional arguments travel as a single iterable and are unpacked here.
    """
    positional = tuple(args)
    return dataset_generation(*positional)

def parallel_dataset_generation(number:int):
    """Generate a dataset in parallel and stack the partial results.

    The requested ``number`` is split into at most ``cpu_count() * 5`` tasks,
    each executed through ``_dataset_generation`` in a ``multiprocessing.Pool``
    with ``cpu_count()`` workers. Task sizes differ by at most one and always
    add up to ``number``, so nothing is lost when ``number`` is not evenly
    divisible by the task count.

    Parameters
    ----------
    number : int
        Total amount that is forwarded, in pieces, to ``dataset_generation``.
        Must be strictly positive.

    Returns
    -------
    tuple
        ``(np.vstack(Xs), np.concatenate(ys))`` where ``Xs`` and ``ys``
        collect the first and second element of every task result.

    Raises
    ------
    ValueError
        If ``number`` is not strictly positive.
    """
    # Fail early with a readable message instead of the opaque unpacking
    # error that an empty task list would otherwise trigger after the pool
    # has already been started.
    if number <= 0:
        raise ValueError(
            "number must be a positive integer, got {!r}".format(number)
        )

    workers = cpu_count()
    tasks = min(workers * 5, number)

    # Hand the remainder out one unit at a time: using plain floor division
    # for every task would silently drop ``number % tasks`` units.
    base, extra = divmod(number, tasks)
    sizes = [base + 1] * extra + [base] * (tasks - extra)

    # NOTE(review): with the "fork" start method every worker inherits the
    # parent's NumPy global RNG state. If the generator draws from
    # ``np.random``, workers (and successive calls to this function) may
    # produce overlapping samples — confirm, and reseed per worker if so.
    with Pool(workers) as p:
        Xs, ys = zip(*tqdm(
            p.imap(_dataset_generation, ((size,) for size in sizes)),
            total=tasks,
            leave=False
        ))
    return np.vstack(Xs), np.concatenate(ys)

In [3]:
# Size requested from each generation call; kept in one place so the train
# and test splits are always requested with the same value.
N_SAMPLES = 1000

# The two splits come from two independent generator runs.
# NOTE(review): nothing here verifies that the runs yield disjoint samples —
# confirm the generator's random state differs between the two calls.
x_train, y_train = parallel_dataset_generation(N_SAMPLES)
x_test, y_test = parallel_dataset_generation(N_SAMPLES)

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))

In [4]:
# Create the type predictor and fit it on the generated training split.
model = TypePredictor()

model.fit(x_train, y_train)



In [5]:
# Predict the test split through the predictor's `_model` attribute.
# NOTE(review): `_model` is private to TypePredictor — prefer a public
# prediction method for arrays if the class offers one.
y_pred = model._model.predict(x_test)

In [6]:
# Plain and class-balanced accuracy on the test split, displayed as a pair.
(
    accuracy_score(y_test, y_pred),
    balanced_accuracy_score(y_test, y_pred),
)

(0.9988621728428976, 0.9996193190501738)

In [7]:
# Generate one small example: X is the sample frame, y the matching type labels.
X, y = SimpleDatasetGenerator().generate_simple_dataframe()

In [8]:
# Display the generated sample values.
X

Unnamed: 0,CodiceFiscale,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,07488771218,,contratto,GA 123 KW,Via Del Riccio,70018.0,MI,,MONTORO INFERIORE,2024.0,...,Federico,Pastorino,voo,pamella.schmierer@schmierer.org,+39-397-555-403,,1972-04-23,F,0,41140.0
1,02858660232,A821,,GA 123KW,Via 4 Novembre,70027.0,PT,,APICE,,...,Renata,,opposto,chaya@malvin.com,+39-310-555-1208,"80.637,00 €",1978-03-05,F,1,
2,MRURCC77S22G154I,L407,verbali,GA123 KW,Corso Alberto Pio,26040.0,MS,LIGURIA,RIPE SAN GINESIO,,...,Rocco,Mura,nero,deandrea@yahoo.com,+39-371-055-504980,"55.793,00 €",1935-04-01,,,41142.0
3,01561040245,G151,pagamento,ZB 199AS,,98066.0,BL,CALABRIA,OGGEBBIO,2024.0,...,Simona,,il reparto latticini,estrella@aol.com,+39-373-555-455,,1966-01-10,donna,1,41143.0
4,,E250,fattura,,Via A. Nielli,,BO,,GALLO MATESE,2023.0,...,Riccardo,Paolicelli,adesso,gberray@gmail.com,+39-311-055-5565,"50.516,00 €",,maschio,,41144.0
5,03072570785,E423,accertamento,ZB 199AS,Via La Masa,27020.0,SO,SICILIA,,2001.0,...,Gianfranca,,posso aiutarti?,cory.gibes@gmail.com,+39-385-555-789,"26.791,00 €",,uomo,False,41145.0
6,00938080348,G303,verbali,,Borgo Giuseppe Garibaldi,28877.0,RA,,SANTA MARIA NUOVA,2025.0,...,Delfina Vittoria,Leonardi,il cognato,lemuel.latzke@gmail.com,,"85.440,00 €",,,SI,41146.0
7,SCRNMR56P65I040V,A780,ingiunzione,,Via Nazionale Modica-ispica,46041.0,RC,MOLISE,,2026.0,...,Anna Maria,Scarpa,biondo,paris@hotmail.com,+39-324-555-868,,1986-03-27,,0,
8,PSCRSO89S47M092B,F176,,ZB 199AS,Via Col De Lys,,,TOSCANA,BELLANTE,,...,Rosa,Piscitelli,,nicolette_brossart@brossart.com,+39-397-555-2444,"77.168,00 €",1941-03-19,F,False,41148.0
9,,D812,verbali,ZB199 AS,Via Bell'italia,37060.0,CN,,SAN COLOMBANO AL LAMBRO,,...,Osaretin,,mille (mila),dorothy@cox.net,+39-368-555-423,"93.928,00 €",1961-09-13,femmina,false,41149.0


In [9]:
# Display the ground-truth labels as a frame for comparison with the predictions.
pd.DataFrame(y)

Unnamed: 0,CodiceFiscale,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,IVA,,Document,Plate,Address,CAP,ProvinceCode,,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
1,IVA,CodiceCatasto,,Plate,Address,CAP,ProvinceCode,,Municipality,,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,
2,CodiceFiscale,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,,,NumericId
3,IVA,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,Region,Municipality,Year,...,Name,,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
4,,CodiceCatasto,Document,,Address,,ProvinceCode,,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,Currency,,BiologicalSex,,NumericId
5,IVA,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,,Year,...,Name,,String,EMail,PhoneNumber,Currency,,BiologicalSex,Boolean,NumericId
6,IVA,CodiceCatasto,Document,,Address,CAP,ProvinceCode,,Municipality,Year,...,Name,Surname,String,EMail,,Currency,,,Boolean,NumericId
7,CodiceFiscale,CodiceCatasto,Document,,Address,CAP,ProvinceCode,Region,,Year,...,Name,Surname,String,EMail,PhoneNumber,,Date,,Boolean,
8,CodiceFiscale,CodiceCatasto,,Plate,Address,,,Region,Municipality,,...,Name,Surname,,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
9,,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,,Municipality,,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId


In [10]:
# Predicted type for every cell of the sample frame.
model.predict_dataframe(X)

Unnamed: 0,CodiceFiscale,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,IVA,,Document,Plate,Address,CAP,ProvinceCode,,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
1,IVA,CodiceCatasto,,Plate,Address,CAP,ProvinceCode,,Municipality,,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,
2,CodiceFiscale,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,,,NumericId
3,IVA,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,Region,Municipality,Year,...,Name,,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
4,,CodiceCatasto,Document,,Address,,ProvinceCode,,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,Currency,,BiologicalSex,,NumericId
5,IVA,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,,Year,...,Name,,String,EMail,PhoneNumber,Currency,,BiologicalSex,Boolean,NumericId
6,IVA,CodiceCatasto,Document,,Address,CAP,ProvinceCode,,Municipality,Year,...,Name,Surname,String,EMail,,Currency,,,Boolean,NumericId
7,CodiceFiscale,CodiceCatasto,Document,,Address,CAP,ProvinceCode,Region,,Year,...,Name,Surname,String,EMail,PhoneNumber,,Date,,Boolean,
8,CodiceFiscale,CodiceCatasto,,Plate,Address,,,Region,Municipality,,...,Name,Surname,,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
9,,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,,Municipality,,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId


In [11]:
# Tally the misclassified test entries as (true label, predicted label) pairs.
# `Counter` is imported in the notebook's top import cell.
mask = y_test != y_pred

# NOTE(review): reaches into private attributes (`_embedder._encoder`) to map
# encoded classes back to their names — switch to a public accessor if the
# predictor exposes one.
encoder = model._embedder._encoder
true_labels = encoder.inverse_transform(y_test[mask])
predicted_labels = encoder.inverse_transform(y_pred[mask])

Counter(zip(true_labels, predicted_labels))

Counter({('NaN', 'CountryCode'): 613,
         ('Surname', 'String'): 173,
         ('NaN', 'Boolean'): 535,
         ('Name', 'String'): 10,
         ('Boolean', 'Surname'): 2,
         ('Boolean', 'ProvinceCode'): 3,
         ('CountryCode', 'Name'): 2,
         ('CountryCode', 'ProvinceCode'): 4})