In [1]:
from italian_csv_type_prediction.dataframe_generators import SimpleDatasetGenerator
from italian_csv_type_prediction.models import TypePredictor
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [2]:
def dataset_generation(number:int):
    """Build one dataset with a freshly constructed SimpleDatasetGenerator.

    Parameters
    ----------
    number: int
        Size argument forwarded unchanged to ``SimpleDatasetGenerator.build``.

    Returns
    -------
    Whatever ``build(number, verbose=False)`` returns.
    """
    generator = SimpleDatasetGenerator()
    return generator.build(number, verbose=False)

def _dataset_generation(args):
    """Call ``dataset_generation`` with the positional arguments packed in ``args``.

    This adapter lets single-argument mapping APIs (such as ``Pool.imap``)
    drive ``dataset_generation`` by passing one tuple per task.
    """
    generated = dataset_generation(*args)
    return generated

def parallel_dataset_generation(number:int):
    """Generate a dataset in parallel and stack the partial results.

    The requested ``number`` is split into at most ``cpu_count() * 5`` tasks
    whose sizes add up to exactly ``number``; each task is executed by
    ``_dataset_generation`` in a worker process.

    Parameters
    ----------
    number: int
        Total size to request across all tasks. Must be at least 1.

    Returns
    -------
    Tuple ``(X, y)`` where ``X`` is the ``np.vstack`` of the per-task feature
    arrays and ``y`` the ``np.concatenate`` of the per-task label arrays.

    Raises
    ------
    ValueError
        If ``number`` is smaller than 1 (there would be nothing to stack).
    """
    if number < 1:
        raise ValueError(
            "number must be a positive integer, got {!r}".format(number)
        )

    workers = cpu_count()
    # More tasks than workers keeps the progress bar informative and evens
    # out the load, but never create more tasks than requested items.
    tasks = min(workers*5, number)

    # Spread the remainder over the first `extra` tasks so the per-task
    # sizes sum to `number`; plain floor division silently dropped
    # `number % tasks` items.
    base, extra = divmod(number, tasks)
    chunk_sizes = [
        base + 1 if index < extra else base
        for index in range(tasks)
    ]

    # No point in spawning more worker processes than there are tasks.
    with Pool(min(workers, tasks)) as p:
        Xs, ys = zip(*tqdm(
            p.imap(_dataset_generation, (
                (size, )
                for size in chunk_sizes
            )),
            total=tasks,
            leave=False
        ))
    return np.vstack(Xs), np.concatenate(ys)

In [3]:
# Generate the training and the test set with two separate calls to the
# parallel generator, each requesting the same size (1000).
# NOTE(review): no random seed is set in the visible cells, so the data and
# the metrics below may differ between runs - confirm whether the generator
# is seeded internally.
x_train, y_train = parallel_dataset_generation(1000)
x_test, y_test = parallel_dataset_generation(1000)

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))

In [4]:
# Create the predictor with its default constructor arguments.
model = TypePredictor()

# Fit it on the generated training features and labels.
model.fit(x_train, y_train)

In [5]:
# Predict on the generated test features by calling the predictor's inner
# `_model` directly.
# NOTE(review): this reaches into a private attribute; presumably the public
# prediction methods expect raw dataframes rather than generated feature
# arrays - confirm, and prefer a public method if one exists.
y_pred = model._model.predict(x_test)

In [6]:
# Display plain accuracy and balanced accuracy on the test set as a tuple.
accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)

(0.9989100403031609, 0.9993955489434705)

In [7]:
# Build one example dataframe (X) together with its labels (y) for a
# qualitative, side-by-side check of the predictions.
X, y = SimpleDatasetGenerator().generate_simple_dataframe()

In [8]:
# Render the ground-truth labels of the example dataframe as a table.
pd.DataFrame(y)

Unnamed: 0,CodiceFiscale,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,IVA,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,,Year,...,Name,Surname,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
1,,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,,Municipality,Year,...,,Surname,String,EMail,PhoneNumber,Currency,Date,,Boolean,
2,IVA,,Document,Plate,Address,CAP,,Region,Municipality,,...,,Surname,,EMail,PhoneNumber,,,BiologicalSex,,NumericId
3,CodiceFiscale,CodiceCatasto,Document,Plate,Address,,ProvinceCode,Region,Municipality,Year,...,Name,Surname,String,,,Currency,Date,BiologicalSex,Boolean,NumericId
4,,CodiceCatasto,,Plate,Address,CAP,,Region,Municipality,Year,...,Name,Surname,,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
5,IVA,,Document,,Address,CAP,ProvinceCode,Region,Municipality,,...,,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
6,CodiceFiscale,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,,Municipality,Year,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
7,CodiceFiscale,CodiceCatasto,Document,Plate,Address,,ProvinceCode,Region,Municipality,,...,,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,
8,CodiceFiscale,CodiceCatasto,Document,,Address,,ProvinceCode,Region,Municipality,Year,...,Name,Surname,,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
9,,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,Region,Municipality,Year,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId


In [9]:
# Render the labels predicted for the same example dataframe, to compare
# against the ground-truth table shown by the previous cell.
model.predict_dataframe(X)

Unnamed: 0,CodiceFiscale,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,IVA,CodiceCatasto,Document,Plate,Address,CAP,ProvinceCode,Region,,Year,...,Name,Surname,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
1,,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,,Municipality,Year,...,,Surname,String,EMail,PhoneNumber,Currency,Date,,Boolean,
2,IVA,,Document,Plate,Address,CAP,,Region,Municipality,,...,,Surname,,EMail,PhoneNumber,,,BiologicalSex,,NumericId
3,CodiceFiscale,CodiceCatasto,Document,Plate,Address,,ProvinceCode,Region,Municipality,Year,...,Name,Surname,String,,,Currency,Date,BiologicalSex,Boolean,NumericId
4,,CodiceCatasto,,Plate,Address,CAP,,Region,Municipality,Year,...,Name,Surname,,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
5,IVA,,Document,,Address,CAP,ProvinceCode,Region,Municipality,,...,,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
6,CodiceFiscale,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,,Municipality,Year,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
7,CodiceFiscale,CodiceCatasto,Document,Plate,Address,,ProvinceCode,Region,Municipality,,...,,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,
8,CodiceFiscale,CodiceCatasto,Document,,Address,,ProvinceCode,Region,Municipality,Year,...,Name,Surname,,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
9,,CodiceCatasto,Document,Plate,,CAP,ProvinceCode,Region,Municipality,Year,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId


In [10]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so all dependencies are visible in one place.
from collections import Counter

# Boolean mask selecting the test samples whose prediction differs from the
# true label.
mask = y_test != y_pred

# Decode the misclassified labels with the embedder's private `_encoder`
# (inverse_transform) so the tally below is keyed by label names.
# NOTE(review): relies on private attributes of the model - confirm there is
# no public accessor for the label encoder.
true_labels = model._embedder._encoder.inverse_transform(y_test[mask])
predicted_labels = model._embedder._encoder.inverse_transform(y_pred[mask])

# Count every (true label, predicted label) pair among the errors.
Counter(zip(true_labels, predicted_labels))

Counter({('Surname', 'String'): 57,
         ('Name', 'String'): 429,
         ('NaN', 'CountryCode'): 418,
         ('NaN', 'Boolean'): 425,
         ('CountryCode', 'ProvinceCode'): 4})