In [1]:
from italian_csv_type_prediction.dataframe_generators import SimpleDatasetGenerator
from italian_csv_type_prediction.models import TypePredictor
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
from sklearn.metrics import accuracy_score, balanced_accuracy_score

In [2]:
def dataset_generation(number:int):
    return SimpleDatasetGenerator().build(number, verbose=False)

def _dataset_generation(args):
    return dataset_generation(*args)

def parallel_dataset_generation(number:int):
    processes = min(cpu_count()*5, number)
    with Pool(cpu_count()) as p:
        Xs, ys = list(zip(*tqdm(
            p.imap(_dataset_generation, (
                (number//processes, )
                for _ in range(processes)
            )),
            total=processes,
            leave=False
        )))
    return np.vstack(Xs), np.concatenate(ys)

In [3]:
x_train, y_train = parallel_dataset_generation(100)
x_test, y_test = parallel_dataset_generation(100)

HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))



HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))



In [4]:
model = TypePredictor()

model.fit(x_train, y_train)

In [5]:
y_pred = model._model.predict(x_test)

In [6]:
accuracy_score(y_test, y_pred), balanced_accuracy_score(y_test, y_pred)

(0.9989603675241293, 0.9995147914535054)

In [7]:
from random_csv_generator import random_csv

df = random_csv(10)

In [8]:
X, y = SimpleDatasetGenerator().generate_simple_dataframe()

In [9]:
X

Unnamed: 0,IVA,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,Integer,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,RSTLND27R09A464J,esame,ZB199 AS,Via F.lli Rosselli,27010,VR,,SALAPARUTA,2024,-597311,...,,Rosati,il bagaglio,renea@hotmail.com,,"94.256,00 €",1950-03-27,M,falso,98716
1,,accertamento,ZB 199 AS,Viale Degli Alberghi,38013,LU,LAZIO,POLLUTRI,1998.0,-135170,...,,,la rampa,deeanna_juhas@gmail.com,+39-311-005-5503,"82.856,00 €",1929-04-30,maschio,0,98717
2,,accertamento,ZB 199AS,Via Puini Carlo,,PN,,MORNAGO,,-474271,...,Riccardo,,,norah.waymire@gmail.com,+39-320-555-855860,"55.597,00 €",1979-09-13,,vero,98718
3,GNTNTN74R20A772N,contratto,,Contrada Tracino,14010,AR,SARDEGNA,BATTAGLIA TERME,2004,,...,Antonio,,quadrato,nan@koppinger.com,+39-324-555-072885,"89.476,00 €",2000-02-16,uomo,no,98719
4,CPPPLA95D20I377N,bolletta,,Via Marchese Di Roccaforte,10080,BS,MARCHE,,2018,-269896,...,Paolo,Coppola,l'inverno prossimo,erick.ferencz@aol.com,+39-350-555-5456,"40.376,00 €",1946-06-25,femmina,NO,98720
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,12006631001,bolletta,,Via Ponchielli,13898,MO,,VALEGGIO SUL MINCIO,2015,,...,Giuseppina,Gabriele,,,,"89.751,00 €",,,1,98788
73,FRRNRC82B04A012U,ingiunzione,,Via Leonardo Sciascia,56020,,UMBRIA,LANUVIO,2001,,...,Enrico,Ferrari,lo specchietto retrovisore,,+39-373-555-445,"51.302,00 €",1989-12-29,femmina,,98789
74,BRNSRN29D53D310R,ingiunzione,,Via Giacomo Biga,98160,PO,LIGURIA,CANEGRATE,,-752949.0,...,Sharon,,le montagne,dottie@hellickson.org,+39-330-555-6852,"12.366,00 €",1935-06-03,donna,SI,98790
75,00308050699,ordinanza,GA 123 KW,,37058,IS,FRIULI VENEZIA GIULIA,GAVORRANO,,-619194,...,Abderrahmane,Pieri,con,golda_kaniecki@yahoo.com,,"14.092,00 €",1965-05-20,M,vero,98791


In [10]:
from italian_csv_type_prediction.column_types.single_type_column import DocumentType

In [11]:
DocumentType().validate(["accertamento", "fattura", "accertamento", "fattura", "accertamento", "fattura", "accertamento", "fattura", "Nan"])

[True, True, True, True, True, True, True, True, False]

In [12]:
pd.DataFrame(y)

Unnamed: 0,IVA,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,Integer,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,CodiceFiscale,Document,Plate,Address,CAP,ProvinceCode,,Municipality,Year,Integer,...,,Surname,String,EMail,,Currency,Date,BiologicalSex,Boolean,NumericId
1,,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,Integer,...,,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
2,,Document,Plate,Address,,ProvinceCode,,Municipality,,Integer,...,Name,,,EMail,PhoneNumber,Currency,Date,,Boolean,NumericId
3,CodiceFiscale,Document,,Address,CAP,ProvinceCode,Region,Municipality,Year,,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
4,CodiceFiscale,Document,,Address,CAP,ProvinceCode,Region,,Year,Integer,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,IVA,Document,,Address,CAP,ProvinceCode,,Municipality,Year,,...,Name,Surname,,,,Currency,,,Boolean,NumericId
73,CodiceFiscale,Document,,Address,CAP,,Region,Municipality,Year,,...,Name,Surname,String,,PhoneNumber,Currency,Date,BiologicalSex,,NumericId
74,CodiceFiscale,Document,,Address,CAP,ProvinceCode,Region,Municipality,,Integer,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
75,IVA,Document,Plate,,CAP,ProvinceCode,Region,Municipality,,Integer,...,Name,Surname,String,EMail,,Currency,Date,BiologicalSex,Boolean,NumericId


In [13]:
model.predict_dataframe(X)

Unnamed: 0,IVA,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,Integer,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,CodiceFiscale,Document,Plate,Address,CAP,ProvinceCode,,Municipality,Year,Integer,...,,Surname,String,EMail,,Currency,Date,BiologicalSex,Boolean,NumericId
1,,Document,Plate,Address,CAP,ProvinceCode,Region,Municipality,Year,Integer,...,,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
2,,Document,Plate,Address,,ProvinceCode,,Municipality,,Integer,...,Name,,,EMail,PhoneNumber,Currency,Date,,Boolean,NumericId
3,CodiceFiscale,Document,,Address,CAP,ProvinceCode,Region,Municipality,Year,,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
4,CodiceFiscale,Document,,Address,CAP,ProvinceCode,Region,,Year,Integer,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,IVA,Document,,Address,CAP,ProvinceCode,,Municipality,Year,,...,Name,Surname,,,,Currency,,,Boolean,NumericId
73,CodiceFiscale,Document,,Address,CAP,,Region,Municipality,Year,,...,Name,Surname,String,,PhoneNumber,Currency,Date,BiologicalSex,,NumericId
74,CodiceFiscale,Document,,Address,CAP,ProvinceCode,Region,Municipality,,Integer,...,Name,,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
75,IVA,Document,Plate,,CAP,ProvinceCode,Region,Municipality,,Integer,...,Name,Surname,String,EMail,,Currency,Date,BiologicalSex,Boolean,NumericId


In [15]:
from collections import Counter

mask = y_test != y_pred

true_labels = model._embedder._encoder.inverse_transform(y_test[mask])
predicted_labels = model._embedder._encoder.inverse_transform(y_pred[mask])

Counter(zip(true_labels, predicted_labels))

Counter({('Surname', 'String'): 20,
         ('NaN', 'Boolean'): 10,
         ('NaN', 'CountryCode'): 44})