In [1]:
from collections import Counter
from multiprocessing import Pool, cpu_count

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from tqdm.auto import tqdm

from italian_csv_type_prediction.dataframe_generators import SimpleDatasetGenerator
from italian_csv_type_prediction.models import TypePredictor

In [2]:
def dataset_generation(number: int):
    """Build one dataset using a freshly created ``SimpleDatasetGenerator``.

    ``number`` is forwarded unchanged to ``SimpleDatasetGenerator.build``
    together with ``verbose=False``; whatever ``build`` returns is handed
    back to the caller as-is.
    """
    generator = SimpleDatasetGenerator()
    dataset = generator.build(number, verbose=False)
    return dataset

def _dataset_generation(args):
    """Call ``dataset_generation`` with the positional arguments packed in ``args``.

    The pool mapping used in this notebook hands each task to the worker as a
    single object, so the arguments travel as one tuple and are unpacked here.
    """
    result = dataset_generation(*args)
    return result

def parallel_dataset_generation(number: int):
    """Generate a dataset in parallel and stack the partial results.

    The requested ``number`` is split into at most ``cpu_count() * 5`` tasks
    that are dispatched to a process pool; each task calls
    ``_dataset_generation`` with its own share of ``number``.

    Parameters
    ----------
    number: int
        Total amount forwarded (in chunks) to the dataset generator.
        Must be strictly positive.

    Returns
    -------
    Tuple ``(X, y)`` where ``X`` is the ``np.vstack`` of the per-task feature
    arrays and ``y`` the ``np.concatenate`` of the per-task label arrays.

    Raises
    ------
    ValueError
        If ``number`` is not strictly positive.
    """
    if number <= 0:
        # Fail early with a clear message instead of an opaque
        # tuple-unpacking error on the empty result list.
        raise ValueError(
            "number must be a strictly positive integer, got {!r}".format(number)
        )

    # More tasks than workers keeps the progress bar smooth and balances load.
    tasks = min(cpu_count() * 5, number)

    # Spread the remainder over the first tasks so that the chunk sizes sum
    # exactly to `number` (plain floor division silently dropped it).
    base, extra = divmod(number, tasks)
    chunk_sizes = [base + 1] * extra + [base] * (tasks - extra)

    # Never spawn more worker processes than there are tasks to run.
    with Pool(min(cpu_count(), tasks)) as pool:
        results = list(tqdm(
            pool.imap(_dataset_generation, [(size,) for size in chunk_sizes]),
            desc="Creating dataset",
            total=tasks,
            leave=False
        ))

    # Transpose the list of (X, y) pairs into a tuple of Xs and a tuple of ys.
    Xs, ys = zip(*results)
    return np.vstack(Xs), np.concatenate(ys)

In [3]:
# Value passed as `number` to the parallel generator for each split; defined
# once so the train and test sets are always requested with the same setting.
DATASET_NUMBER = 5000

# Train and test data come from two separate generation runs.
x_train, y_train = parallel_dataset_generation(DATASET_NUMBER)
x_test, y_test = parallel_dataset_generation(DATASET_NUMBER)

HBox(children=(FloatProgress(value=0.0, description='Creating dataset', max=80.0, style=ProgressStyle(descript…

HBox(children=(FloatProgress(value=0.0, description='Creating dataset', max=80.0, style=ProgressStyle(descript…

In [4]:
# Create the project's TypePredictor with its default arguments.
model = TypePredictor()

# Fit the predictor on the generated training split.
model.fit(x_train, y_train)

In [5]:
# Predict labels for both splits so test and train scores can be compared.
# NOTE(review): predictions are taken from the object stored in the private
# `_model` attribute rather than through a public TypePredictor method —
# confirm this is intended and that no wrapper-level processing is skipped.
y_pred = model._model.predict(x_test)
y_train_pred = model._model.predict(x_train)

In [6]:
# (accuracy, balanced accuracy) of the predictions on the test split.
tuple(
    metric(y_test, y_pred)
    for metric in (accuracy_score, balanced_accuracy_score)
)

(0.9931567008966657, 0.9949513884476372)

In [7]:
# (accuracy, balanced accuracy) of the predictions on the training split.
tuple(
    metric(y_train, y_train_pred)
    for metric in (accuracy_score, balanced_accuracy_score)
)

(0.9965061420289458, 0.9983908087998286)

In [8]:
# Generate a single (X, y) pair directly from the generator; `y` is compared
# against the model's dataframe-level predictions further down.
X, y = SimpleDatasetGenerator().generate_simple_dataframe()

In [9]:
# Display the label frame that was generated alongside X.
y

Unnamed: 0,ItalianFiscalCode,ItalianVAT,CadastreCode,Document,Plate,Address,ItalianZIPCode,ProvinceCode,Region,Municipality,...,Name,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
0,ItalianFiscalCode,Error,CadastreCode,Document,Plate,,ItalianZIPCode,ProvinceCode,Region,Municipality,...,Name,,String,EMail,,Currency,Date,BiologicalSex,Boolean,NumericId
1,,ItalianVAT,CadastreCode,Document,Plate,Address,ItalianZIPCode,Error,Region,Municipality,...,Name,Surname,String,EMail,Error,Error,Date,BiologicalSex,Boolean,NumericId
2,ItalianFiscalCode,ItalianVAT,CadastreCode,Document,Plate,,,ProvinceCode,Region,Municipality,...,Error,Surname,String,EMail,PhoneNumber,Currency,Date,BiologicalSex,Boolean,NumericId
3,ItalianFiscalCode,ItalianVAT,CadastreCode,Document,Plate,Error,ItalianZIPCode,ProvinceCode,Region,Municipality,...,Name,,String,EMail,Error,Error,Date,Error,,NumericId
4,,ItalianVAT,CadastreCode,Document,Plate,Address,Error,ProvinceCode,Error,Municipality,...,,,String,EMail,,Currency,Date,Error,Boolean,
5,,ItalianVAT,CadastreCode,Document,Plate,Address,ItalianZIPCode,ProvinceCode,Region,Municipality,...,Error,Surname,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
6,ItalianFiscalCode,ItalianVAT,CadastreCode,Document,Plate,Address,,ProvinceCode,,Municipality,...,,,String,,PhoneNumber,Currency,Date,BiologicalSex,,
7,Error,ItalianVAT,CadastreCode,Document,Plate,,ItalianZIPCode,,Region,Municipality,...,Name,Surname,String,EMail,PhoneNumber,,Date,BiologicalSex,Boolean,NumericId
8,ItalianFiscalCode,ItalianVAT,CadastreCode,Error,,Address,ItalianZIPCode,ProvinceCode,Region,Municipality,...,Name,Surname,String,,PhoneNumber,Currency,Date,BiologicalSex,,Error
9,ItalianFiscalCode,ItalianVAT,CadastreCode,Document,Error,,,ProvinceCode,Region,,...,Name,Surname,String,Error,PhoneNumber,Currency,Date,,Boolean,NumericId


In [10]:
# Per-column check of the dataframe-level predictions against `y`.
# NOTE(review): `.any()` is True for a column as soon as ONE row matches, so
# this is only a smoke test; `.all()` or `.mean()` would show per-column
# correctness — confirm which one was intended.
(model.predict_dataframe(X) == y).any()

ItalianFiscalCode    True
ItalianVAT           True
CadastreCode         True
Document             True
Plate                True
Address              True
ItalianZIPCode       True
ProvinceCode         True
Region               True
Municipality         True
Year                 True
Integer              True
Float                True
Country              True
CountryCode          True
Name                 True
Surname              True
String               True
EMail                True
PhoneNumber          True
Currency             True
Date                 True
BiologicalSex        True
Boolean              True
NumericId            True
dtype: bool

In [11]:
# Select the test samples whose predicted label differs from the true one.
mask = y_test != y_pred

# NOTE(review): the label encoder is reached through private attributes of
# the predictor — confirm there is no public accessor for it.
label_encoder = model._embedder._encoder

# Map the labels back through the encoder's inverse transform.
true_labels = label_encoder.inverse_transform(y_test[mask])
predicted_labels = label_encoder.inverse_transform(y_pred[mask])

# Count every (true, predicted) pair among the misclassified samples.
Counter(zip(true_labels, predicted_labels))

Counter({('Integer', 'Error'): 600,
         ('Surname', 'Error'): 990,
         ('Error', 'ItalianZIPCode'): 178,
         ('Error', 'Name'): 2968,
         ('Error', 'Surname'): 2480,
         ('String', 'Error'): 1287,
         ('Error', 'String'): 4928,
         ('Name', 'Error'): 1487,
         ('Error', 'ProvinceCode'): 333,
         ('Error', 'Integer'): 802,
         ('Error', 'EMail'): 14,
         ('Error', 'Address'): 534,
         ('Error', 'NumericId'): 442,
         ('ProvinceCode', 'CountryCode'): 65,
         ('Error', 'CountryCode'): 465,
         ('Error', 'Date'): 13,
         ('NaN', 'CountryCode'): 414,
         ('Error', 'ItalianFiscalCode'): 153,
         ('PhoneNumber', 'Error'): 31,
         ('Address', 'Error'): 491,
         ('CountryCode', 'NaN'): 359,
         ('Error', 'Document'): 618,
         ('ItalianVAT', 'Error'): 129,
         ('Float', 'Error'): 33,
         ('EMail', 'Error'): 17,
         ('Plate', 'Error'): 20,
         ('CadastreCode', 'Error')