# Automatic Data Type Discovery

In [1]:
from scipy.io import savemat
import openml as oml
import pandas as pd
import numpy as np
import random

## 1. Heuristic Methods

In [8]:
def _classify_value(value):
    """
    Classify a single value as "date", "float64", "int64" or "string"
    """
    try:
        # len() and slicing only succeed on string-like values; numbers raise
        # TypeError and fall through to the numeric checks below.
        # A date is at most 10 characters long with '-' or '/' separators at
        # positions 2 and 5 (e.g. 19-10-2018) or 4 and 7 (e.g. 2018-10-19).
        if (len(value) <= 10
                and (((value[2:3] == '-' or value[2:3] == '/')
                      and (value[5:6] == '-' or value[5:6] == '/'))
                     or ((value[4:5] == '-' or value[4:5] == '/')
                         and (value[7:8] == '-' or value[7:8] == '/')))):
            return "date"
        return "string"
    except (TypeError, ValueError, IndexError):
        pass

    try:
        int(value)  # succeeds only for finite numeric values
    except OverflowError:
        # int(inf) / int(-inf): the value is numeric but can only be a float
        return "float64"
    except (TypeError, ValueError, IndexError):
        # NaN, None and other non-numeric objects
        return "string"

    if '.' in str(value):
        # A 64 bit float holding a whole number (e.g. 3.0) counts as an integer
        if isinstance(value, np.float64) and value.is_integer():
            return "int64"
        return "float64"
    return "int64"


def infer_feature_type(feature):
    """
    Infer data types for the given feature using simple logic
    Possible data types to infer: boolean, date, float, integer, string
    Feature that is not either a boolean, a date, a float or an integer, is classified as a string
    ---
    input:

    feature : pandas Series (or another sequence supporting len() and integer indexing)

    output:

    one of "bool", "date", "float64", "int64", "string"
        A feature with exactly two unique values is a "bool". Otherwise a random
        sample of 10% of the values (at least one value) is classified and the
        most frequent type wins; ties are resolved in the order
        date, float64, int64, string. An empty feature is a "string".
    """
    types = ["date", "float64", "int64", "string"]
    feature_len = len(feature)

    # If the feature only contains two different unique values, then infer it as boolean
    if len(pd.unique(feature)) == 2:
        return "bool"

    # Always access values by position: label-based feature[i] fails (or reads
    # the wrong row) when the Series does not carry a default 0..n-1 index
    values = feature.iloc if isinstance(feature, pd.Series) else feature

    # Check 10% of the values, but never zero of them: with an empty sample every
    # weight would stay at 0 and the feature would wrongly come out as "date"
    sample_size = min(feature_len, max(1, int(0.1 * feature_len)))
    indices = random.sample(range(feature_len), sample_size)
    if not indices:
        return "string"

    weights = dict.fromkeys(types, 0)  # Votes per data type
    for i in indices:
        weights[_classify_value(values[i])] += 1

    # max() returns the first maximal entry, so ties follow the order of `types`
    return max(types, key=weights.get)

def discover_type_heuristic(data):
    """
    Automatically infer data types for each feature in the dataframe using simple logic
    ---
    input:

    data : numpy array or dataframe
        Numeric numpy arrays are upcast to 64 bit floats before inference;
        non-numeric arrays are used as they are. Anything that is not already
        a dataframe is wrapped in one.

    output:

    list with one inferred type name per column, in column order
    """
    if isinstance(data, np.ndarray):
        try:
            # convert float32 (and other numeric dtypes) to float64
            data = np.array(data, dtype='float64')
        except (TypeError, ValueError):
            # Non-numeric content (e.g. strings) cannot be cast; keep it unchanged
            pass

    df = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)

    # Select columns by position: df[label] returns a whole DataFrame instead of
    # a single feature when column labels are duplicated
    return [infer_feature_type(df.iloc[:, position]) for position in range(df.shape[1])]

### 1.1 Get data from csv

In [24]:
# Load the tips dataset from a local CSV file and preview its first rows
df = pd.read_csv("datasets_csv/tips.csv")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [25]:
discover_type_heuristic(df)

['float64', 'float64', 'bool', 'bool', 'string', 'bool', 'int64']

### 1.2 Get data from openml

In [98]:
# Fetch the OpenML dataset with id 1480, load its data and show the shape
data = oml.datasets.get_dataset(1480)
Xy = data.get_data()
Xy.shape

(583, 11)

In [99]:
discover_type_heuristic(Xy)

['int64',
 'bool',
 'float64',
 'float64',
 'int64',
 'int64',
 'int64',
 'float64',
 'float64',
 'float64',
 'bool']

## 2. Bayesian Method
To apply the Bayesian method, the dataset needs to be parsed into tabular numeric form beforehand.

### Get data from openml

In [2]:
# Fetch the OpenML dataset with id 40498, load its data and show the shape
data = oml.datasets.get_dataset(40498)
Xy = data.get_data()
Xy.shape

(4898, 12)

### Generate mat file

In [16]:
def generate_mat(data, output_path='bayesian/data/temp/data.mat'):
    """
    Build the .mat input file for the bayesian model from a numeric data matrix
    ---
    input:

    data : 2-D numeric numpy array (one column per feature)
    output_path : location of the .mat file to write

    The file contains three entries:
        'X' : the data matrix
        'T' : meta type of each feature
        'R' : number of unique values for discrete features, 1 for all others

    raises ValueError if a feature is inferred as non-numeric ("date" or "string"),
    because such a feature has no meta type and would shift 'T' and 'R' out of
    line with the columns of 'X'
    """
    simple_types = discover_type_heuristic(data)
    matrix = np.asarray(data)  # no copy for ndarray input

    # map simple types to meta types
    # 1: real (w positive: all real | positive | interval)
    # 2: real (w/o positive: all real | interval)
    # 3: binary data
    # 4: discrete (non-binary: categorical | ordinal | count)
    meta_types = []
    discrete_cardinality = []  # number of unique for discrete feature, 1 for others
    for i, simple_type in enumerate(simple_types):
        column = matrix[:, i]
        if simple_type == "bool":
            meta_type = 3
        elif simple_type == "int64" or simple_type == "float64":
            n_unique = len(set(column))
            # Few distinct values, both relative (< 5% of the rows) and
            # absolute (< 50), mark a numeric feature as discrete
            if n_unique < 0.05 * len(column) and n_unique < 50:
                meta_type = 4
            elif (column > 0).all():
                meta_type = 1
            else:
                meta_type = 2
        else:
            raise ValueError(
                "feature {} was inferred as '{}'; only numeric features can be "
                "mapped to a meta type".format(i, simple_type))
        # Both lists are filled in the same pass so they stay aligned with the columns
        meta_types.append(meta_type)
        discrete_cardinality.append(n_unique if meta_type == 4 else 1)

    data_dict = {'X': matrix,
                 'T': np.asarray(meta_types),
                 'R': np.asarray(discrete_cardinality)}
    print(data_dict)
    savemat(output_path, data_dict, oned_as='row')

In [17]:
generate_mat(Xy)

{'X': array([[ 7.  ,  0.27,  0.36, ...,  0.45,  8.8 ,  3.  ],
       [ 6.3 ,  0.3 ,  0.34, ...,  0.49,  9.5 ,  3.  ],
       [ 8.1 ,  0.28,  0.4 , ...,  0.44, 10.1 ,  3.  ],
       ...,
       [ 6.5 ,  0.24,  0.19, ...,  0.46,  9.4 ,  3.  ],
       [ 5.5 ,  0.29,  0.3 , ...,  0.38, 12.8 ,  4.  ],
       [ 6.  ,  0.21,  0.38, ...,  0.32, 11.8 ,  3.  ]], dtype=float32), 'T': array([1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4]), 'R': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7])}


### Call bayesian model

In [13]:
from bayesian.bin import abda

  from ._conv import register_converters as _register_converters


In [21]:
# Run the bayesian model (abda) on the generated .mat file for a single
# iteration (args_n_iters=1) with a fixed seed; the returned value is kept
# in `weights` and displayed in a later cell.
# NOTE(review): the meaning of the remaining args_* options is defined inside
# bayesian.bin.abda and is not visible here — confirm before tuning them.
weights = abda.main(seed=1337, dataset='bayesian/data/temp/data.mat', exp_id=None, args_output='./exp/temp/', args_miss=None, verbose=1,
         args_col_split_threshold=0.8, args_min_inst_slice=500, args_leaf_type='pm',
         args_type_param_map='spicky-prior-1', args_param_init='default', args_param_weight_init='uniform',
         args_n_iters=1, args_burn_in=4000, args_w_unif_prior=100, args_save_samples=1,
         args_ll_history=1, args_omega_prior='uniform', args_plot_iter=10, args_omega_unif_prior=10,
         args_leaf_omega_unif_prior=0.1, args_cat_unif_prior=1)


Loaded bayesian/data/temp/data.mat
	with shape: (4898, 12)
	meta-types:[1 1 2 1 1 1 1 1 1 1 1 4]
	maximal discrete cardinality: [1 1 1 1 1 1 1 1 1 1 1 7]
	meta types [<MetaType.REAL: 1> <MetaType.REAL: 1> <MetaType.REAL: 1>
 <MetaType.REAL: 1> <MetaType.REAL: 1> <MetaType.REAL: 1>
 <MetaType.REAL: 1> <MetaType.REAL: 1> <MetaType.REAL: 1>
 <MetaType.REAL: 1> <MetaType.REAL: 1> <MetaType.DISCRETE: 3>]
	domains [array([ 3.79999995, 14.19999981]), array([0.08      , 1.10000002]), array([0.        , 1.65999997]), array([ 0.60000002, 65.80000305]), array([0.009     , 0.34599999]), array([  2., 289.]), array([  9., 440.]), array([0.98711002, 1.03898001]), array([2.72000003, 3.81999993]), array([0.22      , 1.08000004]), array([ 8.        , 14.19999981]), array([0, 1, 2, 3, 4, 5, 6], dtype=int64)]
	translated discrete features (starting from 0)
MT MetaType.REAL OrderedDict([(<Type.REAL: (1, <MetaType.REAL: 1>)>, OrderedDict([(<class 'bayesian.spn.structure.leaves.parametric.Parametric.Gaussia

{'params': {'alpha': 1.9967129409128155, 'beta': 1}, 'prior': <bayesian.spn.algorithms.Posteriors.PriorGamma object at 0x000000250F5E7128>}
{'params': {'l': 2}, 'prior': <bayesian.spn.algorithms.Posteriors.PriorGamma object at 0x000000250F5E7438>}
gamma fit params (10.443556298462036, 0, 0.003776674490135308)
{'params': {'mean': 0, 'stdev': 1}, 'prior': <bayesian.spn.algorithms.Posteriors.PriorNormalInverseGamma object at 0x000000250F5E77F0>}
{'params': {'alpha': 10.443556298462036, 'beta': 1}, 'prior': <bayesian.spn.algorithms.Posteriors.PriorGamma object at 0x000000250F5E7400>}
{'params': {'l': 2}, 'prior': <bayesian.spn.algorithms.Posteriors.PriorGamma object at 0x00000025146B5860>}
gamma fit params (4.007340933194108, 0, 7.1208967527828975)
{'params': {'mean': 0, 'stdev': 1}, 'prior': <bayesian.spn.algorithms.Posteriors.PriorNormalInverseGamma object at 0x000000250F61B940>}
{'params': {'alpha': 4.007340933194108, 'beta': 1}, 'prior': <bayesian.spn.algorithms.Posteriors.PriorGamma o

 0.00149421])) + 0.3333333333333333*(Geometric(V11|p=0.5)) + 0.3333333333333333*(Poisson(V11|mean=2))))))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Done iteration 1/1 in 0.24897165965057866


------------------
Done in 0.26894672683806675
Dumped id to ./exp/temp/data.mat\20181019-013914\global-W.pklz
Dumped id to ./exp/temp/data.mat\20181019-013914\global-type-W.pklz
seeeeeeeeeeeeeeeeeeeeeeeeeee
./exp/temp/data.mat\20181019-013914


In [19]:
weights

{0: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.0029031102398664257,
  <Type.POSITIVE: (3, <MetaType.REAL: 1>)>: 0.9970968897601337},
 1: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.25956414639120895,
  <Type.POSITIVE: (3, <MetaType.REAL: 1>)>: 0.740435853608791},
 2: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.18659193346501912,
  <Type.POSITIVE: (3, <MetaType.REAL: 1>)>: 0.8134080665349811},
 3: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.21373986328127573,
  <Type.POSITIVE: (3, <MetaType.REAL: 1>)>: 0.7862601367187244},
 4: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.19188235951493987,
  <Type.POSITIVE: (3, <MetaType.REAL: 1>)>: 0.8081176404850602},
 5: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.0012366027546818928,
  <Type.POSITIVE: (3, <MetaType.REAL: 1>)>: 0.9987633972453183},
 6: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.001235951670606022,
  <Type.POSITIVE: (3, <MetaType.REAL: 1>)>: 0.9987640483293941},
 7: {<Type.REAL: (1, <MetaType.REAL: 1>)>: 0.40433179047769985,
  <Type.POSITIVE: (3, <MetaTy