## Person Entity Data
 
----

In [1]:
%pip install datasets --quiet
%pip install transformers --quiet

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import datasets 
from transformers import BertTokenizer

# squad_ds_list = [ ds for ds in datasets.list_datasets() if 'squad' in ds.lower()  ]

In [14]:
## Example of loading data with streaming = True
dataset = datasets.load_dataset('squad', streaming=True)

print(list(dataset.keys()))
print(dataset['train'].description)
print(f"The Length of Data is : {dataset['train'].dataset_size: ,} records")
print(dataset['train'].features)

## define the transformation to dataset['train'] here
dataset['train'] = dataset['train'].map(
    lambda x: {
        'id': x['id'],
        'answers': { 
            **x['answers'], 
            **{
                'answer_end': [ x['answers']['answer_start'][0] + \
                               len(x['answers']['text'][0]) ]
            }
        },
        'context': x['context'],
        'question': x['question'],
        'title': x['title']
    }
)

## Since streaming true then beaware of lazy exec
print("Printing Samples from Dataset")
for i, sample in enumerate(dataset['train']):
    print(sample['answers'], '\n')
    if i > 4: break

['train', 'validation']
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.

The Length of Data is :  89,819,092 records
{'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}
Printing Samples from Dataset
{'text': ['Saint Bernadette Soubirous'], 'answer_start': [515], 'answer_end': [541]} 

{'text': ['a copper statue of Christ'], 'answer_start': [188], 'answer_end': [213]} 

{'text': ['the Main Building'], 'answer_start': [279], 'answer_end': [296]} 

{'text': ['a Marian place of prayer and re

In [4]:
## Example of same as above when streaming is False
dataset = datasets.load_dataset('squad', streaming=False)

print(list(dataset.keys()))
print(dataset['train'].description)
print(f"The Length of Data is : {dataset['train'].dataset_size: ,} records")
print(dataset['train'].features)

## define the transformation to dataset['train'] here
dataset['train'] = dataset['train'].map(
    lambda x: {
        'answers': { 
            **x['answers'], 
            **{
                'answer_end': [ x['answers']['answer_start'][0] + \
                               len(x['answers']['text'][0]) ]
            }
        }
    }
)

print(f"Printing Features from train dataset Dataset: {dataset['train'].features}")

# rename column
dataset['train'] = dataset['train'].rename_column('title', 'topic') 

# filter 
dataset['train'] = dataset['train'].filter(
    lambda x: x['topic'] == 'University_of_Notre_Dame' 
)

['train', 'validation']
Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.

The Length of Data is :  89,819,092 records
{'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)}


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Printing Features from train dataset Dataset: {'id': Value(dtype='string', id=None), 'title': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'question': Value(dtype='string', id=None), 'answers': {'answer_end': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'answer_start': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), 'text': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}}


Filter:   0%|          | 0/87599 [00:00<?, ? examples/s]

In [5]:
len(dataset['train'])

269

In [23]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

dataset['train'] = dataset['train'].map(
    lambda x: tokenizer(
        x['question'], 
        x['context'],
        max_length = 512,
        padding = 'max_length',
        truncation = True
    ), batched = True, batch_size=32
)

Map:   0%|          | 0/269 [00:00<?, ? examples/s]

----

In [2]:
from pyent.datasets import remove_nan, sample_xy, generate_febrl_data, train_test_validate_strat_split

In [3]:
d = remove_nan(generate_febrl_data())
d.head()

Before Dropping NaN's shape of data is (86506, 23)
After Dropping NaN's shape of data is (52560, 24)


Unnamed: 0,index,rec_idL,rec_idR,given_name_l,surname_l,street_number_l,address_1_l,address_2_l,suburb_l,postcode_l,...,surname_r,street_number_r,address_1_r,address_2_r,suburb_r,postcode_r,state_r,date_of_birth_r,soc_sec_id_r,labels
0,2,rec-1866-org,rec-27-dup-0,nathan,campbell,14,la perouse street,st francis room,glengowrie,6148,...,campbell,190,jackie howe crescent,bugoren,woorim,6352,nsw,19531108,8948230,no_match
1,3,rec-2941-org,rec-1744-dup-0,liam,green,2,benny place,the gums,crescent head,3067,...,green,32,sid barnes crescent,dudley specialistm edical centre,wingfield,2027,qld,19860328,4528322,no_match
2,5,rec-3748-org,rec-2550-dup-0,iain,noble,84,rischbieth crescent,the big tree,albany creek,6391,...,noble,16,torrens street,old hiloside,rose bay,4510,nsw,19050728,6816111,no_match
3,6,rec-1595-org,rec-4413-dup-0,gus,white,109,bundey street,ingevale,clayton,6155,...,white,178,carliles treet,laureldale,harris park,6180,nsw,19470605,1655664,no_match
4,7,rec-4919-org,rec-2013-dup-0,mia,jolly,15,findlay street,hayfield vlge,kincumber,3995,...,jolly,25,shout place,jinmara,eaglehawk,2229,nsw,19381031,3608034,no_match


In [4]:
# noinspection PyTupleAssignmentBalance
features_train, features_test, features_validate, targets_train, targets_test, targets_validate = sample_xy(
    X=d.loc[:, list(d.columns[:-1])], y=d.loc[:, d.columns[-1]], num=100) 

ValueError: not enough values to unpack (expected 6, got 2)

In [14]:
??sample_xy

[0;31mSignature:[0m
[0msample_xy[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mX[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0my[0m[0;34m:[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mseries[0m[0;34m.[0m[0mSeries[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnum[0m[0;34m:[0m [0mint[0m [0;34m|[0m [0;32mNone[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0mTuple[0m[0;34m[[0m[0mAny[0m[0;34m,[0m [0mpandas[0m[0;34m.[0m[0mcore[0m[0;34m.[0m[0mframe[0m[0;34m.[0m[0mDataFrame[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
sample any df or series by record count and series val

:param num: this is an integer that
:param y: this is the column representing the target
:param X: this is the dataframe of features

:returns Tuple[pandas.DataFrame, pandas.DataFrame]: this is the 2 
    