# Names Dataset with ExeTera

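This notebook demonstrates using ExeTera to load a dataset of given names from the ["Gender by Name Data Set"](https://archive.ics.uci.edu/ml/datasets/Gender+by+Name). The later cells investigate the byte-order mark (BOM) at the start of the downloaded CSV, which ends up in the first column name, and show how a file's encoding can be guessed from its BOM.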

Each row has four fields:
* Name: String
* Gender: M/F (category/string)
* Count: Integer, total number of instances of this name in the dataset
* Probability: Float, chance of a randomly drawn person from the population having this name


In [1]:
import sys
sys.path.insert(0,"..")


from urllib.request import urlretrieve

import os

# Remote location of the "Gender by Name" CSV and the local file name it is saved under.
SRC_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00591/name_gender_dataset.csv"
FILENAME = "name_gender_dataset.csv"

# Download only when no local copy exists, so re-running the notebook
# (Restart Kernel -> Run All) does not hit the network again.
# Delete FILENAME to force a fresh download.
if not os.path.exists(FILENAME):
    # Fetch to a temporary name and rename afterwards: an interrupted download
    # must never leave a truncated file that a later run would treat as the cache.
    _partial = FILENAME + ".part"
    urlretrieve(SRC_URL, _partial)
    os.replace(_partial, FILENAME)

In [2]:
%%writefile name_gender_dataset_schema.json

{
  "exetera": {
    "version": "1.0.0"
  },
  "schema": {
    "name_gender_dataset": {
      "primary_keys": [
        "Name"
      ],
      "fields": {
        "Name": {
          "field_type": "string"
        },
        "Gender": {
          "field_type": "categorical",
          "categorical": {
            "value_type": "int8",
            "strings_to_values": {
              "M": 1,
              "F": 2
            }
          }
        },
        "Count": {
          "field_type": "numeric",
          "value_type": "int32"
        },
        "Probability": {
          "field_type": "numeric",
          "value_type": "float32"
        }
      }
    }
  }
}

Overwriting name_gender_dataset_schema.json


In [3]:
from exetera.io import importer
from exetera.core import session
from datetime import datetime, timezone

# Run ExeTera's schema-driven importer inside a session that is closed on exit.
with session.Session() as s:
    # Every importer setting is collected in one mapping and expanded into the call.
    # The key of `files` ("name_gender_dataset") matches the table name declared
    # in name_gender_dataset_schema.json.
    import_settings = {
        "session": s,
        "timestamp": str(datetime.now(timezone.utc)),
        "dataset_name": "NameGender",
        "dest_file_name": "name_gender_dataset.hdf5",
        "schema_file": "name_gender_dataset_schema.json",
        "files": {"name_gender_dataset": "name_gender_dataset.csv"},
        # NOTE(review): presumably replaces an existing destination file -- confirm
        # against the ExeTera importer documentation.
        "overwrite": True,
    }
    importer.import_with_schema(**import_settings)

read_file_using_fast_csv_reader: 1 chunks, 147269 accumulated_written_rows parsed in 1.1086580753326416s
completed in 1.1138238906860352 seconds
Total time 1.1140117645263672s


In [40]:
import csv 
import unicodecsv


# Inspect the header of the downloaded CSV when the file is opened in binary mode.
# The recorded output shows the first field name as '\ufeffName': the file starts
# with a byte-order mark that is not removed on this path (str.strip() does not
# treat U+FEFF as whitespace, so it survives the strip below as well).
with open("name_gender_dataset.csv","rb") as o:
    csvf = unicodecsv.DictReader(o, delimiter=',', quotechar='"',)
    csvf_fieldnames = [k.strip() for k in csvf.fieldnames]
    print(csvf_fieldnames)

['\ufeffName', 'Gender', 'Count', 'Probability']


In [60]:

# Sample CSV payload: UTF-8 BOM (EF BB BF) + header line + one data row.
dat=b"\xef\xbb\xbfName,Gender,Count,Probability\r\nJames,M,5304407,0.014516787"
dat_nobom = dat[3:]            # the same payload without the three BOM bytes
dat_text = dat_nobom.decode()  # payload as str, ready to be re-encoded

# Fixture files written by this cell. The "utf-16" / "utf-32" encoders prepend
# their own BOM (2 and 4 bytes respectively); the *_nobom_* variants slice it off.
# test_bom.csv and test_nobom.csv must be written here too: they are read back
# below, so without them the cell fails on a fresh kernel.
TEST_FILES = {
    "test_bom.csv": dat,
    "test_nobom.csv": dat_nobom,
    "test_bom_utf16.csv": dat_text.encode("utf-16"),
    "test_bom_utf32.csv": dat_text.encode("utf-32"),
    "test_nobom_utf16.csv": dat_text.encode("utf-16")[2:],
    "test_nobom_utf32.csv": dat_text.encode("utf-32")[4:],
}

for path, payload in TEST_FILES.items():
    with open(path, "wb") as o:
        o.write(payload)


def read_fieldnames(path, encoding):
    """Return the whitespace-stripped CSV header names of a text file.

    Parameters
    ----------
    path : str
        CSV file to open.
    encoding : str
        Codec name used to decode the file (e.g. "utf-8-sig", "utf-16").

    Returns
    -------
    list of str
        The field names from the first row, each passed through str.strip().
    """
    # newline="" lets the csv module do its own line-ending handling.
    with open(path, "r", encoding=encoding, newline="") as o:
        csvf = csv.DictReader(o, delimiter=',', quotechar='"',)
        return [k.strip() for k in csvf.fieldnames]


# Read the headers back with a BOM-aware codec; every line printed should be the
# clean header without a leading '\ufeff'. The BOM-less UTF-16/UTF-32 files are
# only written as fixtures for the encoding-guessing cell and are not read here.
for path, encoding in [
    ("test_nobom.csv", "utf-8-sig"),
    ("test_bom.csv", "utf-8-sig"),
    ("test_bom_utf16.csv", "utf-16"),
    ("test_bom_utf32.csv", "utf-32"),
]:
    print(read_fieldnames(path, encoding))


['Name', 'Gender', 'Count', 'Probability']
['Name', 'Gender', 'Count', 'Probability']
['Name', 'Gender', 'Count', 'Probability']
['Name', 'Gender', 'Count', 'Probability']


In [62]:
from codecs import BOM_UTF8, BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE
from glob import glob
def guess_encoding(filename):
    """Guess a text file's encoding from the byte-order mark (BOM) at its start.

    Parameters
    ----------
    filename : str
        Path of the file to inspect; only its first four bytes are read.

    Returns
    -------
    str
        "utf-32", "utf-16" or "utf-8-sig" when the file starts with the
        corresponding BOM, otherwise "utf-8".

    Notes
    -----
    Only a BOM is detected. A UTF-16 or UTF-32 file written without a BOM
    falls through to the "utf-8" default, as does an empty file.
    """
    with open(filename,"rb") as o:
        dat=o.read(4)  # the longest BOM checked here (UTF-32) is four bytes

    # A BOM only counts at offset 0, hence startswith() rather than `in`.
    # UTF-32 is tested before UTF-16 because BOM_UTF32_LE begins with the
    # two bytes of BOM_UTF16_LE.
    if dat.startswith((BOM_UTF32_BE, BOM_UTF32_LE)):
        return "utf-32"
    if dat.startswith((BOM_UTF16_BE, BOM_UTF16_LE)):
        return "utf-16"
    if dat.startswith(BOM_UTF8):
        return "utf-8-sig"
    return "utf-8"
    

# Report the guessed encoding of every fixture file in the working directory.
# glob() returns matches in arbitrary order, so sort for a stable listing.
for fn in sorted(glob("test_*.csv")):
    print(fn,guess_encoding(fn))

test_bom.csv utf-8-sig
test_nobom_utf32.csv utf-8
test_bom_utf32.csv utf-32
test_nobom.csv utf-8
test_bom_utf16.csv utf-16
test_nobom_utf16.csv utf-8
