## Test Runtime for Different Reading Methods

In [57]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime 
import gc
import re


################
# File Reading #
################

def read_config_file(filepath):
    """Load a YAML configuration file and return its parsed content.

    Parameters
    ----------
    filepath : str or path-like
        Location of the YAML file to read.

    Returns
    -------
    object or None
        The value produced by ``yaml.safe_load`` for the document.
        ``None`` is returned when the document is not valid YAML; in that
        case the parser error is logged via ``logging.error``.

    Raises
    ------
    OSError
        Propagated from ``open`` when the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's locale encoding.
    with open(filepath, 'r', encoding='utf-8') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)
            # Best-effort contract kept from the original: signal failure
            # with None instead of raising.
            return None


def replacer(string, char):
    """Collapse each run of two or more consecutive ``char`` into one ``char``.

    Parameters
    ----------
    string : str
        Text to clean up.
    char : str
        The character (or literal substring) whose repeated runs are
        collapsed. It is matched literally, never as a regular expression.

    Returns
    -------
    str
        ``string`` with every run of repeated ``char`` replaced by a single
        ``char``. An empty ``char`` leaves ``string`` unchanged.
    """
    if not char:
        # Nothing to collapse; also avoids building a degenerate pattern.
        return string
    # re.escape keeps metacharacters such as '.', '*' or '\\' literal; the
    # non-capturing group makes the quantifier apply to the whole of `char`.
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement is inserted verbatim, so a backslash in `char`
    # is not interpreted as a replacement-template escape.
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,table_config):
    '''
    Standardize the column names of ``df`` and validate them against the
    column list in ``table_config``.

    Standardization lower-cases every name, turns each run of non-word
    characters and/or underscores into a single ``_`` and strips leading and
    trailing ``_``. The standardized names are assigned back to
    ``df.columns``, so the caller's frame is renamed in place; its column
    order and data are left untouched.

    The comparison ignores order and case: the sorted standardized names must
    equal the sorted, lower-cased names in ``table_config['columns']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose header is validated (string column labels expected).
    table_config : dict
        Parsed configuration; must contain a ``'columns'`` list of names.

    Returns
    -------
    int
        ``1`` when the names match, ``0`` otherwise. On mismatch the
        differing names are printed and both lists are logged at INFO level.

    Raises
    ------
    KeyError
        If ``table_config`` has no ``'columns'`` entry.
    '''
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    # '[\W_]+' consumes a whole run at once, so replacing and collapsing
    # repeated underscores happen in a single vectorized pass.
    df.columns = (
        df.columns.str.lower()
        .str.replace(r'[\W_]+', '_', regex=True)
        .str.strip('_')
    )

    # Only the names need sorting; re-indexing the frame would copy all of
    # its data just to be thrown away.
    actual_col = sorted(df.columns)
    expected_col = sorted(name.lower() for name in table_config['columns'])

    # List equality already implies equal length.
    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    # Sorted so the report is deterministic from run to run.
    mismatched_columns_file = sorted(set(actual_col).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = sorted(set(expected_col).difference(actual_col))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {actual_col}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Overwriting testutility.py


In [58]:
%%writefile file.yaml
# Ingestion settings for the artists dataset, loaded with read_config_file().
# The notebook builds the source path as "./" + file_name + "." + file_type.
file_type: csv
dataset_name: artists
file_name: artists
table_name: artists_data
# Delimiter handed to pandas.read_csv when the source file is loaded.
inbound_delimiter: ","
# NOTE(review): not read by the code shown in this notebook; presumably the
# delimiter for files written downstream - confirm.
outbound_delimiter: "|"
# NOTE(review): not read by the code shown in this notebook; presumably the
# number of header rows to skip - confirm.
skip_leading_rows: 1
# Expected column names. col_header_val() compares them with the file header
# case-insensitively and regardless of order.
columns: 
    - id
    - name
    - years
    - genre
    - bio
    - wikipedia
    - nationality
    - paintings

Overwriting file.yaml


In [59]:
import testutility as util

# read_config_file logs the parser error and returns None when the YAML is
# invalid (an empty document also parses to None). Fail here with a clear
# message rather than with a TypeError in a later cell.
config_data = util.read_config_file("file.yaml")
assert config_data is not None, "file.yaml is empty or is not valid YAML"

In [60]:
# Show the expected column names parsed from file.yaml.
config_data['columns']

['id',
 'name',
 'years',
 'genre',
 'bio',
 'wikipedia',
 'nationality',
 'paintings']

In [61]:
# Show the complete parsed configuration.
config_data

{'file_type': 'csv',
 'dataset_name': 'artists',
 'file_name': 'artists',
 'table_name': 'artists_data',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['id',
  'name',
  'years',
  'genre',
  'bio',
  'wikipedia',
  'nationality',
  'paintings']}

In [69]:
import pandas as pd

# Load the whole comma-separated sample file into memory.
df_sample = pd.read_csv("artists_large.csv", sep=',')

In [73]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV -
# confirm). errors="ignore" keeps this cell idempotent: re-running it after
# the column is already gone no longer raises KeyError.
df_sample = df_sample.drop(columns="Unnamed: 0", errors="ignore")

In [75]:
# (rows, columns) of the sample frame.
df_sample.shape

(5000000, 8)

In [76]:
# Read the source file described by the config file.
file_type = config_data['file_type']
source_file = f"./{config_data['file_name']}.{file_type}"
# The delimiter must be passed by keyword: positional arguments after the
# path are deprecated in pandas 1.x (FutureWarning) and rejected in 2.x.
df = pd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,name,years,genre,nationality,bio,wikipedia,paintings
0,0,Amedeo Modigliani,1884 - 1920,Expressionism,Italian,Amedeo Clemente Modigliani (Italian pronunciat...,http://en.wikipedia.org/wiki/Amedeo_Modigliani,193
1,1,Vasiliy Kandinskiy,1866 - 1944,"Expressionism,Abstractionism",Russian,Wassily Wassilyevich Kandinsky (Russian: Васи́...,http://en.wikipedia.org/wiki/Wassily_Kandinsky,88
2,2,Diego Rivera,1886 - 1957,"Social Realism,Muralism",Mexican,Diego María de la Concepción Juan Nepomuceno E...,http://en.wikipedia.org/wiki/Diego_Rivera,70
3,3,Claude Monet,1840 - 1926,Impressionism,French,Oscar-Claude Monet (; French: [klod mɔnɛ]; 14 ...,http://en.wikipedia.org/wiki/Claude_Monet,73
4,4,Rene Magritte,1898 - 1967,"Surrealism,Impressionism",Belgian,René François Ghislain Magritte (French: [ʁəne...,http://en.wikipedia.org/wiki/René_Magritte,194


In [77]:
# Validate the file header against the YAML column list: 1 = match, 0 = mismatch.
# Side effect: df's column names are standardized in place (lower-cased,
# non-word characters replaced by "_").
util.col_header_val(df,config_data)

column name and column length validation passed


1

In [78]:
# Show both column lists side by side. Their order may differ; col_header_val
# compares the sorted names, so order does not affect validation.
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['id', 'name', 'years', 'genre', 'nationality', 'bio', 'wikipedia',
       'paintings'],
      dtype='object')
columns of YAML are: ['id', 'name', 'years', 'genre', 'bio', 'wikipedia', 'nationality', 'paintings']


In [79]:
# col_header_val returns 0 on mismatch; any other value means the header is valid.
if util.col_header_val(df, config_data) != 0:
    print("col validation passed")
    # write the code to perform further action
    # in the pipeline
else:
    print("validation failed")
    # write code to reject the file

column name and column length validation passed
col validation passed


In [82]:
### Build a small pipe-delimited test file for this demo.
import pandas as pd

testdata = dict(
    name=['Amedeo Modigliani', 'Diego Rivera', 'Rene Magritte'],
    years=["1884 - 1920", "1886 - 1957", "1898 - 1967"],
    genre=['Expressionism', 'Social Realism,Muralism', 'Surrealism,Impressionism'],
    paintings=[193, 70, 194],
)
# Column order follows the insertion order of the dict keys.
df_ = pd.DataFrame(testdata, columns=list(testdata))
df_.to_csv("test_data.csv", sep='|', index=False)

## Summary of the file:

#### Total number of rows: 5,000,000

#### Total number of columns: 8

#### File size: 3.78 GB