In [1]:
# Mount Google Drive at /content/drive (Colab-only); the data paths used in
# later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#!pip install opendatasets --upgrade

In [3]:
%%writefile testutility.py
import datetime
import logging
import re

import opendatasets as od
import pandas as pd
import yaml

def read_config_file(filepath):
    """Load a YAML configuration file and return its parsed content.

    Parameters
    ----------
    filepath : str or path-like
        Location of the YAML file to read.

    Returns
    -------
    object
        Whatever ``yaml.safe_load`` produces for the document (a ``dict``
        when the top level of the file is a mapping).

    Raises
    ------
    OSError
        If the file cannot be opened.
    yaml.YAMLError
        If the content cannot be parsed. The error is logged first and then
        re-raised, so a broken config stops the pipeline at the point of
        failure instead of handing ``None`` to later cells.
    """
    with open(filepath, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            # Record the parser message, then let the caller see the real error.
            logging.error(exc)
            raise

def replacer(string, char):
    """Collapse each run of two or more consecutive ``char`` into one ``char``.

    Parameters
    ----------
    string : str
        Text to clean up.
    char : str
        The character (or literal substring) whose repeats are collapsed.
        It is matched literally, so regex metacharacters such as ``.`` or
        ``+`` are safe to pass.

    Returns
    -------
    str
        ``string`` with every repeated run of ``char`` reduced to a single
        occurrence. An empty ``char`` leaves ``string`` unchanged.
    """
    if not char:
        # Nothing to collapse; also avoids building a "repeat nothing" pattern.
        return string
    # Escape so that e.g. '.' means a literal dot rather than "any character".
    pattern = '(?:' + re.escape(char) + '){2,}'
    # A callable replacement keeps ``char`` literal (no backslash templates).
    return re.sub(pattern, lambda _match: char, string)

def col_header_val(df,table_config):
    '''
    Standardize the column names of ``df`` in place and validate them
    against the column list declared in ``table_config``.

    Standardization applied to every column name of ``df``:
    lower-case it, replace each non-word character with ``_``, strip
    leading/trailing ``_`` and collapse repeated ``_`` into one.
    The cleaned names are written back to ``df.columns``.

    Validation compares the cleaned names with the lower-cased entries of
    ``table_config['columns']``. Column order is ignored: both lists are
    sorted before they are compared.

    Parameters
    ----------
    df : DataFrame
        Frame whose ``columns`` are string labels; its columns are renamed.
    table_config : dict
        Parsed config; must contain a ``'columns'`` list of strings.

    Returns
    -------
    int
        1 when the names match, 0 otherwise (mismatches are printed and the
        two column lists are logged at INFO level).
    '''
    cleaned = df.columns.str.lower().str.replace(r'[^\w]', '_', regex=True)
    df.columns = [replacer(name.strip('_'), '_') for name in cleaned]

    expected_col = sorted(name.lower() for name in table_config['columns'])
    # Sort the file's names as well so that only the set of names matters,
    # not the order in which the file happens to list them.
    actual_col = sorted(df.columns)

    if actual_col == expected_col:
        print("column name and column length validation passed")
        return 1

    print("column name and column length validation failed")
    mismatched_columns_file = list(set(df.columns).difference(expected_col))
    print("Following File columns are not in the YAML file",mismatched_columns_file)
    missing_YAML_file = list(set(expected_col).difference(df.columns))
    print("Following YAML columns are not in the file uploaded",missing_YAML_file)
    logging.info(f'df columns: {df.columns}')
    logging.info(f'expected columns: {expected_col}')
    return 0

Overwriting testutility.py


# Write YAML file

In [4]:
%%writefile file.yaml
# file_type and file_name are combined by the notebook into the source path
# "<data dir>/<file_name>.<file_type>".
file_type: csv
dataset_name: testfile
file_name: en-fr
table_name: edsurv
# inbound_delimiter is read by the notebook when it loads the CSV.
inbound_delimiter: ","
outbound_delimiter: "|"
skip_leading_rows: 1
# Expected column names; col_header_val compares the file's (normalized)
# headers against this list.
columns:
    - en
    - fr


Overwriting file.yaml


In [5]:
#!pip install opendatasets

In [6]:
# Import the helper module written by the %%writefile cell above and parse
# file.yaml into `config_data`.
import testutility as util
import opendatasets as od
config_data = util.read_config_file("file.yaml")

In [7]:
# Spot-check a single value from the parsed config.
config_data['inbound_delimiter']

','

In [8]:
# Inspect the full parsed config dictionary.
config_data

{'file_type': 'csv',
 'dataset_name': 'testfile',
 'file_name': 'en-fr',
 'table_name': 'edsurv',
 'inbound_delimiter': ',',
 'outbound_delimiter': '|',
 'skip_leading_rows': 1,
 'columns': ['en', 'fr']}

In [9]:
# Open the raw CSV lazily with dask and preview the first rows.
# NOTE(review): the path and delimiter are hard-coded here rather than taken
# from config_data, unlike the config-driven load further down.
import dask.dataframe as dd
df_sample = dd.read_csv("/content/drive/MyDrive/Data-Glacier/notebooks/Data/en-fr.csv", delimiter=',')
df_sample.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [10]:
# Derive the source path from the config: <data dir>/<file_name>.<file_type>
file_type = config_data['file_type']
source_file = "".join([
    "/content/drive/MyDrive/Data-Glacier/notebooks/Data/",
    config_data['file_name'],
    f'.{file_type}',
])


In [11]:
# Load the source file lazily with dask, splitting fields on the delimiter
# declared in the YAML config. The delimiter must go to `sep`; passing it as
# `thousands` would leave the separator at its default and make the parser
# treat that character as a digit-group mark in numeric fields.
df = dd.read_csv(source_file, sep=config_data['inbound_delimiter'])
df.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [12]:
# Validate the file's headers against the YAML column list.
# Note: col_header_val also rewrites df.columns in place with the
# normalized (lower-cased, underscore-separated) names.
util.col_header_val(df,config_data)

column name and column length validation passed


1

In [13]:
# Show the file's column names next to the columns declared in the YAML config.
print("columns of files are:" ,df.columns)
print("columns of YAML are:" ,config_data['columns'])

columns of files are: Index(['en', 'fr'], dtype='object')
columns of YAML are: ['en', 'fr']


In [14]:
#df.shape

In [15]:
import os

# Size on disk comes from the filesystem; the row count needs an explicit
# .compute() because dask evaluates the dataframe lazily.
file_size = os.path.getsize(source_file)
n_rows = df.shape[0].compute()
n_cols = df.shape[1]
print("There are {rows} rows and {col} columns".format(rows=n_rows, col=n_cols))
print("File Size is :", file_size, "bytes")

There are 22520376 rows and 2 columns
File Size is : 8410507707 bytes


In [16]:
# Gate the rest of the pipeline on the header check
# (col_header_val returns 0 when validation fails).
header_check = util.col_header_val(df,config_data)
if header_check != 0:
    print("col validation passed")
    # write the code to perform further action
    # in the pipeline
else:
    print("validation failed")
    # write code to reject the file

column name and column length validation passed
col validation passed


In [17]:
# Re-read the raw CSV straight from its hard-coded path with default options
# (no config values involved) and preview it.
dd.read_csv("/content/drive/MyDrive/Data-Glacier/notebooks/Data/en-fr.csv").head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [18]:
# Preview the config-loaded dataframe after header validation.
df.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


In [19]:
# dask evaluates lazily: the row count is shown as a Delayed object until
# .compute() is called on it.
df.shape

(Delayed('int-5b53f390-7453-4372-ae00-77342a4f91a6'), 2)

In [20]:
### Build a tiny English/French sample and save it as a CSV for this demo:
import pandas as pd

testdata = dict(
    en=['Hello', 'Goodbye', 'Good Luck!', 'Coffee'],
    fr=['Bonjour', 'Au revoir', 'Bonne chance!', 'Café'],
)
# Note: this rebinds `df`, which held the dask dataframe in earlier cells.
df = pd.DataFrame(testdata, columns=['en', 'fr'])
df.to_csv("en-fr.csv", index=False)


In [21]:
# Display the small demo dataframe built in the previous cell.
df

Unnamed: 0,en,fr
0,Hello,Bonjour
1,Goodbye,Au revoir
2,Good Luck!,Bonne chance!
3,Coffee,Café


In [22]:
# Show the raw dictionary the demo dataframe was built from.
testdata


{'en': ['Hello', 'Goodbye', 'Good Luck!', 'Coffee'],
 'fr': ['Bonjour', 'Au revoir', 'Bonne chance!', 'Café']}

# Create a summary of the file:

- Total number of rows
- Total number of columns
- File size

In [23]:

# Row and column counts of the demo dataframe; pandas reports shape eagerly,
# so no .compute() is needed here.
print("There are {rows} rows and {col} columns".format(rows = df.shape[0], col=df.shape[1]))

There are 4 rows and 2 columns
