In [1]:
%%writefile file.yaml
file_type: csv
dataset_name: testfile
file_name: test_data
table_name: edsurv
inbound_delimeter: ','
outbout_delimeter: '|'
skip_leading_rows: 1
columns:
    city
    price
    distance

Overwriting file.yaml


In [2]:
%%writefile testutility.py
import logging
import os
import subprocess
import yaml
import pandas as pd
import datetime
import gc
import re

def read_config_file(filepath):
    with open(filepath, 'r') as stream:
        try:
            return yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            logging.error(exc)

def replacer(string, char):
    pattern = char + '{2,}'
    string = re.sub(pattern, char, string)
    return string

def col_header_val(df, table_config):
    df.columns = df.columns.str.lower()
    df.columns = df.columns.str.replace('[^\w]','_',regex=True)
    df.columns = list(map(lambda x: x.strip('_'), list(df.columns)))
    df.columns = list(map(lambda x: replacer(x,'_'), list(df.columns)))
    expected_col = list(map(lambda x: x.lower(), table_config['columns']))
    expected_col.sort()
    df.columns =list(map(lambda x: x.lower(), list(df.columns)))
    df = df.reindex(sorted(df.columns), axis=1)
    if len(df.columns)==len(expected_col) and list(expected_col)==list(df.columns):
        print("column name and column length validation passed")
        return 1
    else:
        print("column name and column length validation failed")
        mismatched_columns_file = list(set(df.columns).difference(expected_col))
        print("Following File columns are not in the YAML file",mismatched_columns_file)
        missing_YAML_file = list(set(expected_col).difference(df.columns))
        print("Following YAML columns are not in the file uploaded",missing_YAML_file)
        logging.info(f'df columns:{df.columns}')
        logging.info(f'expected columns:{expected_col}')
        return 0

Overwriting testutility.py


In [3]:
import testutility as util
config_data = util.read_config_file("file.yaml")

In [4]:
config_data['file_type']

'csv'

In [5]:
config_data

{'file_type': 'csv',
 'dataset_name': 'testfile',
 'file_name': 'test_data',
 'table_name': 'edsurv',
 'inbound_delimeter': ',',
 'outbout_delimeter': '|',
 'skip_leading_rows': 1,
 'columns': 'city price distance'}

In [6]:
import pandas as pd
df = pd.read_csv(r"C:\Users\user\OneDrive\Desktop\Data Glacier Internship\ingestion\test_data.csv", delimiter = ',')
df.head()

Unnamed: 0,city,age,Country
0,Delhi,34,India
1,Lima,30,Peru
2,Istanbul,16,Turkey
3,Riyadh,33,Saudi Arabia


In [7]:
util.col_header_val(df, config_data)

column name and column length validation failed
Following File columns are not in the YAML file ['country', 'city', 'age']
Following YAML columns are not in the file uploaded ['t', ' ', 'd', 'n', 'i', 'a', 'e', 'y', 'c', 'p', 's', 'r']


0

In [8]:
print("columns of files are:", df.columns)
print("columns of YAML are:", config_data['columns'])

columns of files are: Index(['city', 'age', 'country'], dtype='object')
columns of YAML are: city price distance


In [9]:
if util.col_header_val(df, config_data)==0:
    print("validation failed")
else:
    print("col validation passed")

column name and column length validation failed
Following File columns are not in the YAML file ['country', 'city', 'age']
Following YAML columns are not in the file uploaded ['t', ' ', 'd', 'n', 'i', 'a', 'e', 'y', 'c', 'p', 's', 'r']
validation failed


In [10]:
testdata = {
    'city' : ['Delhi', 'Lima', 'Istanbul','Riyadh'],
    'age' : [34, 30, 16,33],
    'Country' : ['India','Peru','Turkey','Saudi Arabia']
}
import pandas as pd
df = pd.DataFrame(testdata, columns=['city', 'age','Country'])
df.to_csv("test_data.csv",index=False)

In [11]:
df

Unnamed: 0,city,age,Country
0,Delhi,34,India
1,Lima,30,Peru
2,Istanbul,16,Turkey
3,Riyadh,33,Saudi Arabia


In [12]:
import csv 
with open(r'C:\Users\user\OneDrive\Desktop\Data Glacier Internship\ingestion\test_data.csv') as fin:
    with open(r'C:\Users\user\OneDrive\Desktop\Data Glacier Internship\ingestion\test_data1.csv', 'w', newline='') as fout:
        reader = csv.DictReader(fin, delimiter=',')
        writer = csv.DictWriter(fout, reader.fieldnames, delimiter='|')
        writer.writeheader()
        writer.writerows(reader)

In [13]:
df1 = pd.read_csv(r'C:\Users\user\OneDrive\Desktop\Data Glacier Internship\ingestion\test_data1.csv')
df1

Unnamed: 0,city|age|Country
0,Delhi|34|India
1,Lima|30|Peru
2,Istanbul|16|Turkey
3,Riyadh|33|Saudi Arabia


In [14]:
df1.to_csv('dfsavename.csv.gz', compression='gzip')
df1 = pd.read_csv('dfsavename.csv.gz', compression='gzip')

In [15]:
df1.describe()

Unnamed: 0.1,Unnamed: 0
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [17]:
# computing number of rows
rows = len(df1.axes[0])
 
# computing number of columns
cols = len(df1.axes[1])
 
print(df1)
print("Number of Rows: ", rows)
print("Number of Columns: ", cols)

   Unnamed: 0        city|age|Country
0           0          Delhi|34|India
1           1            Lima|30|Peru
2           2      Istanbul|16|Turkey
3           3  Riyadh|33|Saudi Arabia
Number of Rows:  4
Number of Columns:  2


In [20]:
#obtaining file size
import os
os.path.getsize(r"C:\Users\user\OneDrive\Desktop\Data Glacier Internship\ingestion\test_data1.csv")

92