# Data to build the Final Object

1. Ingest Data Files: The ingestion process will load files from an external bucket (e.g., GCP) into a staging table using Snowpipe. For the purpose of this exercise, I will simulate that we are already in the Snowflake data warehouse by reading the CSV files into a DataFrame.


In [1]:
import pandas as pd
import sqlite3
import glob
import os

data_path = '../data/'

# The last cell of this notebook writes its consolidated export into this same
# directory. Skip that file here so re-running the notebook does not ingest
# the previous run's output as if it were source data.
output_filename = 'consolidated_customer_data.csv'

# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the concatenation order (and therefore the staged row order) reproducible.
all_files = sorted(
    path
    for path in glob.glob(os.path.join(data_path, "*.csv"))
    if os.path.basename(path) != output_filename
)

# Check if files are found
if not all_files:
    raise ValueError("No CSV files found in the specified directory.")

# Read every file into its own DataFrame. A file that cannot be read is
# reported and skipped so that one bad file does not abort the whole load.
df_list = []
for file in all_files:
    try:
        frame = pd.read_csv(file)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; pandas' ParserError and
        # EmptyDataError, as well as UnicodeDecodeError, derive from ValueError.
        print(f"Error reading {file}: {e}")
    else:
        df_list.append(frame)

# Fail with an explicit message instead of pandas' generic
# "No objects to concatenate" when every file was skipped.
if not df_list:
    raise ValueError("None of the CSV files could be read.")

# Stack all files into a single DataFrame with a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

# Connect to an in-memory SQLite database
conn = sqlite3.connect(':memory:')
cursor = conn.cursor()


2. Load the concatenated df into a temporary table 'temp_data'.

In [2]:
# Stage the combined DataFrame as the SQLite table 'temp_data', replacing any
# existing table of that name; the DataFrame index is not written as a column.
df.to_sql(name='temp_data', con=conn, if_exists='replace', index=False)

4000

3. Create the companies table and insert the distinct values from 'temp_data'.

In [3]:
# Build the `companies` table from the staged rows: one row per distinct
# (company_name, web) pair, keyed by an auto-generated company_id.
#
# DROP TABLE IF EXISTS makes this cell safe to re-run, in line with the
# if_exists='replace' staging of 'temp_data'; a bare CREATE TABLE fails with
# "table companies already exists" on the second run. Rebuilding this table
# regenerates company_id values, so re-run the customers cell afterwards.
companies_table_query = """
DROP TABLE IF EXISTS companies;

CREATE TABLE companies (
    company_id INTEGER PRIMARY KEY AUTOINCREMENT,
    company_name VARCHAR(100) NOT NULL,
    web VARCHAR(100)
);

INSERT INTO companies (company_name, web)
SELECT DISTINCT company_name, web FROM temp_data;
"""
cursor.executescript(companies_table_query)

<sqlite3.Cursor at 0x7f90185f5340>

4. Create the customers table with a foreign-key relationship to companies.

In [4]:
# Build the `customers` table from the staged rows and link each customer to
# its row in `companies`.
#
# - DROP TABLE IF EXISTS makes the cell safe to re-run (a bare CREATE TABLE
#   fails once the table exists).
# - De-duplication groups on the *normalised* e-mail. Inside GROUP BY, a bare
#   `email` refers to the raw temp_data column rather than the SELECT alias,
#   so rows differing only in e-mail case or surrounding whitespace would
#   otherwise be inserted as separate customers with identical stored e-mails.
# - `companies` holds one row per distinct (company_name, web) pair, so the
#   lookup matches on both columns; matching on the name alone can hit several
#   rows and pick one arbitrarily. `IS` is used for `web` so that NULL matches
#   NULL.
customers_table_query = """
DROP TABLE IF EXISTS customers;

CREATE TABLE customers (
    customer_id INTEGER PRIMARY KEY AUTOINCREMENT,
    first_name VARCHAR(50) NOT NULL,
    last_name VARCHAR(50) NOT NULL,
    address VARCHAR(255),
    city VARCHAR(100),
    province VARCHAR(50),
    postal VARCHAR(20),
    email VARCHAR(100) NOT NULL,
    phone1 VARCHAR (20),
    phone2 VARCHAR (20),
    company_id INTEGER,
    FOREIGN KEY (company_id) REFERENCES companies (company_id)
);

INSERT INTO customers (
    first_name, last_name, address, city, province, postal, email, phone1, phone2, company_id
)
SELECT
    first_name, last_name, address, city, province, postal,
    LOWER(TRIM(temp_data.email)) AS email, phone1, phone2,
    (
        SELECT company_id
        FROM companies
        WHERE companies.company_name = temp_data.company_name
          AND companies.web IS temp_data.web
    )
FROM temp_data
GROUP BY first_name, last_name, address, city, province, postal,
         LOWER(TRIM(temp_data.email)), phone1, phone2;
"""
cursor.executescript(customers_table_query)


<sqlite3.Cursor at 0x7f90185f5340>

5. Load the final data and save it to CSV.

In [5]:
# Denormalise the model back into one flat result: every customer row joined
# to its company's name, website and id (inner join on company_id).
final_data_query = """
SELECT 
    customers.customer_id, customers.first_name, customers.last_name, customers.address, 
    customers.city, customers.province, customers.postal, customers.email, 
    customers.phone1, customers.phone2, companies.company_name, companies.web, companies.company_id
FROM customers
JOIN companies ON customers.company_id = companies.company_id;
"""
final_df = pd.read_sql(sql=final_data_query, con=conn)

# Write the consolidated result to CSV, leaving out the DataFrame index.
output_path = '../data/'
consolidated_csv = os.path.join(output_path, 'consolidated_customer_data.csv')
final_df.to_csv(consolidated_csv, index=False)

# Preview the first rows of the result.
final_df.head()

Unnamed: 0,customer_id,first_name,last_name,address,city,province,postal,email,phone1,phone2,company_name,web,company_id
0,1,Aaron,Kloska,423 S Navajo St #56,Brookhill,,,aaron_kloska@kloska.net.au,07-9896-4827,0473-600-733,"Radecker, H Philip Jr",http://www.radeckerhphilipjr.com.au,69
1,2,Abel,Maclead,37275 St Rt 17m M,Middle Island,,,amaclead@gmail.com,631-335-3414,631-677-3675,Rangoni Of Florence,http://www.rangoniofflorence.com,1008
2,3,Abraham,Cratch,41 Benedict St,Aldborough Ward,,IG2 7QG,acratch@gmail.com,01599-245408,01695-305111,"Cavuto, John A",http://www.cavutojohna.co.uk,567
3,4,Adaline,Galagher,32716 N Michigan Ave #82,Barooga,,,adaline.galagher@galagher.com.au,02-3225-1954,0416-156-336,Debbie Reynolds Hotel,http://www.debbiereynoldshotel.com.au,328
4,5,Adela,Echegoyen,128 W Kellogg Dr,Burnaby,BC,V5B 4L5,adela.echegoyen@echegoyen.org,604-571-8392,604-693-8094,Fpa Corp,http://www.fpacorp.com,1497


P1.4 How will you keep the data secure and confidential?
1. Data encryption
2. Access control - strict access control
3. Data masking - obscure sensitive information
4. Enable audit logging - keep track of access and changes
5. Security audits
6. DB security configuration - strong passwords, regularly updating the DB
