### EXTRACT

In [1]:
# Source JSON
import requests

# Download the raw client records. The timeout prevents the cell from
# hanging indefinitely, and raise_for_status() surfaces HTTP errors before
# JSON decoding is attempted.
response = requests.get(
    "https://dst-module-integration.s3.eu-west-1.amazonaws.com/TestData",
    timeout=30,
)
response.raise_for_status()
data = response.json()
print (data)

[{'ClientID': 1, 'ClientName': 'Diane Froman', 'ClientAddress': '6315 North Oro Vista Court', 'ClientCity': 'Litchfield Park', 'ClientState': 'AZ', 'ClientZipCode': 85340, 'ClientRegister': '2022-07-08 11:27:16'}, {'ClientID': 2, 'ClientName': 'Melba Mack', 'ClientAddress': '6108 Iris Way', 'ClientCity': 'Arvada', 'ClientState': 'CO', 'ClientZipCode': 80004, 'ClientRegister': '2022-07-08 07:42:17'}, {'ClientID': 3, 'ClientName': 'Paula Christenson', 'ClientAddress': '10911 Torrington Road', 'ClientCity': 'Louisville', 'ClientState': 'KY', 'ClientZipCode': 40272, 'ClientRegister': '2022-08-07 07:24:03'}, {'ClientID': 4, 'ClientName': 'Larry Miller', 'ClientAddress': '316 20th Street Northeast', 'ClientCity': 'Washington', 'ClientState': 'DC', 'ClientZipCode': 20002, 'ClientRegister': '2022-06-02 03:21:23'}, {'ClientID': 5, 'ClientName': 'Sharon Sivertsen', 'ClientAddress': '2441 Chase Park Drive', 'ClientCity': 'Montgomery', 'ClientState': 'AL', 'ClientZipCode': 36110, 'ClientRegister':

In [2]:
# Vérification de l'intégrité des données
import pandas as pd

def is_proper_type(data_record):
    """Check that the identifying fields of a client record have the expected types.

    Parameters
    ----------
    data_record : dict
        A client record; only the "ClientID" and "ClientState" keys are read.

    Returns
    -------
    bool
        True when "ClientID" is an int (booleans excluded) and "ClientState"
        is a str. A missing key counts as a wrong type instead of raising
        KeyError.
    """
    client_id = data_record.get("ClientID")
    client_state = data_record.get("ClientState")
    # bool is a subclass of int in Python, so it has to be ruled out explicitly.
    id_is_int = isinstance(client_id, int) and not isinstance(client_id, bool)
    return id_is_int and isinstance(client_state, str)

def is_valid_record(data_record):
    """Decide whether a client record passes every integrity check.

    A record is valid when all of the following hold:
    - is_proper_type() accepts it;
    - none of its values is None;
    - "ClientState" is one of the accepted state codes;
    - "ClientZipCode" is an int between 1000 and 99999;
    - "ClientRegister" parses to a timestamp in calendar year 2022.

    Parameters
    ----------
    data_record : dict
        A client record from the JSON source.

    Returns
    -------
    bool
        True if the record passes all checks, False otherwise. Missing keys
        and unparseable dates make the record invalid instead of raising.
    """
    if not is_proper_type(data_record):
        return False

    if any(value is None for value in data_record.values()):
        return False

    accepted_states = (
        'AZ', 'CO', 'KY', 'DC', 'AL', 'MA', 'TN', 'CT',
        'CA', 'GA', 'AR', 'OK', 'AK', 'FL', 'MD', 'VT',
    )
    if data_record.get('ClientState') not in accepted_states:
        return False

    zip_code = data_record.get("ClientZipCode")
    if not isinstance(zip_code, int) or isinstance(zip_code, bool):
        return False
    # Upper bound is 99999: 100000 would be a 6-digit value.
    if not 1000 <= zip_code <= 99999:
        return False

    # errors="coerce" turns an unparseable value into NaT instead of raising;
    # the isinstance check then rejects NaT, None and any non-scalar result.
    registered = pd.to_datetime(data_record.get("ClientRegister"), errors="coerce")
    if not isinstance(registered, pd.Timestamp):
        return False

    # Compare on the calendar year so that registrations made at any time on
    # 31 Dec 2022 are accepted (a "<= 2022-12-31 00:00" bound would drop them).
    return registered.year == 2022

# Spot-check the validator on three records of the raw payload
for record_index in (133, 153, 0):
    print (is_valid_record(data[record_index]))

False
False
True


In [3]:
# Extraction des données valides et invalides
def extract_data(url, timeout=30):
    """Download client records and split them into valid and invalid sets.

    Parameters
    ----------
    url : str
        Address of a JSON resource containing a list of client records.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The records accepted by is_valid_record(), then the rejected ones.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON.
    response.raise_for_status()

    list_valid = []
    list_invalid = []
    for record in response.json():
        target = list_valid if is_valid_record(record) else list_invalid
        target.append(record)
    return pd.DataFrame(list_valid), pd.DataFrame(list_invalid)

# Split the remote client records into valid and invalid dataframes
test_data_valid, test_data_invalid = extract_data(
    url="https://dst-module-integration.s3.eu-west-1.amazonaws.com/TestData"
)

# Show the records that failed validation
display(test_data_invalid)

Unnamed: 0,ClientID,ClientName,ClientAddress,ClientCity,ClientState,ClientZipCode,ClientRegister
0,13,,1736 81st Avenue,Oakland,CA,94621,2022-03-31 03:42:10
1,136,Terry Rivera,2632 Jackson Street East,Carson,FR,90810,2022-01-13 01:37:02
2,156,Mitchell Maffei,1865 North Higley Road,,AZ,85205,2022-01-29 01:07:39
3,159,Ryan Leonard,5624 Oakes Drive,Brentwood,TN,37027,2021-03-31 03:42:10
4,216,Judy Isaacson,64 Roseberry Circle,Port Wentworth,GA,999999,2022-12-29 05:58:45
5,226,Bonnie Leach,503 Orchard Road,Glen Burnie,,21061,2022-01-23 02:50:46


In [4]:
# Extraction des clients valides de Glen Burnie
def extract_glen_burnie_data(df, city="Glen Burnie"):
    """Return the rows of a client dataframe whose city matches ``city``.

    Parameters
    ----------
    df : pandas.DataFrame
        Client records; must contain a "ClientCity" column.
    city : str, optional
        City to keep. Defaults to "Glen Burnie", which preserves the original
        behaviour for existing callers.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    return df.loc[df["ClientCity"] == city]

# Keep only the valid clients located in Glen Burnie, then show them
glen_burnie_data = extract_glen_burnie_data(df=test_data_valid)

display(glen_burnie_data)

Unnamed: 0,ClientID,ClientName,ClientAddress,ClientCity,ClientState,ClientZipCode,ClientRegister
173,181,Christopher Shuman,219 Shana Road,Glen Burnie,MD,21060,2022-07-02 06:42:17
208,217,Eunice Vrbka,200 Hialeah Drive,Glen Burnie,MD,21060,2022-02-17 09:37:19
219,229,Dan Griffin,303 Addison Drive,Glen Burnie,MD,21060,2022-08-04 14:19:41


In [5]:
# CSV sources: the transaction log and the product catalogue
transactions, products = (
    pd.read_csv(source_url)
    for source_url in (
        "https://dst-module-integration.s3.eu-west-1.amazonaws.com/Transactions",
        "https://dst-module-integration.s3.eu-west-1.amazonaws.com/Products",
    )
)

display(transactions, products)

Unnamed: 0,TransactionID,TransactionDate,ClientID,ProductID,ProductQuantity
0,1239129,2022-04-28 15:16:00,4,53,9
1,1239130,2022-05-03 14:41:00,61,7,1
2,1239131,2022-10-20 22:52:00,9,92,9
3,1239132,2022-05-18 12:45:00,159,13,14
4,1239133,2022-08-14 06:35:00,135,80,8
...,...,...,...,...,...
123117,1362246,2022-04-24 04:36:00,104,10,10
123118,1362247,2022-04-21 18:23:00,252,3,12
123119,1362248,2022-09-07 12:07:00,135,95,6
123120,1362249,2022-11-07 12:29:00,150,40,10


Unnamed: 0,ProductID,ProductName,ProductUnitPrice,ProductDiscount
0,1,grid paper,4.140616,25
1,2,plate,10.056399,0
2,3,rubber band,7.412333,0
3,4,key chain,9.632363,50
4,5,bread,20.728457,25
...,...,...,...,...
94,95,zipper,21.453425,0
95,96,soda can,17.284503,0
96,97,bed,7.627264,0
97,98,cell phone,7.883925,0


### TRANSFORM

In [6]:
# Join clients -> transactions -> products into a single flat dataframe
joined_glen_burnie_data = (
    glen_burnie_data
    .merge(transactions, how="inner", on="ClientID")
    .merge(products, how="inner", on="ProductID")
)
joined_glen_burnie_data.head()

Unnamed: 0,ClientID,ClientName,ClientAddress,ClientCity,ClientState,ClientZipCode,ClientRegister,TransactionID,TransactionDate,ProductID,ProductQuantity,ProductName,ProductUnitPrice,ProductDiscount
0,181,Christopher Shuman,219 Shana Road,Glen Burnie,MD,21060,2022-07-02 06:42:17,1239182,2022-03-07 05:37:00,66,13,table,8.580139,75
1,181,Christopher Shuman,219 Shana Road,Glen Burnie,MD,21060,2022-07-02 06:42:17,1239369,2022-11-07 20:52:00,36,8,cookie jar,1.29952,0
2,181,Christopher Shuman,219 Shana Road,Glen Burnie,MD,21060,2022-07-02 06:42:17,1239637,2022-11-05 07:52:00,41,11,sandal,1.009652,50
3,181,Christopher Shuman,219 Shana Road,Glen Burnie,MD,21060,2022-07-02 06:42:17,1239794,2022-01-02 00:00:00,48,10,perfume,4.164029,0
4,181,Christopher Shuman,219 Shana Road,Glen Burnie,MD,21060,2022-07-02 06:42:17,1239887,2022-08-30 22:11:00,41,9,sandal,1.009652,50


In [7]:
# Basic sanity checks: duplicated rows and missing values.
# Each conclusion is derived from the computed count rather than printed
# unconditionally, so the message cannot contradict the data.
duplicate_count = joined_glen_burnie_data.duplicated().sum()
print(duplicate_count)
if duplicate_count == 0:
    print("\nIl n'y a pas de doublons\n")
else:
    print(f"\nAttention : {duplicate_count} doublon(s) détecté(s)\n")

missing_counts = joined_glen_burnie_data.isna().sum()
print(missing_counts)
if missing_counts.sum() == 0:
    print("\nIl n'y a pas de valeurs manquantes\n")
else:
    print(f"\nAttention : {missing_counts.sum()} valeur(s) manquante(s) détectée(s)\n")


0

Il n'y a pas de doublons

ClientID            0
ClientName          0
ClientAddress       0
ClientCity          0
ClientState         0
ClientZipCode       0
ClientRegister      0
TransactionID       0
TransactionDate     0
ProductID           0
ProductQuantity     0
ProductName         0
ProductUnitPrice    0
ProductDiscount     0
dtype: int64

Il n'y a pas de valeurs manquantes



In [8]:
# Normalisation into a star schema: one fact table and two dimension tables
transformed_transactions = (
    joined_glen_burnie_data[transactions.columns]
    .drop_duplicates()
    .sort_values("TransactionID")
    .reset_index(drop=True)
)
transformed_products = (
    joined_glen_burnie_data[products.columns]
    .drop_duplicates()
    .sort_values("ProductID")
    .reset_index(drop=True)
)
# Copy so the dimension table does not alias glen_burnie_data.
transformed_clients = glen_burnie_data.copy()

display(transformed_transactions)
display(transformed_products)
display(transformed_clients)

# Row counts are computed here so the summary cannot drift from the data.
if len(transformed_products) == len(products):
    products_note = "La base products quant à elle n'a subi aucune modification."
else:
    products_note = (
        f"La base products passe quant à elle de {len(products)} "
        f"à {len(transformed_products)} enregistrements."
    )

print(
    f"Après la transformation, la base transactions regroupe seulement {len(transformed_transactions)} enregistrements, "
    f"contre {len(transactions)} initialement. {products_note}"
)
print("\nSi ces données devaient être chargées dans un Data Warehouse, transformed_transactions serait la table de faits")
print("Et transformed_products et transformed_clients seraient 2 tables de dimensions.")

Unnamed: 0,TransactionID,TransactionDate,ClientID,ProductID,ProductQuantity
0,1239182,2022-03-07 05:37:00,181,66,13
1,1239209,2022-11-27 04:28:00,229,89,9
2,1239328,2022-01-16 18:25:00,217,76,8
3,1239340,2022-11-11 03:56:00,229,39,5
4,1239369,2022-11-07 20:52:00,181,36,8
...,...,...,...,...,...
1483,1361664,2022-06-21 15:19:00,181,97,12
1484,1361672,2022-01-01 03:53:00,217,11,10
1485,1361800,2022-10-09 14:50:00,217,65,8
1486,1362144,2022-03-05 20:27:00,217,55,10


Unnamed: 0,ProductID,ProductName,ProductUnitPrice,ProductDiscount
0,1,grid paper,4.140616,25
1,2,plate,10.056399,0
2,3,rubber band,7.412333,0
3,4,key chain,9.632363,50
4,5,bread,20.728457,25
...,...,...,...,...
94,95,zipper,21.453425,0
95,96,soda can,17.284503,0
96,97,bed,7.627264,0
97,98,cell phone,7.883925,0


Unnamed: 0,ClientID,ClientName,ClientAddress,ClientCity,ClientState,ClientZipCode,ClientRegister
173,181,Christopher Shuman,219 Shana Road,Glen Burnie,MD,21060,2022-07-02 06:42:17
208,217,Eunice Vrbka,200 Hialeah Drive,Glen Burnie,MD,21060,2022-02-17 09:37:19
219,229,Dan Griffin,303 Addison Drive,Glen Burnie,MD,21060,2022-08-04 14:19:41


Après la transformation, la base transactions regroupe seulement 1488 enregistrements, contre 123122 initialement. La base products quant à elle n'a subi aucune modification.

Si ces données devaient être chargées dans un Data Warehouse, transformed_transactions serait la table de faits
Et transformed_products et transformed_clients seraient 2 tables de dimensions.


### LOAD

In [9]:
# Export each table as a CSV file.
# index=False keeps the positional dataframe index out of the files; with the
# default it is written as an unnamed first column.
transformed_transactions.to_csv("ETL_transactions.csv", index=False)
transformed_products.to_csv("ETL_products.csv", index=False)
transformed_clients.to_csv("ETL_clients.csv", index=False)