In [90]:
import yaml
from sqlalchemy import create_engine, inspect
import pandas as pd
from dateutil.parser import parse
import tabula
import requests
import json


class DatabaseConnector:
    def __init__(self, file=None):
        self.file = file
        self.db_creds = self.read_db_creds()
        self.db_engine = self.init_db_engine()
        self.db_table_list = self.list_db_tables()

    def read_db_creds(self):
        with open(self.file, 'r') as f:
            db_creds = yaml.safe_load(f)
            return db_creds
    
    def init_db_engine(self):
        db_engine = create_engine(f"postgresql://{self.db_creds['RDS_USER']}:{self.db_creds['RDS_PASSWORD']}@{self.db_creds['RDS_HOST']}:{self.db_creds['RDS_PORT']}/{self.db_creds['RDS_DATABASE']}")
        return db_engine

    def list_db_tables(self):
        insp = inspect(self.db_engine)
        db_table_list = insp.get_table_names()
        return db_table_list
    
    def upload_to_db(self, clean_dataframe, table_name):
        db_to_sql = clean_dataframe.to_sql(table_name, self.db_engine, if_exists='replace', index=False)
        return db_to_sql


class DataExtractor:
    def __init__(self, database=None):
        self.database = database

    def read_rds_table(self, table_name):
        table_data = pd.read_sql_table(table_name, self.database).set_index('index')
        return table_data

    def retrieve_pdf_data(self, pdf_path):
        pdf_df_page = tabula.read_pdf(pdf_path, pages='all')
        pdf_df = pd.concat(pdf_df_page, ignore_index=True)
        return pdf_df
    
    def list_number_of_stores(self, number_of_stores_endpoint, header):
        response = requests.get(number_of_stores_endpoint, headers=header)
        number_of_stores_data = response.json()
        return number_of_stores_data['number_stores']
    
    def retrieve_stores_data(self, store_endpoint, number_of_stores, header):
        store_df = []
        for store_number in range(number_of_stores):
            response = requests.get(f'{store_endpoint}{store_number}', headers=header).json()
            store = pd.json_normalize(response)
            store_df.append(store)
        stores_df = pd.concat(store_df).set_index('index')
        return stores_df
    
class DataCleaning:
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def clean_user_data(self):
        # Remove NULL values and duplicates
        self.dataframe = self.dataframe.dropna().drop_duplicates()

        # Clean country code
        self.dataframe['country_code'] = self.dataframe['country_code'].replace('GGB', 'GB')
        self.dataframe = self.dataframe[self.dataframe['country_code'].str.len() == 2]

        # Clean dates
        self.dataframe.loc[:,'date_of_birth'] = pd.to_datetime(self.dataframe['date_of_birth'].apply(parse))
        self.dataframe.loc[:,'join_date'] = pd.to_datetime(self.dataframe['join_date'].apply(parse))

        # Clean phone numbers
        regex = '^(\(?\+?[0-9]*\)?)?[0-9_\- \(\)]*$'
        self.dataframe.loc[:,'phone_number'] = self.dataframe['phone_number'].str.replace('(0)', '', regex=False)
        self.dataframe.loc[:,'phone_number'] = self.dataframe['phone_number'].replace(r'\D+', '', regex=True)

        return self.dataframe

    def clean_card_data(self):
        card_provider_list = ['Diners Club / Carte Blanche', 'American Express', 'JCB 16 digit',
                             'JCB 15 digit', 'Maestro', 'Mastercard', 'Discover',
                             'VISA 19 digit', 'VISA 16 digit', 'VISA 13 digit']

        # Filter card data based on card providers
        self.dataframe = self.dataframe[self.dataframe['card_provider'].isin(card_provider_list)]

        # Clean and format date columns
        self.dataframe.loc[:,'expiry_date'] = pd.to_datetime(self.dataframe['expiry_date'], errors='coerce', format='%m/%y')
        self.dataframe.loc[:,'date_payment_confirmed'] = pd.to_datetime(self.dataframe['date_payment_confirmed'], errors='coerce', format='%Y-%m-%d')

        # Drop NULL values and duplicates
        self.dataframe = self.dataframe.dropna().drop_duplicates()

        return self.dataframe

    def clean_store_data(self):
        self.dataframe = self.dataframe[self.dataframe['country_code'].str.len() == 2]
        self.dataframe.loc[:, 'opening_date'] = pd.to_datetime(self.dataframe['opening_date'], errors='coerce', format='%Y-%m-%d')

        self.dataframe['continent'] = self.dataframe['continent'].replace(['eeEurope', 'eeAmerica'], ['Europe', 'America'])

        self.dataframe = self.dataframe.drop(columns='lat')
        self.dataframe['staff_numbers'] = self.dataframe['staff_numbers'].apply(lambda x: "".join(filter(str.isdigit, str(x))))

        self.dataframe = self.dataframe.dropna().drop_duplicates()

        return self.dataframe
        

yaml_database = DatabaseConnector(file='db_creds.yaml').init_db_engine()

user_data_df = DataExtractor(yaml_database).read_rds_table('legacy_users')

cleaned_df = DataCleaning(user_data_df).clean_user_data()

user_data_to_sql = DatabaseConnector(file='sales_data_creds.yaml').upload_to_db(cleaned_df, 'dim_users')

In [86]:
pdf_file = DataExtractor().retrieve_pdf_data("https://data-handling-public.s3.eu-west-1.amazonaws.com/card_details.pdf")

In [87]:
data_pdf = DataCleaning(dataframe=pdf_file).clean_card_data()
card_details_to_sql = DatabaseConnector(file='sales_data_creds.yaml').upload_to_db(data_pdf, 'dim_card_details')

In [88]:
data_pdf

Unnamed: 0,card_number,expiry_date,card_provider,date_payment_confirmed
0,30060773296197,2026-09-01 00:00:00,Diners Club / Carte Blanche,2015-11-25 00:00:00
1,349624180933183,2023-10-01 00:00:00,American Express,2001-06-18 00:00:00
2,3529023891650490,2023-06-01 00:00:00,JCB 16 digit,2000-12-26 00:00:00
3,213142929492281,2027-09-01 00:00:00,JCB 15 digit,2011-02-12 00:00:00
4,502067329974,2025-10-01 00:00:00,Maestro,1997-03-13 00:00:00
...,...,...,...,...
15304,180036921556789,2028-12-01 00:00:00,JCB 15 digit,1997-06-06 00:00:00
15305,180018030448512,2024-11-01 00:00:00,JCB 15 digit,2004-06-16 00:00:00
15306,3569953313547220,2024-04-01 00:00:00,JCB 16 digit,2020-02-05 00:00:00
15307,4444521712606810,2027-06-01 00:00:00,VISA 16 digit,2008-06-16 00:00:00


In [7]:
headers = {'x-api-key': 'yFBQbwXe9J3sd6zWVAMrK6lcxxr0q1lr2PT6DDMX'}

number_of_stores = DataExtractor().list_number_of_stores('https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/number_stores', headers)
stores_data = DataExtractor().retrieve_stores_data(f'https://aqj7u5id95.execute-api.eu-west-1.amazonaws.com/prod/store_details/', number_of_stores, headers)

stores_data

Unnamed: 0_level_0,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,GB,Europe
1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe
...,...,...,...,...,...,...,...,...,...,...,...
446,"Täschestraße 25\n39039 Nördlingen, Kirchlengern",52.2,,Kirchlengern,KI-78096E8C,61,2005-05-12,Super Store,8.63333,DE,Europe
447,K0ODETRLS3,K8CXLZDP07,UXMWDMX1LC,3VHFDNP8ET,9D4LK7X4LZ,D23PCWSM6S,36IIMAQD58,NN04B3F6UQ,JZP8MIJTPZ,B3EH2ZGQAV,1WZB1TE1HL
448,"Studio 8\nMoss mall\nWest Linda\nM0E 6XR, High...",51.62907,,High Wycombe,HI-EEA7AE62,33,1998-05-14,Local,-0.74934,GB,Europe
449,"Baumplatz 6\n80114 Kötzting, Bretten",49.03685,,Bretten,BR-662EC74C,35,2020-10-17,Local,8.70745,DE,Europe


In [11]:
stores_data.head(50)

Unnamed: 0_level_0,address,longitude,lat,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,,,,,WEB-1388012W,325,2010-06-12,Web Portal,,GB,Europe
1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,,High Wycombe,HI-9B97EE4E,34,1996-10-25,Local,-0.74934,GB,Europe
2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,,Landshut,LA-0772C7B9,92,2013-04-12,Super Store,12.16179,DE,Europe
3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,,Westbury,WE-1DE82CEE,69,2014-01-02,Super Store,-2.1875,GB,Europe
4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,,Belper,BE-18074576,35,2019-09-09,Local,-1.48119,GB,Europe
5,Flat 92u\nChristian harbors\nPort Charlotte\nN...,53.38333,,Gainsborough,GA-CAD01AC2,36,1995-05-15,Local,-0.76667,GB,Europe
6,"7 Gillian rue\nWest Robertside\nPH4 8NY, Ruthe...",55.82885,,Rutherglen,RU-C603E990,92,2001-01-04,Super Store,-4.21376,GB,Europe
7,"Lilija-Heß-Allee 660\n34566 Regensburg, Stuttgart",48.78232,,Stuttgart,ST-229D997E,34,2000-06-01,Local,9.17702,DE,Europe
8,"510 Jill Mill\nSouth Laura, FL 38723, Kaukauna",44.27804,,Kaukauna,KA-FA7ED3B8,31,2022-09-05,Local,-88.27205,US,America
9,"3 Lee valleys\nWest Janetview\nDY4M 2RL, Hartley",51.38673,,Hartley,HA-974352FE,20,2004-09-11,Local,0.30367,GB,Europe


In [79]:
stores_df = stores_data.copy()



stores_df = stores_df[stores_df['country_code'].str.len() == 2]
stores_df.loc[:,'opening_date'] = pd.to_datetime(stores_df['opening_date'], errors='coerce', format='%Y-%m-%d')

stores_df['continent'] = stores_df['continent'].replace(['eeEurope', 'eeAmerica'], ['Europe', 'America'])

stores_df = stores_df.drop(columns='lat')
stores_df['staff_numbers'] = stores_df['staff_numbers'].apply(lambda x: "".join(filter(str.isdigit, str(x))))




stores_df = stores_df.dropna().drop_duplicates()
stores_df


Unnamed: 0_level_0,address,longitude,locality,store_code,staff_numbers,opening_date,store_type,latitude,country_code,continent
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"Flat 72W\nSally isle\nEast Deantown\nE7B 8EB, ...",51.62907,High Wycombe,HI-9B97EE4E,34,1996-10-25 00:00:00,Local,-0.74934,GB,Europe
2,"Heckerstraße 4/5\n50491 Säckingen, Landshut",48.52961,Landshut,LA-0772C7B9,92,2013-04-12 00:00:00,Super Store,12.16179,DE,Europe
3,"5 Harrison tunnel\nSouth Lydia\nWC9 2BE, Westbury",51.26,Westbury,WE-1DE82CEE,69,2014-01-02 00:00:00,Super Store,-2.1875,GB,Europe
4,Studio 6\nStephen landing\nSouth Simon\nB77 2W...,53.0233,Belper,BE-18074576,35,2019-09-09 00:00:00,Local,-1.48119,GB,Europe
5,Flat 92u\nChristian harbors\nPort Charlotte\nN...,53.38333,Gainsborough,GA-CAD01AC2,36,1995-05-15 00:00:00,Local,-0.76667,GB,Europe
...,...,...,...,...,...,...,...,...,...,...
445,"Flat 7\nStephanie lake\nMorrisside\nHP8 8LH, C...",50.76306,Cowes,CO-473A9FBB,94,2008-06-08 00:00:00,Super Store,-1.29772,GB,Europe
446,"Täschestraße 25\n39039 Nördlingen, Kirchlengern",52.2,Kirchlengern,KI-78096E8C,61,2005-05-12 00:00:00,Super Store,8.63333,DE,Europe
448,"Studio 8\nMoss mall\nWest Linda\nM0E 6XR, High...",51.62907,High Wycombe,HI-EEA7AE62,33,1998-05-14 00:00:00,Local,-0.74934,GB,Europe
449,"Baumplatz 6\n80114 Kötzting, Bretten",49.03685,Bretten,BR-662EC74C,35,2020-10-17 00:00:00,Local,8.70745,DE,Europe


In [91]:
stores_data_clean = DataCleaning(stores_data).clean_store_data()

store_data_to_sql = DatabaseConnector('sales_data_creds.yaml').upload_to_db(stores_data_clean, 'dim_store_details')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.dataframe['continent'] = self.dataframe['continent'].replace(['eeEurope', 'eeAmerica'], ['Europe', 'America'])
