My Data Cleaning experience:

In [1]:
import yaml
import pandas as pd
from datetime import datetime
from sqlalchemy import create_engine, inspect
from database_utils import DatabaseConnector
from data_extraction import DataExtractor

yaml_file_path = 'db_creds.yaml'
db_connector = DatabaseConnector(yaml_file_path)

if db_connector.engine:
    tables = db_connector.list_db_tables(db_connector.engine)
    if tables:
        print("Tables in the Database:")
        for table in tables:
            print(table)
    else:
        print("Failed to list database tables.")
else:
    print("Failed to initialize the database engine.")

Tables in the Database:
legacy_store_details
legacy_users
orders_table


Extract the user table

In [2]:
table_name = 'legacy_users'
df = DataExtractor(db_connector).read_rds_table(table_name)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 15320 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     15320 non-null  object
 1   last_name      15320 non-null  object
 2   date_of_birth  15320 non-null  object
 3   company        15320 non-null  object
 4   email_address  15320 non-null  object
 5   address        15320 non-null  object
 6   country        15320 non-null  object
 7   country_code   15320 non-null  object
 8   phone_number   15320 non-null  object
 9   join_date      15320 non-null  object
 10  user_uuid      15320 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


Remove duplicates

In [3]:
df = df[~df.isin(['NULL']).any(axis=1)]

In [4]:
df.drop_duplicates(inplace=True)
df.drop_duplicates(subset=['phone_number'], keep=False, inplace=True)
df.drop_duplicates(subset=['email_address'], keep=False, inplace=True)
df.drop_duplicates(subset=['user_uuid'], keep=False, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14889 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   first_name     14889 non-null  object
 1   last_name      14889 non-null  object
 2   date_of_birth  14889 non-null  object
 3   company        14889 non-null  object
 4   email_address  14889 non-null  object
 5   address        14889 non-null  object
 6   country        14889 non-null  object
 7   country_code   14889 non-null  object
 8   phone_number   14889 non-null  object
 9   join_date      14889 non-null  object
 10  user_uuid      14889 non-null  object
dtypes: object(11)
memory usage: 1.4+ MB


Identify potential category columns

In [None]:
df['country_code'].value_counts()

It is clear there are only 3 valid country codes: GB, DE, US. GGB an easily corrected typo.
Viewing the remaining invalid rows shows they can all be removed, also eliminating all 'NULL' values

In [5]:
valid_country_code = ('GB', 'DE', 'US')
df['country'] = df['country'].astype('category')
df['country_code'] = df['country_code'].astype('category')
df['country_code'] = df['country_code'].replace('GGB', 'GB')
incorrect_codes = df[~df['country_code'].isin(valid_country_code)]


In [6]:
df = df.drop(incorrect_codes.index)

Locate the incorrect date formats that would not be converted with pd.to_datetime

In [7]:
date_columns = ['date_of_birth', 'join_date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], format='%Y-%m-%d', errors='ignore')
    df[col] = pd.to_datetime(df[col], format='%Y %B %d', errors='ignore')
    df[col] = pd.to_datetime(df[col], format='%B %Y %d', errors='ignore')
    df[col] = pd.to_datetime(df[col], errors='coerce')

In [8]:
column_data_types = {
    'first_name': 'string',
    'last_name': 'string',
    'company': 'string',
    'email_address': 'string',
    'address': 'string',
    'phone_number': 'string',
    'user_uuid': 'string'
    }
for col, data_type in column_data_types.items():
    if col in df.columns:
        try:
            df[col] = df[col].astype(data_type)
        except ValueError as e:
            print(f"Error converting '{col}' to '{data_type}': {e}")
    else:
        print(f"Column '{col}' not found in the DataFrame.")
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 14874 entries, 0 to 1249
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   first_name     14874 non-null  string        
 1   last_name      14874 non-null  string        
 2   date_of_birth  14848 non-null  datetime64[ns]
 3   company        14874 non-null  string        
 4   email_address  14874 non-null  string        
 5   address        14874 non-null  string        
 6   country        14874 non-null  category      
 7   country_code   14874 non-null  category      
 8   phone_number   14874 non-null  string        
 9   join_date      14852 non-null  datetime64[ns]
 10  user_uuid      14874 non-null  string        
dtypes: category(2), datetime64[ns](2), string(7)
memory usage: 1.2 MB


In [None]:
display(df['phone_number'])

Locate the incorrectly formatted email addresses

In [None]:
df['phone_number'] = df['phone_number'].str.replace('[^a-zA-Z0-9+]', '', regex=True)

In [None]:
try:
    df[['phone_number', 'phone_ext']] = df['phone_number'].str.split('x', expand=True)
except ValueError as e:
    print(f"No extensions found")
df['phone_number'] = df['phone_number'].str.replace('(0)', '')
df['phone_number'] = df['phone_number'].str.replace('[^a-zA-Z0-9+]', '', regex=True)

In [None]:
df['phone_number'].str.replace('[^a-zA-Z0-9+]', '', regex=True)

In [None]:
df['phone_number'].str.replace('(0)', '').head(20)

In [None]:
email_pattern = r'^[\w\.-]+@[\w\.-]+\.\w+'
invalid_email_df = df[~df['email_address'].str.contains(email_pattern, na=False)]
invalid_email_df

Correct input errors

In [None]:
# Replace @@ with @ in the "email" column
df['email_address'] = df['email_address'].str.replace('@@', '@')

Clean phone number data

In [None]:
def format_gb_to_e164(phone_number):
    # Remove non-digit characters
    phone_number = ''.join(filter(str.isdigit, phone_number))
    
    # Check if the first two digits are "44"
    if phone_number[:3] == "440":
        # Phone number has an unecessary 0 at index 2
        return "+" + phone_number[0:2] + phone_number[3:]
    elif phone_number[:3] == "441":
        # Phone number is already in E.164 format
        return "+" + phone_number
    elif phone_number.startswith("0"):
        # Remove the leading "0" and add "44" in front
        return "+44" + phone_number[1:]
    else:
        # Phone number does not start with "44" or "0"
        return None  # Invalid format, you can handle it accordingly
    
def format_de_to_e164(phone_number):
    # Remove non-digit characters
    phone_number = ''.join(filter(str.isdigit, phone_number))
    
    # Check if the first two digits are "49"
    if phone_number[:3] == "490":
        # Phone number has an unecessary 0 at index 2
        return "+" + phone_number[0:2] + phone_number[3:]
    elif phone_number[:3] == "491":
        # Phone number is already in E.164 format
        return "+" + phone_number
    elif phone_number.startswith("0"):
        # Remove the leading "0" and add "49" in front
        return "+49" + phone_number[1:]
    else:
        # Phone number does not start with "49" or "0"
        return None  # Invalid format, you can handle it accordingly    
    


In [None]:
df.loc[df['country_code']=='GB', 'phone_number'] = df.loc[df['country_code']=='GB']['phone_number'].apply(format_gb_to_e164)

In [None]:
# Convert to string
df['phone_number'] = df['phone_number'].apply(lambda x: str(x))

In [None]:
df.loc[df['country_code'] == 'GB', 'phone_number'] = df.loc[df['country_code']=='GB']['phone_number'].apply(lambda x: x[0:3] + ' ' + x[3:])
df.loc[df['country_code'] == 'GB', 'phone_number'] = df.loc[df['country_code']=='GB']['phone_number'].apply(lambda x: x[:-6] + ' ' + x[-6:])

In [None]:
df.loc[df['country_code'] == 'GB', 'phone_number'] = df.loc[df['country_code']=='GB']['phone_number'].apply(lambda x: x[:-4] + ' ' + x[-4:])
df.loc[df['country_code'] == 'GB', 'phone_number'] = df.loc[df['country_code']=='GB']['phone_number'].apply(lambda x: x[:-8] + ' ' + x[-8:])

In [None]:
df.loc[df['country_code'] == 'GB', 'phone_number']

In [None]:
df.loc[df['country_code'] == 'DE', 'phone_number'] = df.loc[df['country_code'] == 'DE']['phone_number'].apply(format_de_to_e164) 

In [None]:
df.loc[df['country_code'] == 'DE', 'phone_number']

In [None]:
df.loc[df['country_code'] == 'US', 'phone_number'].tail(20)

In [None]:
df['address'] = df['address'].str.replace("\n", ' ')
df['address'] = df['address'].str.title()
df['address'] = df['address'].str.split().apply(lambda x: ' '.join(x[:-2] + [word.upper() for word in x[-2:]]))


In [None]:
# Define a custom function to uppercase the last two words
def uppercase_last_two_words(text):
    words = text.split()
    return ' '.join(words[:-2] + [word.upper() for word in words[-2:]])

# Apply the custom function to the 'address' column based on the 'country_code' condition
df.loc[df['country_code'] == 'GB', 'address'] = df.loc[df['country_code'] == 'GB']['address'].apply(uppercase_last_two_words)



In [None]:
df.loc[df['country_code'] == 'DE', 'address']

In [None]:
df.head(20)