In [20]:
import re
import pathlib
import dateutil
import pandas as pd
from hashlib import sha256
from datetime import datetime

# from utils import check_mobile, check_email, format_name, gen_member_id
import logging

In [49]:
def check_mobile(data: pd.Series) -> pd.Series:
    """
    Checks if a provided mobile number is valid.
    Determines if alphabets are present in the number, replaces "+", "-" and whitespaces.
    Removes 65 if the number starts with it.
    
    Args:
        mobile (str): Mobile number provided by the user
        
    Returns:
        Formatted mobile number or None if the number is invalid
        
    Raises:
        ValueError: If provided value is not str type
    """
    result = None 
    
    try:
        mobile = data['mobile_no']
        logging.info(f"Processing number: {mobile}")
        
        alphabet_pattern = re.compile(r"[A-Za-z]+")

        # If number contains alphabets, return "False"
        if re.findall(alphabet_pattern, mobile):
            return result
    
        pattern = re.compile(r"\+|\-|\s")
        mobile = re.sub(pattern=pattern, repl="", string=mobile)
        result = mobile if len(mobile) == 8 else result
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Mobile result: {result}")
        return result

In [22]:
def check_email(data: pd.Series) -> pd.Series:
    """
    Checks and validates provided email address.
    Before "@", match at least 1 or more characters/numbers before ".", "-", or "_" (Can be zero matches) 
    and at least 1 or more character/numbers 

    After "@", match at least 1 or more characters/numbers including "-"
    Uses capture group to determine domain information, matches ".", at least 2 or more characters 2 times.
    
    Args:
        email (str): Email address provided by the user
        
    Returns:
        result (str): Email address is valid, else None
        
    Raises:
        ValueError: If provided value is not str type

    """
    
    result = None
    email = data['email']
    logging.info(f"Processing email: {email}")
    pattern = re.compile("([A-Za-z0-9]+[.\-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+")
        
    try:
        result = re.fullmatch(pattern, email).group()
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Email result: {result}")
        return result

In [103]:
def format_name(name: str) -> str:
    """
    Splits provided name into first and last names.
    Removes affixes from the customer names
        
    Args:
        name (str): Name provided by the user
        
    Returns:
        result (tuple): First and last name, lowercase
        
    Raises:
        ValueError: If provided value is not str type

    """

    result = ()
    nname_affixes = re.compile(r"[mM]r\.\s|[mM]rs\.\s|[mM]s\.\s|[dD]r\.\s|\s[jJ]r\.|MD\s|DDS\s|PhD\s|DVM\s")
                                    
    try:
        logging.info(f"Processing name: {name}")
        remove_affixes = re.sub(name_affixes, "", name)
        split_names = remove_affixes.split(" ")
        result = (split_names[0].strip(), split_names[1].strip())
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Name result: {result}")
        return result

In [98]:
def gen_member_id(data: pd.Series) -> pd.Series:
    """
    Generates the membership ID for the customer
    
    Args:
        lastname (str): last name of the customer
        dob (str): DoB name of the customer
        
    Returns:
        result (str): Membership ID
        
    Raises:
        ValueError: If provided values are not of str type

    """

    last_name = data['last_name']
    dob = data['date_of_birth']
    logging.info(f"Processing lastname: {last_name} and DoB: {dob}")
    result = None
        
    try:
        # Truncate hash to first 5 characters
        dob_hash = sha256(dob.encode("utf-8")).hexdigest()
        result = f"{last_name}_{dob_hash[:5]}".strip()
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Membership ID result: {result}")
        return result

In [25]:
def format_dob(dob: str) -> str:
    """
    Formats the customer's date of birth to YYYYMMDD format.
    
    Args:
        dob (str): Date of birth name of the user
        
    Returns:
        result (str): Formatted date of birth
        
    Raises:
        ValueError: If provided values are not of str type

    """
    
    logging.info(f"Processing DoB: {dob}")
    result = None
    
    try:
        result = datetime.strftime(dateutil.parser.parse(dob), "%Y%m%d")
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Date of birth result: {result}")
        return result

In [26]:
def calculate_age(format_dob: str) -> str:
    """
    Calculates the age of the customer based on date of birth.
    
    Args:
        format_dob (str): Date of birth name of the customer
        
    Returns:
        result (str): Age of the customer rounded to 2 decimal points
        
    Raises:
        ValueError: If provided values are not of str type

    """
    
    logging.info(f"Processing formatted DoB: {format_dob}")
    result = None
    
    try:
        age_datetime = datetime.strptime(format_dob, "%Y%m%d")
        result = round(((datetime(2022,1,1) - age_datetime).days / 365), 2)
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Age result: {result}")
        return result

In [27]:
data_path = pathlib.Path("../data/mock_data")

dfs = []

for filename in data_path.iterdir():
    df = pd.read_csv(filename, index_col=None, header=0)
    dfs.append(df)

data = pd.concat(dfs, axis=0, ignore_index=True)

In [28]:
data.head(4)

Unnamed: 0,name,email,date_of_birth,mobile_no
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359


In [101]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4999 non-null   object
 1   email          4999 non-null   object
 2   date_of_birth  4999 non-null   object
 3   mobile_no      4999 non-null   object
dtypes: object(4)
memory usage: 156.3+ KB


In [29]:
data_valid_names = data[~data["name"].isnull()]

In [30]:
data_invalid_name = data[data["name"].isnull()]

In [31]:
data_valid_names["format_dob"] = data_valid_names["date_of_birth"].apply(lambda x: format_dob(x))

In [32]:
data_valid_names.head()

Unnamed: 0,name,email,date_of_birth,mobile_no,format_dob
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711,19860110
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931,19740910
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047,19740227
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359,19680205
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429,19881211


In [33]:
data_valid_names["age"] =  data_valid_names["format_dob"].apply(lambda x: calculate_age(x))

In [34]:
data_valid_names.head()

Unnamed: 0,name,email,date_of_birth,mobile_no,format_dob,age
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711,19860110,36.0
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931,19740910,47.34
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047,19740227,47.88
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359,19680205,53.94
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429,19881211,33.08


In [35]:
data_valid_names["above_18"] = data_valid_names["age"].apply(lambda x: x > 18)

In [36]:
data_valid_names.head()

Unnamed: 0,name,email,date_of_birth,mobile_no,format_dob,age,above_18
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711,19860110,36.0,True
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931,19740910,47.34,True
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047,19740227,47.88,True
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359,19680205,53.94,True
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429,19881211,33.08,True


In [37]:
data_valid_names["tmp_mobile"] = data_valid_names.apply(check_mobile,
                                                                 axis=1)

In [38]:
data_valid_names.head()

Unnamed: 0,name,email,date_of_birth,mobile_no,format_dob,age,above_18,tmp_mobile
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711,19860110,36.0,True,40601711.0
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931,19740910,47.34,True,
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047,19740227,47.88,True,
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359,19680205,53.94,True,
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429,19881211,33.08,True,


In [51]:
data_valid_names["tmp_email"] = data_valid_names.apply(check_email,
                                                               axis=1)

In [52]:
data_valid_names.head()

Unnamed: 0,name,email,date_of_birth,mobile_no,format_dob,age,above_18,tmp_mobile,tmp_email
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711,19860110,36.0,True,40601711.0,William_Dixon@woodward-fuller.biz
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931,19740910,47.34,True,,Kristen_Horn@lin.com
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047,19740227,47.88,True,,Kimberly_Chang@johnson-lopez.biz
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359,19680205,53.94,True,,Mary_Ball@stevens.biz
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429,19881211,33.08,True,,Benjamin_Craig@berry.net


In [84]:
data_valid_names["first_name"] = data_valid_names["name"].apply(lambda x: format_name(x)[0])
data_valid_names["last_name"] = data_valid_names["name"].apply(lambda x: format_name(x)[1])

In [85]:
data_valid_names.head()

Unnamed: 0,name,email,date_of_birth,mobile_no,format_dob,age,above_18,tmp_mobile,tmp_email,first_name,last_name
0,William Dixon,William_Dixon@woodward-fuller.biz,1986/01/10,40601711,19860110,36.0,True,40601711.0,William_Dixon@woodward-fuller.biz,William,Dixon
1,Kristen Horn,Kristen_Horn@lin.com,1974-09-10,737931,19740910,47.34,True,,Kristen_Horn@lin.com,Kristen,Horn
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,02/27/1974,2692047,19740227,47.88,True,,Kimberly_Chang@johnson-lopez.biz,Kimberly,Chang
3,Mary Ball,Mary_Ball@stevens.biz,02/05/1968,886359,19680205,53.94,True,,Mary_Ball@stevens.biz,Mary,Ball
4,Benjamin Craig,Benjamin_Craig@berry.net,12/11/1988,696429,19881211,33.08,True,,Benjamin_Craig@berry.net,Benjamin,Craig


In [87]:
data_valid_names = data_valid_names.drop(["tmp_email",
                                          "date_of_birth",
                                          "mobile_no"],
                                         axis=1)

In [88]:
data_valid_names.head()

Unnamed: 0,name,email,format_dob,age,above_18,tmp_mobile,first_name,last_name
0,William Dixon,William_Dixon@woodward-fuller.biz,19860110,36.0,True,40601711.0,William,Dixon
1,Kristen Horn,Kristen_Horn@lin.com,19740910,47.34,True,,Kristen,Horn
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,19740227,47.88,True,,Kimberly,Chang
3,Mary Ball,Mary_Ball@stevens.biz,19680205,53.94,True,,Mary,Ball
4,Benjamin Craig,Benjamin_Craig@berry.net,19881211,33.08,True,,Benjamin,Craig


In [89]:
data_valid_names = data_valid_names.rename(
    columns={
        "format_dob":"date_of_birth",
        "tmp_mobile":"mobile"
    }
)

In [91]:
data_valid_names.head()

Unnamed: 0,name,email,date_of_birth,age,above_18,mobile,first_name,last_name
0,William Dixon,William_Dixon@woodward-fuller.biz,19860110,36.0,True,40601711.0,William,Dixon
1,Kristen Horn,Kristen_Horn@lin.com,19740910,47.34,True,,Kristen,Horn
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,19740227,47.88,True,,Kimberly,Chang
3,Mary Ball,Mary_Ball@stevens.biz,19680205,53.94,True,,Mary,Ball
4,Benjamin Craig,Benjamin_Craig@berry.net,19881211,33.08,True,,Benjamin,Craig


In [92]:
valid_data = data_valid_names.loc[(~data_valid_names["mobile"].isnull()) & \
                                 (data_valid_names["above_18"] == True) & \
                                 (~data_valid_names["email"].isnull())]

In [93]:
valid_data.head(10)

Unnamed: 0,name,email,date_of_birth,age,above_18,mobile,first_name,last_name
0,William Dixon,William_Dixon@woodward-fuller.biz,19860110,36.0,True,40601711,William,Dixon
20,Patty Smith,Patty_Smith@ross.com,19750827,46.38,True,59428759,Patty,Smith
27,Sarah Jacobson,Sarah_Jacobson@mclean-jacobs.info,19580309,63.86,True,61071779,Sarah,Jacobson
28,Kayla Shannon,Kayla_Shannon@hendricks-hunter.org,19840517,37.65,True,51323743,Kayla,Shannon
29,Sean Wang DDS,Sean_Wang@gibson-calderon.com,19600311,61.85,True,25595367,Sean,Wang
30,Richard Estrada,Richard_Estrada@malone.com,19921015,29.23,True,22821527,Richard,Estrada
34,Jackson Cline,Jackson_Cline@hudson.net,19710121,50.98,True,48056519,Jackson,Cline
36,Allen Williams,Allen_Williams@sanchez.net,19971109,24.16,True,77991519,Allen,Williams
43,Eric Flores,Eric_Flores@dillon-patterson.com,19820107,40.01,True,36641663,Eric,Flores
44,Anna Richardson,Anna_Richardson@perry.com,19670223,54.89,True,64083047,Anna,Richardson


In [94]:
valid_data.count()

name             990
email            990
date_of_birth    990
age              990
above_18         990
mobile           990
first_name       990
last_name        990
dtype: int64

In [95]:
invalid_data = data_valid_names.loc[(data_valid_names["mobile"].isnull()) | \
                                 (~data_valid_names["above_18"] == True) | \
                                 (data_valid_names["email"].isnull())]

In [96]:
invalid_data.head()

Unnamed: 0,name,email,date_of_birth,age,above_18,mobile,first_name,last_name
1,Kristen Horn,Kristen_Horn@lin.com,19740910,47.34,True,,Kristen,Horn
2,Kimberly Chang,Kimberly_Chang@johnson-lopez.biz,19740227,47.88,True,,Kimberly,Chang
3,Mary Ball,Mary_Ball@stevens.biz,19680205,53.94,True,,Mary,Ball
4,Benjamin Craig,Benjamin_Craig@berry.net,19881211,33.08,True,,Benjamin,Craig
5,Cathy Werner,Cathy_Werner@martinez.net,20180925,3.27,False,,Cathy,Werner


In [99]:
valid_data["member_id"] = valid_data.apply(gen_member_id,
                                           axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data["member_id"] = valid_data.apply(gen_member_id,


In [100]:
valid_data.head()

Unnamed: 0,name,email,date_of_birth,age,above_18,mobile,first_name,last_name,member_id
0,William Dixon,William_Dixon@woodward-fuller.biz,19860110,36.0,True,40601711,William,Dixon,Dixon_3864b
20,Patty Smith,Patty_Smith@ross.com,19750827,46.38,True,59428759,Patty,Smith,Smith_c7677
27,Sarah Jacobson,Sarah_Jacobson@mclean-jacobs.info,19580309,63.86,True,61071779,Sarah,Jacobson,Jacobson_e151e
28,Kayla Shannon,Kayla_Shannon@hendricks-hunter.org,19840517,37.65,True,51323743,Kayla,Shannon,Shannon_dd402
29,Sean Wang DDS,Sean_Wang@gibson-calderon.com,19600311,61.85,True,25595367,Sean,Wang,Wang_04168
