In [1]:
import re
import pandas as pd
import logging
from datetime import datetime
# from utils import check_mobile, check_email, format_name, gen_member_id
from hashlib import sha256

In [2]:
import random

def random_mobile_generator(num_to_gen: int) -> list:
    results = []
    
    for i in range(num_to_gen):
        number = random.randint(80000000, 99999999)
        number = str(number)
        if random.random() < 0.1:
            choice = random.choice([1, 2, 3])
            if choice == 1:
                number = f"65{number}"
            elif choice == 2:
                number =f"+65{number}"
            else:
                number = number[:5]
        results.append(number)
    return results

In [3]:
print(random_mobile_generator(10))

['96779786', '91653891', '81209996', '87421657', '98746567', '86982408', '86115229', '99401625', '86552797', '92622264']


In [4]:
def check_mobile(data: pd.Series) -> pd.Series:
    """
    Checks if a provided mobile number is valid.
    Determines if alphabets are present in the number, replaces "+", "-" and whitespaces.
    Removes 65 if the number starts with it.
    
    Args:
        mobile (str): Mobile number provided by the user
        
    Returns:
        Formatted mobile number or None if the number is invalid
        
    Raises:
        ValueError: If provided value is not str type
    """
    
    try:
        logging.info(f"Processing number: {data['mobile']}")
        mobile = data['mobile']
        result = None 
        alphabet_pattern = re.compile(r"[A-Za-z]+")

        # If number contains alphabets, return "False"
        if re.findall(alphabet_pattern, mobile):
            return result
    
        pattern = re.compile(r"\+|\-|\s")
        mobile = re.sub(pattern=pattern, repl="", string=mobile)
        mobile = mobile if not mobile.startswith("65") else mobile[2:]
        result = mobile if len(mobile) == 8 else "False"
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Mobile result: {result}")
        return result

In [5]:
def check_email(data: pd.Series) -> pd.Series:
    """
    Checks and validates provided email address.
    Before "@", match at least 1 or more characters/numbers before ".", "-", or "_" (Can be zero matches) 
    and at least 1 or more character/numbers 

    After "@", match at least 1 or more characters/numbers including "-"
    Uses capture group to determine domain information, matches ".", at least 2 or more characters 2 times.
    
    Args:
        email (str): Email address provided by the user
        
    Returns:
        result (str): Email address is valid, else None
        
    Raises:
        ValueError: If provided value is not str type

    """
    
    logging.info(f"Processing email: {data['email']}")
    email = data['email']
    result = None
    pattern = re.compile("([A-Za-z0-9]+[.\-_])*[A-Za-z0-9]+@[A-Za-z0-9-]+(\.[A-Z|a-z]{2,})+")
        
    try:
        result = re.fullmatch(pattern, email).group()
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Email result: {result}")
        return result

In [30]:
def format_name(name: str) -> str:
    """
    Splits provided name into first and last names.
    Checks for patronymic terms and splits/joins based on those terms for Indian and Malay names.
    
    For Chinese names, the first word is considered the last name and the rest, first name hence, the results will need to be reversed.    
    
    Args:
        name (str): Name provided by the user
        
    Returns:
        result (tuple): First and last name, lowercase
        
    Raises:
        ValueError: If provided value is not str type

    """

    result = ()
    indian_patronymic = re.compile(r"s\/o|d\/o")
    malay_patronymic = re.compile(r"Binte|Bin")
    sub_pattern = re.compile(r"\.\ |\s")
                                    
    try:
        logging.info(f"Processing name: {name}")
        
        if re.findall(indian_patronymic, name):
            tmp = re.split(indian_patronymic, name)
            result = (re.sub(sub_pattern, "_", tmp[0].strip()).lower(), \
                      re.sub(sub_pattern, "_", tmp[1].strip()).lower())
        elif re.findall(malay_patronymic, name):
            tmp = re.split(malay_patronymic, name)
            result = (re.sub(sub_pattern, "_", tmp[0].strip()).lower(), \
                      re.sub(sub_pattern, "_", (re.findall(malay_patronymic, name)[0].strip()+" "+tmp[1].strip())).lower())
        else:
            result = (" ".join(name.split()[1:]).lower(), name.split()[0].lower())
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Name result: {result}")
        return result

In [9]:
def gen_member_id(data: pd.Series) -> pd.Series:
    """
    Generates the membership ID for the user
    
    Args:
        lastname (str): last name of the user
        dob (str): DoB name of the user
        
    Returns:
        result (str): Membership ID
        
    Raises:
        ValueError: If provided values are not of str type

    """

    logging.info(f"Processing lastname: {data['last_name']} and DoB: {data['dob']}")
    last_name = data['last_name']
    dob = data['dob']
    result = None
        
    try:
        # Truncate hash to first 5 characters
        dob_hash = sha256(dob.encode("utf-8")).hexdigest()
        result = f"{last_name}_{dob_hash[:5]}".strip()
    except Exception as e:
        logging.error("Execption occurred", exc_info=True)
    finally:
        logging.info(f"Membership ID result: {result}")
        return result


In [10]:
data = pd.read_csv("../data/mock_data/mock_data.csv")

In [11]:
data.head(4)

Unnamed: 0,name,mobile,email,dob
0,Aaryan Sathasivam s/o P. Nilanga,656822 4145,kgrogonoi@mac.com,11/Jun/1986
1,Anjali Kalai d/o N. Thevar,+65 9503 6865,kcravene@ycombinator.com,16/Jun/1981
2,Ashvin Khera d/o S. Raj,+65 6723 0247,,17/Feb/1984
3,Baey Jun Rui,6675 9858,hhavick2k@goo.ne.jp,2/Oct/1999


In [12]:
data_valid_names = data[~data["name"].isnull()]

In [13]:
data_invalid_name = data[data["name"].isnull()]

In [14]:
data_valid_names["tmp_dob_datetime"] = data_valid_names["dob"].apply(lambda x: datetime.strptime(x, "%d/%b/%Y"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid_names["tmp_dob_datetime"] = data_valid_names["dob"].apply(lambda x: datetime.strptime(x, "%d/%b/%Y"))


In [15]:
data_valid_names["age"] = (datetime(2022,1,1) - data_valid_names["tmp_dob_datetime"]).dt.days / 365

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid_names["age"] = (datetime(2022,1,1) - data_valid_names["tmp_dob_datetime"]).dt.days / 365


In [16]:
data_valid_names["above_18"] = data_valid_names["age"].apply(lambda x: x > 18)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid_names["above_18"] = data_valid_names["age"].apply(lambda x: x > 18)


In [17]:
data_valid_names["tmp_dob"] = data_valid_names["dob"].apply(lambda x: datetime.strftime(datetime.strptime(x, "%d/%b/%Y"), "%Y%m%d"))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid_names["tmp_dob"] = data_valid_names["dob"].apply(lambda x: datetime.strftime(datetime.strptime(x, "%d/%b/%Y"), "%Y%m%d"))


In [18]:
data_valid_names["tmp_mobile"] = data_valid_names.apply(check_mobile,
                                                                 axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_valid_names["tmp_mobile"] = data_valid_names.apply(check_mobile,


In [19]:
data_valid_names["tmp_email"] = data_valid_names.apply(check_email,
                                                               axis=1)

ERROR:root:Execption occurred
Traceback (most recent call last):
  File "C:\Users\LEOTAY~1\AppData\Local\Temp/ipykernel_33580/745447137.py", line 27, in check_email
    result = re.fullmatch(pattern, email).group()
  File "C:\Anaconda3\lib\re.py", line 196, in fullmatch
    return _compile(pattern, flags).fullmatch(string)
TypeError: expected string or bytes-like object
ERROR:root:Execption occurred
Traceback (most recent call last):
  File "C:\Users\LEOTAY~1\AppData\Local\Temp/ipykernel_33580/745447137.py", line 27, in check_email
    result = re.fullmatch(pattern, email).group()
  File "C:\Anaconda3\lib\re.py", line 196, in fullmatch
    return _compile(pattern, flags).fullmatch(string)
TypeError: expected string or bytes-like object
ERROR:root:Execption occurred
Traceback (most recent call last):
  File "C:\Users\LEOTAY~1\AppData\Local\Temp/ipykernel_33580/745447137.py", line 27, in check_email
    result = re.fullmatch(pattern, email).group()
AttributeError: 'NoneType' object has n

In [20]:
data_valid_names.head()

Unnamed: 0,name,mobile,email,dob,tmp_dob_datetime,age,above_18,tmp_dob,tmp_mobile,tmp_email
0,Aaryan Sathasivam s/o P. Nilanga,656822 4145,kgrogonoi@mac.com,11/Jun/1986,1986-06-11,35.583562,True,19860611,68224145,kgrogonoi@mac.com
1,Anjali Kalai d/o N. Thevar,+65 9503 6865,kcravene@ycombinator.com,16/Jun/1981,1981-06-16,40.572603,True,19810616,95036865,kcravene@ycombinator.com
2,Ashvin Khera d/o S. Raj,+65 6723 0247,,17/Feb/1984,1984-02-17,37.89863,True,19840217,67230247,
3,Baey Jun Rui,6675 9858,hhavick2k@goo.ne.jp,2/Oct/1999,1999-10-02,22.265753,True,19991002,66759858,hhavick2k@goo.ne.jp
4,Baruni Sathasivam d/o J. Navin,9094 2411,uhuncoteg@ezinearticles.com,27/Feb/1981,1981-02-27,40.871233,True,19810227,90942411,uhuncoteg@ezinearticles.com


In [21]:
data_valid_names = data_valid_names.drop(["tmp_dob_datetime",
                                          "dob",
                                          "mobile",
                                         "email"],
                                         axis=1)

In [22]:
data_valid_names = data_valid_names.rename(
    columns={
        "tmp_dob":"dob",
        "tmp_mobile":"mobile",
        "tmp_email":"email"
    }
)

In [23]:
valid_data = data_valid_names.loc[(~data_valid_names["mobile"].isnull()) & \
                                 (data_valid_names["above_18"] == True) & \
                                 (~data_valid_names["email"].isnull())]

In [24]:
valid_data.columns

Index(['name', 'age', 'above_18', 'dob', 'mobile', 'email'], dtype='object')

In [25]:
valid_data.head(10)

Unnamed: 0,name,age,above_18,dob,mobile,email
0,Aaryan Sathasivam s/o P. Nilanga,35.583562,True,19860611,68224145,kgrogonoi@mac.com
1,Anjali Kalai d/o N. Thevar,40.572603,True,19810616,95036865,kcravene@ycombinator.com
3,Baey Jun Rui,22.265753,True,19991002,66759858,hhavick2k@goo.ne.jp
4,Baruni Sathasivam d/o J. Navin,40.871233,True,19810227,90942411,uhuncoteg@ezinearticles.com
5,Baruni Sathasivam d/o M. Kumaran,36.649315,True,19850518,81030167,ahazeldene23@examiner.com
6,Dina Mastura Binte Mohamad Anuar,32.90137,True,19890214,69139769,hdemattia2b@si.edu
7,Ding Yi Hao,37.117808,True,19841128,60221005,aollerhadu@deliciousdays.com
8,Don Thevar s/o N. Prakash,22.408219,True,19990811,62164198,pfergyson1x@wordpress.com
9,Eliza Sobia Binte Mohammad Irfan,39.523288,True,19820704,64898551,hpesseltr@printfriendly.com
10,Fan Kai Feng,24.682192,True,19970503,66426863,mwhichelowc@nydailynews.com


In [26]:
valid_data.count()

name        72
age         72
above_18    72
dob         72
mobile      72
email       72
dtype: int64

In [27]:
invalid_data = data_valid_names.loc[(data_valid_names["mobile"].isnull()) | \
                                 (~data_valid_names["above_18"] == True) | \
                                 (data_valid_names["email"].isnull())]

In [28]:
invalid_data.head()

Unnamed: 0,name,age,above_18,dob,mobile,email
2,Ashvin Khera d/o S. Raj,37.89863,True,19840217,67230247,
11,Fong Zheng En,23.032877,True,19981226,63131233,
22,Izhar Izuddin Bin Haikel Irfan,25.293151,True,19960922,92982661,
23,Izhar Izuddin Bin Hairul Azmi,29.810959,True,19920318,62333811,
24,Jeyaraj Indra Raj d/o K. Kumar,23.484932,True,19980714,82904923,


In [32]:
valid_data["first_name"] = valid_data["name"].apply(lambda x: format_name(x)[0])
valid_data["last_name"] = valid_data["name"].apply(lambda x: format_name(x)[1])

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data["first_name"] = valid_data["name"].apply(lambda x: format_name(x)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data["last_name"] = valid_data["name"].apply(lambda x: format_name(x)[1])


In [33]:
valid_data["member_id"] = valid_data.apply(gen_member_id,
                                           axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data["member_id"] = valid_data.apply(gen_member_id,


In [34]:
valid_data.head()

Unnamed: 0,name,age,above_18,dob,mobile,email,first_name,last_name,member_id
0,Aaryan Sathasivam s/o P. Nilanga,35.583562,True,19860611,68224145,kgrogonoi@mac.com,aaryan_sathasivam,p_nilanga,p_nilanga_bc6f7
1,Anjali Kalai d/o N. Thevar,40.572603,True,19810616,95036865,kcravene@ycombinator.com,anjali_kalai,n_thevar,n_thevar_7e932
3,Baey Jun Rui,22.265753,True,19991002,66759858,hhavick2k@goo.ne.jp,jun rui,baey,baey_b1db8
4,Baruni Sathasivam d/o J. Navin,40.871233,True,19810227,90942411,uhuncoteg@ezinearticles.com,baruni_sathasivam,j_navin,j_navin_25e89
5,Baruni Sathasivam d/o M. Kumaran,36.649315,True,19850518,81030167,ahazeldene23@examiner.com,baruni_sathasivam,m_kumaran,m_kumaran_49f5b
