In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions  import *

import os


# Project root and the fixtures folder beneath it (both keep a trailing slash,
# so file names can be appended directly).
HOME_DIR = '/opt/workspace/DE-project-a/'
FIXTURE_DIR = HOME_DIR + 'fixtures/'


# Build the session (or reuse an existing one) against the master at
# spark://spark-master:7077, with 512m per executor and event logging
# written to /opt/workspace/events.
spark = (
    SparkSession.builder
    .appName("pyspark-nb-1")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "512m")
    .config("spark.eventLog.enabled", "true")
    .config("spark.eventLog.dir", "file:///opt/workspace/events")
    .getOrCreate()
)

sc = spark.sparkContext
spark

In [3]:

def normalize_path(path):
    """Return `path` in normalized form.

    Collapses redundant separators and resolves `.` / `..` segments using
    `os.path.normpath`. The path is not made absolute and the filesystem is
    not consulted.

    Args:
        path: A path string (or path-like object).

    Returns:
        The normalized path.
    """
    # A single-argument os.path.join() returns its input unchanged, so the
    # normalization is done directly.
    return os.path.normpath(path)


def from_xlsx(session, path, **kwargs):
    """Load an Excel workbook into a Spark DataFrame by way of pandas.

    Args:
        session: Object exposing `createDataFrame` (the SparkSession).
        path: Location of the workbook; normalized with `normalize_path`
            before reading.
        **kwargs: Forwarded unchanged to `pandas.read_excel`
            (e.g. `sheet_name`, `usecols`).

    Returns:
        The result of `session.createDataFrame` applied to the pandas frame.

    Note:
        `pandas.read_excel` can return a dict of frames for some `sheet_name`
        values; this helper assumes a single frame comes back — TODO confirm
        against callers.
    """
    # pandas is not imported in the notebook's visible import cell, so the
    # name `pd` would otherwise be undefined here; import it at the point of use.
    import pandas as pd

    path = normalize_path(path)
    df_pandas = pd.read_excel(path, **kwargs)
    return session.createDataFrame(df_pandas)




In [52]:
# Read the clients fixture (first row is the header, column types inferred)
# and mark it for caching, since later cells reuse it.
path = normalize_path(f'{FIXTURE_DIR}clients.csv')
clients = (
    spark.read
    .csv(path, header=True, inferSchema=True)
    .persist()
)

# Preview the first 20 rows without truncating cell contents, then the schema.
clients.show(n=20, truncate=False)
clients.printSchema()

[Stage 2:>                                                          (0 + 1) / 1]

+---------+-------------------+----------------------------+----------------------+--------------------------------------------------------+
|client_id|client_name        |client_email                |client_phone          |client_address                                          |
+---------+-------------------+----------------------------+----------------------+--------------------------------------------------------+
|1        |Douglas Mann       |malexander@example.net      |7887828319            |447 Sanders Forge Apt. 868 Port Lisafort, MT 13535      |
|2        |Linda Willis       |sherryriddle@example.org    |648.448.5400x89273    |4186 Richard Turnpike Cynthiaton, GA 06049              |
|3        |Mr. Samuel Gonzalez|adam37@example.com          |+1-727-264-1902x10293 |5333 Bradley Corner Apt. 956 Port Michael, UT 04993     |
|4        |Elijah Duke        |alexis42@example.com        |+1-913-911-7052x28833 |6919 Strong Square South Catherine, OR 42933            |
|5        |An

                                                                                

**Cleaning client names: removing titles and degrees**


In [70]:
# from DE-project-a/e-consultant/patterns/names import TITLES_REGEX, DEGREES_REGEX

# Academic / professional degree abbreviations.
DEGREES = [
    'DDS',  # Doctor of Dental Surgery
    'PhD',  # Doctor of Philosophy
    'DVM',  # Doctor of Veterinary Medicine
    'MD',   # Doctor of Medicine
    'EdD',  # Doctor of Education
    'DPT',  # Doctor of Physical Therapy
    'JD',   # Juris Doctor
    'MS',   # Master of Science
    'MA',   # Master of Arts
    'MBA',  # Master of Business Administration
    'MFA',  # Master of Fine Arts
    'MEng',  # Master of Engineering
    'MPhil', # Master of Philosophy
    'DSc',  # Doctor of Science
    'DSW',  # Doctor of Social Work
    'ScD',  # Doctor of Science
    'DNP',  # Doctor of Nursing Practice
    'PharmD', # Doctor of Pharmacy
    'CRNA', # Certified Registered Nurse Anesthetist
    'MHA',  # Master of Health Administration
    'MEd',  # Master of Education
    'OTD',  # Doctor of Occupational Therapy
]

# Titles that are matched only when followed by a literal dot ("Mr.", "Dr.").
# NOTE(review): 'Jr'/'Sr' are normally name suffixes and 'Dame'/'Sir' are
# normally written without a dot; as listed here they only match in the
# dotted form — confirm this is intended.
SHORT_TITLES = [
    'Mr',    # Mister
    'Mrs',   # Mistress (married woman)
    'Ms',    # Ms (woman, marital status unspecified)
    'Dr',    # Doctor
    'Prof',   # Professor
    'Jr',    # Junior
    'Sr',    # Senior
    'Mx',    # Gender-neutral title
    'Rev',   # Reverend
    'Hon',   # Honorable
    'Capt',  # Captain
    'Lt',    # Lieutenant
    'Col',   # Colonel
    'Maj',   # Major
    'Sgt',   # Sergeant
    'Cpl',   # Corporal
    'Eng',   # Engineer
    'Dame',  # Title for women equivalent to knighthood
    'Sir',   # Title for men equivalent to knighthood
]

# Titles written out in full, matched without a trailing dot.
TITLES = [
    'Miss'
]

# Alternation of all title patterns (no surrounding group or anchors).
# - Short titles require the literal dot; raw strings keep `\.` a valid escape.
# - Full-word titles end with a word boundary so that "Miss" does not match
#   the first letters of a longer name such as "Missy".
TITLES_REGEX = "|".join(
    [rf'{i}\.' for i in SHORT_TITLES] + [rf'{i}\b' for i in TITLES]
)


# Alternation of all degree abbreviations (no surrounding group or anchors).
DEGREES_REGEX = "|".join(DEGREES)



# Keep only clients whose name has three or more whitespace-separated parts,
# then strip a leading title (with the whitespace after it) and/or a trailing
# degree (with the whitespace before it) from the name.
filtered_by_name = (
    clients
    .withColumn('names_len', size(split(col('client_name'), r"\s+")))
    .where(col('names_len') >= 3)
    .withColumn(
        'client_name',
        regexp_replace(
            col('client_name'),
            rf"(^({TITLES_REGEX})\s*)|(\s+({DEGREES_REGEX})$)",
            "",
        ),
    )
)

# NOTE(review): this displays the phone column of the unfiltered `clients`
# frame, not `filtered_by_name` — confirm that is the intended preview.
clients.select('client_phone').show(n=100, truncate=100)

# filtered_by_name.filter(size(split(col('client_name'), r"\s+")) > 2).show(100, 100)




+----------------------+
|          client_phone|
+----------------------+
|            7887828319|
|    648.448.5400x89273|
| +1-727-264-1902x10293|
| +1-913-911-7052x28833|
|            5118303852|
|     (747)809-1366x312|
|       +1-688-726-1099|
|  001-843-418-7765x267|
|001-279-321-5583x84972|
| 001-442-823-6839x9550|
|          636-294-5977|
|      247-773-9486x102|
|         (241)750-2592|
|001-647-922-4944x04173|
|      001-948-831-3483|
|          384.901.5718|
|            5847057388|
|      473.345.2546x343|
|            7879025767|
| 001-226-546-3952x2667|
| +1-278-332-2042x23796|
|  001-640-861-2391x392|
|      001-305-463-1628|
|     257.930.4890x1827|
|         (304)703-3905|
|     660-392-7735x5214|
|  +1-520-224-2758x4774|
|   +1-272-773-7115x061|
|          464.660.3591|
| +1-676-791-2174x26658|
|    910-446-2901x84752|
|       +1-797-597-7342|
|     792.770.9340x8734|
|      765-819-8723x467|
|   (964)777-7908x77123|
|      226-695-0178x280|
|   +1-808-362-3652x203|


In [49]:
# Teardown: release the persisted `clients` DataFrame first, then stop the
# Spark session (no Spark calls are possible on `spark` after this cell).
clients.unpersist()
spark.stop()