In [2]:
from pyspark.sql import SparkSession

In [6]:
spark = SparkSession.builder\
    .appName('Cleaned_Customer_Date')\
        .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/08/20 13:20:20 WARN Utils: Your hostname, GUNAs-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.0.0.190 instead (on interface en0)
25/08/20 13:20:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/20 13:20:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
#Loading the Data from csv file
df = spark.read.option("header",True).csv("us_customer_data 2.csv")

In [11]:
df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address: string (nullable = true)
 |-- registration_date: string (nullable = true)
 |-- loyalty_status: string (nullable = true)



In [14]:
df.head(2)

[Row(customer_id='1', name='Michelle Kidd', email='vayala@example.net', phone=None, address='USNS Santiago, FPO AE 80872', registration_date='2025-01-25', loyalty_status='Gold'),
 Row(customer_id='2', name='Brad Newton', email='taylorcatherine@example.net', phone='(759)518-8536x738', address='38783 Oliver Street, West Kristenborough, MT 99752', registration_date='2023-07-13', loyalty_status='Silver')]

In [16]:
df.show(truncate = False)

+-----------+---------------------+---------------------------+----------------------+--------------------------------------------------------+-----------------+--------------+
|customer_id|name                 |email                      |phone                 |address                                                 |registration_date|loyalty_status|
+-----------+---------------------+---------------------------+----------------------+--------------------------------------------------------+-----------------+--------------+
|1          |Michelle Kidd        |vayala@example.net         |NULL                  |USNS Santiago, FPO AE 80872                             |2025-01-25       |Gold          |
|2          |Brad Newton          |taylorcatherine@example.net|(759)518-8536x738     |38783 Oliver Street, West Kristenborough, MT 99752      |2023-07-13       |Silver        |
|3          |Larry Torres         |dsanchez@example.net       |001-323-525-3094x96062|6845 Steele Turnpike, West Er

In [17]:
df.show(5)

+-----------+----------------+--------------------+--------------------+--------------------+-----------------+--------------+
|customer_id|            name|               email|               phone|             address|registration_date|loyalty_status|
+-----------+----------------+--------------------+--------------------+--------------------+-----------------+--------------+
|          1|   Michelle Kidd|  vayala@example.net|                NULL|USNS Santiago, FP...|       2025-01-25|          Gold|
|          2|     Brad Newton|taylorcatherine@e...|   (759)518-8536x738|38783 Oliver Stre...|       2023-07-13|        Silver|
|          3|    Larry Torres|dsanchez@example.net|001-323-525-3094x...|6845 Steele Turnp...|       2023-08-18|        Bronze|
|          4|  Kimberly Price|jessicaknight@exa...|001-947-633-4224x...|1631 Alexis Meado...|       2024-12-08|          Gold|
|          5|Matthew Phillips|qwilliams@example...|001-869-650-5682x...|2274 Williams Hei...|       2024-02-03|

In [19]:
df.show(5,truncate=False)

+-----------+----------------+---------------------------+----------------------+--------------------------------------------------------+-----------------+--------------+
|customer_id|name            |email                      |phone                 |address                                                 |registration_date|loyalty_status|
+-----------+----------------+---------------------------+----------------------+--------------------------------------------------------+-----------------+--------------+
|1          |Michelle Kidd   |vayala@example.net         |NULL                  |USNS Santiago, FPO AE 80872                             |2025-01-25       |Gold          |
|2          |Brad Newton     |taylorcatherine@example.net|(759)518-8536x738     |38783 Oliver Street, West Kristenborough, MT 99752      |2023-07-13       |Silver        |
|3          |Larry Torres    |dsanchez@example.net       |001-323-525-3094x96062|6845 Steele Turnpike, West Erikabury, UT 37487          |20

In [26]:
rows = df.count()
rows

1000

In [27]:
distinct_rows = df.distinct().count()

In [28]:
distinct_rows

1000

In [29]:
#From the above we can see that there are no duplicates in the data but for the safety we can drop the duplicate=s from the data
# Step 1: Deduplicate
df = df.dropDuplicates(["customer_id"])

In [30]:
df.count()

1000

In [33]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [34]:
#The format of the phone number is incorrect so we need to replace those characters or un necessary values with empty
# Step 2: Clean phone numbers
df = df.withColumn("clean_phone", regexp_replace("phone", r"x\d+", "")) \
       .withColumn("clean_phone", regexp_replace("clean_phone", r"\D", "")) \
       .withColumn("valid_phone", when(length("clean_phone") >= 10, col("clean_phone")).otherwise(lit(None)))


In [42]:
df_selected = df.select("customer_id", "name", "email","clean_phone","valid_phone")
df_selected.show()


+-----------+----------------+--------------------+-------------+-------------+
|customer_id|            name|               email|  clean_phone|  valid_phone|
+-----------+----------------+--------------------+-------------+-------------+
|          1|   Michelle Kidd|  vayala@example.net|         NULL|         NULL|
|         10|    Rachel White|whitemichael@exam...|0019207934515|0019207934515|
|        100|     Aaron Irwin|  wdavis@example.org|  13235518227|  13235518227|
|       1000|     Jason Walls|michael42@example...|   5773048260|   5773048260|
|        101|      Amy Graham|nicolehorton@exam...|   8417222216|   8417222216|
|        102|      Larry Hill|garrettrebecca@ex...|  14517023092|  14517023092|
|        103|    Lindsey Ruiz|  yortiz@example.net|   8463062446|   8463062446|
|        104|  Jacob Williams|                NULL|   4372542080|   4372542080|
|        105|    Lisa Esparza| ygibson@example.org|   4925831364|   4925831364|
|        106| Edward Williams| gjordan@e

In [53]:
#In the above we can see the registration_date column is a string so now we can cast it to date type
df= df.withColumn("registration_date",col("registration_date").cast(DateType()))

In [54]:
df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- address: string (nullable = true)
 |-- registration_date: date (nullable = true)
 |-- loyalty_status: string (nullable = true)
 |-- clean_phone: string (nullable = true)
 |-- valid_phone: string (nullable = true)



In [55]:
#In the dataset we can see for the string columns for name,email,adress,and some columns has the spaces at the start and at the end 
#so we can trim them now

df = df.withColumn("name", trim(col("name"))) \
       .withColumn("email", trim(lower(col("email")))) \
       .withColumn("address", trim(col("address"))) \
       .withColumn("loyalty_status", trim(col("loyalty_status")))

In [56]:
#The updated dataset is 
df = df.select("customer_id","name","email","valid_phone","address","registration_date","loyalty_status")

In [57]:
df.show()

+-----------+----------------+--------------------+-------------+--------------------+-----------------+--------------+
|customer_id|            name|               email|  valid_phone|             address|registration_date|loyalty_status|
+-----------+----------------+--------------------+-------------+--------------------+-----------------+--------------+
|          1|   Michelle Kidd|  vayala@example.net|         NULL|USNS Santiago, FP...|       2025-01-25|          Gold|
|         10|    Rachel White|whitemichael@exam...|0019207934515|6182 Brown Mounta...|       2024-02-25|        Bronze|
|        100|     Aaron Irwin|  wdavis@example.org|  13235518227|4938 Natalie Cove...|       2024-06-15|        Bronze|
|       1000|     Jason Walls|michael42@example...|   5773048260|95002 Bruce Rue, ...|       2024-12-10|        Bronze|
|        101|      Amy Graham|nicolehorton@exam...|   8417222216|236 Robert Mall, ...|       2024-02-25|        Bronze|
|        102|      Larry Hill|garrettreb

                                                                                