In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import (StructField,StringType,IntegerType,StructType,TimestampType)
from pyspark.sql.functions import *
import pandas as pd


In [2]:
spark = SparkSession.builder.appName('Custmer data').getOrCreate()

In [3]:
df = spark.read.json("../../dataset/creditcard_dataset/cdw_sapp_custmer.json")

df.show()


+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+
|APT_NO|  CREDIT_CARD_NO|   CUST_CITY| CUST_COUNTRY|          CUST_EMAIL|CUST_PHONE|CUST_STATE|CUST_ZIP|FIRST_NAME|LAST_NAME|        LAST_UPDATED|MIDDLE_NAME|      SSN|      STREET_NAME|
+------+----------------+------------+-------------+--------------------+----------+----------+--------+----------+---------+--------------------+-----------+---------+-----------------+
|   656|4210653310061055|     Natchez|United States| AHooper@example.com|   1237818|        MS|   39120|      Alec|   Hooper|2018-04-21T12:49:...|         Wm|123456100|Main Street North|
|   829|4210653310102868|Wethersfield|United States| EHolman@example.com|   1238933|        CT|   06109|      Etta|   Holman|2018-04-21T12:49:...|    Brendan|123453023|    Redwood Drive|
|   683|4210653310116272|     Huntley|United States| WDunham@exam

In [None]:
df.printSchema()

In [4]:
df.columns

['APT_NO',
 'CREDIT_CARD_NO',
 'CUST_CITY',
 'CUST_COUNTRY',
 'CUST_EMAIL',
 'CUST_PHONE',
 'CUST_STATE',
 'CUST_ZIP',
 'FIRST_NAME',
 'LAST_NAME',
 'LAST_UPDATED',
 'MIDDLE_NAME',
 'SSN',
 'STREET_NAME']

In [None]:
df.describe().show()

cleaning custmer data

In [4]:
df = df.withColumn("SSN",col("SSN").cast(IntegerType()))\
       .withColumn("CUST_ZIP",col("CUST_ZIP").cast(IntegerType()))\
       .withColumn("LAST_UPDATED",col("LAST_UPDATED").cast(TimestampType()))\
       .withColumn("CUST_PHONE",col("CUST_PHONE").cast(StringType())) 
df = df.withColumn("FIRST_NAME", initcap(col('FIRST_NAME'))).withColumn("LAST_NAME",initcap(col('LAST_NAME')))
df = df.withColumn('MIDDLE_NAME',lower(col('MIDDLE_NAME')))
df = df.withColumn("FULL_STREET_ADDRESS", concat_ws(",",col('APT_NO'),col('STREET_NAME'))).drop("APT_NO").drop("STREET_NAME")
df = df.withColumn("CUST_PHONE", regexp_replace(df.CUST_PHONE, "(\d{3})(\d{3})(\d{1})", "($1) $2-$3"))

In [6]:
df.printSchema()

root
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_CITY: string (nullable = true)
 |-- CUST_COUNTRY: string (nullable = true)
 |-- CUST_EMAIL: string (nullable = true)
 |-- CUST_PHONE: string (nullable = true)
 |-- CUST_STATE: string (nullable = true)
 |-- CUST_ZIP: integer (nullable = true)
 |-- FIRST_NAME: string (nullable = true)
 |-- LAST_NAME: string (nullable = true)
 |-- LAST_UPDATED: timestamp (nullable = true)
 |-- MIDDLE_NAME: string (nullable = true)
 |-- SSN: integer (nullable = true)
 |-- FULL_STREET_ADDRESS: string (nullable = false)



In [5]:
df.collect()

[Row(CREDIT_CARD_NO='4210653310061055', CUST_CITY='Natchez', CUST_COUNTRY='United States', CUST_EMAIL='AHooper@example.com', CUST_PHONE='(123) 781-8', CUST_STATE='MS', CUST_ZIP=39120, FIRST_NAME='Alec', LAST_NAME='Hooper', LAST_UPDATED=datetime.datetime(2018, 4, 21, 12, 49, 2), MIDDLE_NAME='wm', SSN=123456100, FULL_STREET_ADDRESS='656,Main Street North'),
 Row(CREDIT_CARD_NO='4210653310102868', CUST_CITY='Wethersfield', CUST_COUNTRY='United States', CUST_EMAIL='EHolman@example.com', CUST_PHONE='(123) 893-3', CUST_STATE='CT', CUST_ZIP=6109, FIRST_NAME='Etta', LAST_NAME='Holman', LAST_UPDATED=datetime.datetime(2018, 4, 21, 12, 49, 2), MIDDLE_NAME='brendan', SSN=123453023, FULL_STREET_ADDRESS='829,Redwood Drive'),
 Row(CREDIT_CARD_NO='4210653310116272', CUST_CITY='Huntley', CUST_COUNTRY='United States', CUST_EMAIL='WDunham@example.com', CUST_PHONE='(124) 301-8', CUST_STATE='IL', CUST_ZIP=60142, FIRST_NAME='Wilber', LAST_NAME='Dunham', LAST_UPDATED=datetime.datetime(2018, 4, 21, 12, 49, 2)

Changing the datatypes

Convert the first_name to Title case and lastname

change the middle name to lowercase

Concatenate aptno and streetname with comma and give column name full_street_address