In [1]:
import os

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import*

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, BooleanType, DoubleType

In [2]:
# 2. Create the SparkSession for this job, or reuse one that is already running

session_builder = SparkSession.builder.appName("loadcredit")
spark = session_builder.getOrCreate()

Extracting files 

In [3]:
# 3. Read the credit-transaction JSON file with the Spark DataFrame API

# NOTE(review): machine-specific absolute path; consider moving it to a shared config cell.
CREDIT_JSON_PATH = r"C:\Users\Learner_XZHCG217\Desktop\TEK-Dataengineering\Git\Capstone\files\cdw_sapp_credit.json"

# `header` and `inferSchema` are CSV-reader options that the JSON source does not use,
# so they are not passed here; the JSON reader infers the schema from the data itself.
df_credit = spark.read.json(CREDIT_JSON_PATH)

df_credit.printSchema()


root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: long (nullable = true)
 |-- MONTH: long (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: long (nullable = true)



Transformation

In [4]:
# Turn YEAR, MONTH and DAY into string columns.
# MONTH and DAY are left-padded with "0" to two characters (e.g. February: 2 -> 02).

df_credit = (
    df_credit
    .withColumn("YEAR", col("YEAR").cast("string"))
    .withColumn("MONTH", lpad(col("MONTH"), 2, "0"))
    .withColumn("DAY", lpad(col("DAY"), 2, "0"))
)


In [5]:
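# Alternative (disabled): plain string casts of DAY, MONTH and YEAR without zero-padding; the lpad cell above already yields string columns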

# df_credit = df_credit.withColumn('DAY', df_credit['DAY'].cast('string'))
# df_credit = df_credit.withColumn('MONTH', df_credit['MONTH'].cast('string'))
# df_credit = df_credit.withColumn('YEAR', df_credit['YEAR'].cast('string'))

In [5]:
# Confirm that DAY, MONTH and YEAR are now reported as string columns

df_credit.printSchema()

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CREDIT_CARD_NO: string (nullable = true)
 |-- CUST_SSN: long (nullable = true)
 |-- DAY: string (nullable = true)
 |-- MONTH: string (nullable = true)
 |-- TRANSACTION_ID: long (nullable = true)
 |-- TRANSACTION_TYPE: string (nullable = true)
 |-- TRANSACTION_VALUE: double (nullable = true)
 |-- YEAR: string (nullable = true)



In [6]:
# Check on the first 5 rows that single-digit MONTH and DAY values are zero-padded

df_credit.show(5)

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
|        114|4210653349028689|123459988| 14|   02|             1|       Education|             78.9|2018|
|         35|4210653349028689|123459988| 20|   03|             2|   Entertainment|            14.24|2018|
|        160|4210653349028689|123459988| 08|   07|             3|         Grocery|             56.7|2018|
|        114|4210653349028689|123459988| 19|   04|             4|   Entertainment|            59.73|2018|
|         93|4210653349028689|123459988| 10|   10|             5|             Gas|             3.59|2018|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+
only showing top 5 rows



In [7]:
# Build TIMEID by joining YEAR, MONTH and DAY end to end (YYYYMMDD)

date_parts = [col(part) for part in ("YEAR", "MONTH", "DAY")]
df_credit = df_credit.withColumn("TIMEID", concat(*date_parts))


In [8]:
# Preview the first 5 rows, now including the TIMEID column
df_credit.show(5)

+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+--------+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|DAY|MONTH|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|YEAR|  TIMEID|
+-----------+----------------+---------+---+-----+--------------+----------------+-----------------+----+--------+
|        114|4210653349028689|123459988| 14|   02|             1|       Education|             78.9|2018|20180214|
|         35|4210653349028689|123459988| 20|   03|             2|   Entertainment|            14.24|2018|20180320|
|        160|4210653349028689|123459988| 08|   07|             3|         Grocery|             56.7|2018|20180708|
|        114|4210653349028689|123459988| 19|   04|             4|   Entertainment|            59.73|2018|20180419|
|         93|4210653349028689|123459988| 10|   10|             5|             Gas|             3.59|2018|20181010|
+-----------+----------------+---------+---+-----+--------------+---------------

In [9]:
# Remove the DAY, MONTH and YEAR columns, then preview the remaining columns

date_part_columns = ("DAY", "MONTH", "YEAR")
df_credit = df_credit.drop(*date_part_columns)
df_credit.show(5)

+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|BRANCH_CODE|  CREDIT_CARD_NO| CUST_SSN|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|  TIMEID|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|        114|4210653349028689|123459988|             1|       Education|             78.9|20180214|
|         35|4210653349028689|123459988|             2|   Entertainment|            14.24|20180320|
|        160|4210653349028689|123459988|             3|         Grocery|             56.7|20180708|
|        114|4210653349028689|123459988|             4|   Entertainment|            59.73|20180419|
|         93|4210653349028689|123459988|             5|             Gas|             3.59|20181010|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
only showing top 5 rows



In [10]:
# Rename the CREDIT_CARD_NO column to CUST_CC_NO

old_name, new_name = "CREDIT_CARD_NO", "CUST_CC_NO"
df_credit = df_credit.withColumnRenamed(old_name, new_name)

In [11]:
# Check the final column names (show) and data types (printSchema) before loading
df_credit.show(5)
df_credit.printSchema()

+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|BRANCH_CODE|      CUST_CC_NO| CUST_SSN|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|  TIMEID|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|        114|4210653349028689|123459988|             1|       Education|             78.9|20180214|
|         35|4210653349028689|123459988|             2|   Entertainment|            14.24|20180320|
|        160|4210653349028689|123459988|             3|         Grocery|             56.7|20180708|
|        114|4210653349028689|123459988|             4|   Entertainment|            59.73|20180419|
|         93|4210653349028689|123459988|             5|             Gas|             3.59|20181010|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
only showing top 5 rows

root
 |-- BRANCH_CODE: long (nullable = true)
 |-- CUST_CC_NO: string (null

LOADING

In [12]:
# Write the transformed credit data to the MySQL table CDW_SAPP_CREDIT_CARD over JDBC.
# Credentials are taken from the environment instead of being hardcoded in the notebook:
#   MYSQL_USER     - optional, defaults to "root"
#   MYSQL_PASSWORD - required; a KeyError is raised if it is not set
jdbc_write_options = {
    "url": "jdbc:mysql://localhost:3306/creditcard_capstone",
    "dbtable": "CDW_SAPP_CREDIT_CARD",
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ["MYSQL_PASSWORD"],
}

# mode("overwrite") replaces whatever the target table already holds.
df_credit.write.format("jdbc").mode("overwrite").options(**jdbc_write_options).save()

In [13]:
# Read the CDW_SAPP_CREDIT_CARD table back from MySQL to confirm the load.
# Credentials are taken from the environment instead of being hardcoded in the notebook:
#   MYSQL_USER     - optional, defaults to "root"
#   MYSQL_PASSWORD - required; a KeyError is raised if it is not set
jdbc_read_options = {
    "url": "jdbc:mysql://localhost:3306/creditcard_capstone",
    "dbtable": "CDW_SAPP_CREDIT_CARD",
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ["MYSQL_PASSWORD"],
}

df_new1 = spark.read.format("jdbc").options(**jdbc_read_options).load()

# No ordering is requested, so the rows shown need not match the order seen in df_credit.
df_new1.show(5)


+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|BRANCH_CODE|      CUST_CC_NO| CUST_SSN|TRANSACTION_ID|TRANSACTION_TYPE|TRANSACTION_VALUE|  TIMEID|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
|        180|4210653342242023|123451310|         45069|           Bills|            77.79|20180315|
|        156|4210653312478046|123455692|         22562|         Grocery|            91.08|20180813|
|        114|4210653349028689|123459988|             1|       Education|             78.9|20180214|
|        107|4210653342242023|123451310|         45070|      Healthcare|            20.47|20180419|
|         58|4210653342242023|123451310|         45071|      Healthcare|             1.61|20180628|
+-----------+----------------+---------+--------------+----------------+-----------------+--------+
only showing top 5 rows

