Credit Card Analysis Assessment

Step 1:
- Create a PySpark session
- Read from the credit card transactions JSON file

In [13]:
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when there is one,
# otherwise it starts a new session registered under the app name below.
spark = (
    SparkSession.builder
    .appName("JSON Reader")
    .getOrCreate()
)

25/03/18 16:51:34 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [14]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType

# 2. Explicit schema for the credit card transaction records.
# Each entry below is a (field name, Spark type) pair; every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("Unnamed: 0", IntegerType()),                   # row index
        ("trans_date_trans_time", TimestampType()),      # transaction time
        ("cc_num", StringType()),                        # credit card number
        ("merchant", StringType()),                      # merchant name
        ("category", StringType()),                      # merchant category
        ("amt", DoubleType()),                           # transaction amount
        ("first", StringType()),                         # card owner's first name
        ("last", StringType()),                          # card owner's last name
        ("gender", StringType()),                        # card owner's gender
        ("street", StringType()),                        # card owner's street address
        ("city", StringType()),                          # card owner's city
        ("state", StringType()),                         # card owner's state
        ("zip", StringType()),                           # card owner's zip code
        ("lat", DoubleType()),                           # card owner's latitude
        ("long", DoubleType()),                          # card owner's longitude
        ("city_pop", IntegerType()),                     # city population
        ("job", StringType()),                           # card owner's job
        ("dob", StringType()),                           # card owner's date of birth
        ("trans_num", StringType()),                     # transaction number
        ("merch_lat", DoubleType()),                     # merchant latitude
        ("merch_long", DoubleType()),                    # merchant longitude
        ("is_fraud", IntegerType()),                     # fraud case indicator
        ("merch_zipcode", StringType()),                 # merchant zip code
        ("merch_last_update_time", TimestampType()),     # merchant last update time
        ("merch_eff_time", TimestampType()),             # merchant effective registration time
        ("cc_bic", StringType()),                        # credit card BIC code
    )
])

In [15]:
from pyspark.sql.functions import from_utc_timestamp, col, date_format

# Render the three timestamp columns as "yyyy-MM-dd HH:mm" strings in Singapore time (UTC+8).
# from_utc_timestamp reads the stored value as UTC and shifts it to the named zone;
# date_format then replaces the column with the formatted string.
# NOTE(review): `df` is not created in the cells shown here - presumably an earlier
# cell reads the JSON file using `schema`; confirm before running on a fresh kernel.
# NOTE(review): `df` is overwritten in place, so re-running this cell would pass the
# already-formatted columns through the conversion again - run it once per load.
for ts_col in ("trans_date_trans_time", "merch_last_update_time", "merch_eff_time"):
    singapore_time = from_utc_timestamp(col(ts_col), "Asia/Singapore")
    df = df.withColumn(ts_col, date_format(singapore_time, "yyyy-MM-dd HH:mm"))