In [0]:
# Load dataset from Databricks workspace
fraud_df = spark.read.table("workspace.default.credit_card_fraud_detection")

# Display first few rows
display(fraud_df)

# Check schema
fraud_df.printSchema()

# Register as SQL temporary view
fraud_df.createOrReplaceTempView("credit_card_data")


Transaction ID,Cardholder ID,Transaction Amount,Merchant Type,Location,Time of Transaction,Is Fraud
TID000001,CH001,1878.96,Travel,Houston,2025-01-01T00:00:00.000Z,0
TID000002,CH002,4754.06,Travel,Los Angeles,2025-01-01T01:00:00.000Z,0
TID000003,CH003,3662.65,Electronics,Los Angeles,2025-01-01T02:00:00.000Z,0
TID000004,CH004,2997.31,Grocery,Los Angeles,2025-01-01T03:00:00.000Z,0
TID000005,CH005,788.53,Grocery,Chicago,2025-01-01T04:00:00.000Z,0
TID000006,CH006,788.41,Travel,Los Angeles,2025-01-01T05:00:00.000Z,0
TID000007,CH007,299.84,Electronics,Chicago,2025-01-01T06:00:00.000Z,0
TID000008,CH008,4332.22,Grocery,New York,2025-01-01T07:00:00.000Z,0
TID000009,CH009,3009.56,Travel,New York,2025-01-01T08:00:00.000Z,0
TID000010,CH010,3543.28,Retail,Chicago,2025-01-01T09:00:00.000Z,0


root
 |-- Transaction ID: string (nullable = true)
 |-- Cardholder ID: string (nullable = true)
 |-- Transaction Amount: double (nullable = true)
 |-- Merchant Type: string (nullable = true)
 |-- Location: string (nullable = true)
 |-- Time of Transaction: timestamp (nullable = true)
 |-- Is Fraud: long (nullable = true)



In [0]:
print(f"Total Transactions: {fraud_df.count()}")
print(f"Total Columns: {len(fraud_df.columns)}")


Total Transactions: 1000
Total Columns: 7


In [0]:
from pyspark.sql.functions import col, sum

null_counts = fraud_df.select([
    sum(col(c).isNull().cast("int")).alias(c) for c in fraud_df.columns
])
display(null_counts)


Transaction ID,Cardholder ID,Transaction Amount,Merchant Type,Location,Time of Transaction,Is Fraud
0,0,0,0,0,0,0


In [0]:
class_dist = spark.sql("""
SELECT 
  `Is Fraud` AS Class,
  COUNT(*) AS Transaction_Count,
  ROUND((COUNT(*) * 100) / (SELECT COUNT(*) FROM credit_card_data), 2) AS Percentage
FROM credit_card_data
GROUP BY `Is Fraud`
ORDER BY `Is Fraud`
""")

display(class_dist)

Class,Transaction_Count,Percentage
0,941,94.1
1,59,5.9


Databricks visualization. Run in Databricks to view.

In [0]:
amount_stats = spark.sql("""
SELECT 
  `Is Fraud` AS Class,
  ROUND(AVG(`Transaction Amount`), 2) AS Avg_Amount,
  ROUND(MAX(`Transaction Amount`), 2) AS Max_Amount,
  ROUND(MIN(`Transaction Amount`), 2) AS Min_Amount,
  ROUND(STDDEV(`Transaction Amount`), 2) AS Std_Amount
FROM credit_card_data
GROUP BY `Is Fraud`
ORDER BY `Is Fraud`
""")

display(amount_stats)

Class,Avg_Amount,Max_Amount,Min_Amount,Std_Amount
0,2448.25,4998.59,33.11,1456.56
1,2585.98,4849.85,92.77,1483.46


In [0]:
from pyspark.sql.functions import col, hour, sum

time_analysis = fraud_df.withColumn(
    "Hour",
    hour(col("Time of Transaction"))
)

time_summary = time_analysis.groupBy("Hour").agg(
    sum((col("Is Fraud") == 1).cast("int")).alias("Fraud_Count"),
    sum((col("Is Fraud") == 0).cast("int")).alias("Non_Fraud_Count")
).orderBy("Hour")

display(time_summary)

Hour,Fraud_Count,Non_Fraud_Count
0,2,40
1,2,40
2,2,40
3,1,41
4,4,38
5,3,39
6,1,41
7,1,41
8,6,36
9,1,41


Databricks visualization. Run in Databricks to view.

In [0]:
feature_compare = spark.sql("""
SELECT 
  ROUND(AVG(`Is Fraud`), 4) AS Avg_Is_Fraud,
  `Is Fraud`
FROM credit_card_data
GROUP BY `Is Fraud`
""")
display(feature_compare)

Avg_Is_Fraud,Is Fraud
0.0,0
1.0,1


In [0]:
fraud_summary = spark.sql("""
SELECT 
  `Is Fraud` AS Fraud_Flag,
  COUNT(*) AS Transaction_Count,
  ROUND(AVG(`Transaction Amount`), 2) AS Avg_Amount,
  ROUND(SUM(`Transaction Amount`), 2) AS Total_Amount,
  ROUND(STDDEV(`Transaction Amount`), 2) AS Std_Amount
FROM credit_card_data
GROUP BY `Is Fraud`
""")

display(fraud_summary)

fraud_summary.write.mode("overwrite").saveAsTable("workspace.default.fraud_summary")

Fraud_Flag,Transaction_Count,Avg_Amount,Total_Amount,Std_Amount
0,941,2448.25,2303807.73,1456.56
1,59,2585.98,152572.58,1483.46


In [0]:
from pyspark.sql.functions import when, col

fraud_scored = fraud_df.withColumn(
    "Fraud_Risk_Score",
    when(
        (col("Transaction Amount") > 2000) & (col("Merchant Type") == "Online"), 0.9
    ).when(
        (col("Transaction Amount") > 1000) & (col("Merchant Type") == "Retail"), 0.7
    ).when(
        col("Transaction Amount") > 500, 0.5
    ).otherwise(0.1)
)

display(
    fraud_scored.select(
        "Transaction Amount", "Merchant Type", "Fraud_Risk_Score", "Is Fraud"
    ).limit(10)
)

Transaction Amount,Merchant Type,Fraud_Risk_Score,Is Fraud
1878.96,Travel,0.5,0
4754.06,Travel,0.5,0
3662.65,Electronics,0.5,0
2997.31,Grocery,0.5,0
788.53,Grocery,0.5,0
788.41,Travel,0.5,0
299.84,Electronics,0.1,0
4332.22,Grocery,0.5,0
3009.56,Travel,0.5,0
3543.28,Retail,0.7,0


Databricks visualization. Run in Databricks to view.