In [1]:
# Standard library
import os

# Pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# PySpark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col, countDistinct, to_date, to_timestamp, when, expr, date_trunc, count, avg
from pyspark.sql.functions import explode, split, trim
from pyspark.sql.functions import count as Fcount
from pyspark.sql.types import ArrayType, StringType

# Settings for the local Spark session, applied in the order listed.
spark_settings = {
    "spark.driver.bindAddress": "127.0.0.1",
    "spark.driver.host": "127.0.0.1",
    "spark.ui.enabled": "true",       # set to "false" if the Spark UI is not needed
    "spark.ui.port": "0",             # 0 = random port, avoids clashing on 4040
    "spark.blockManager.port": "0",   # random port
    "spark.driver.memory": "6g",
}

# Build (or reuse) the session named "BNPL_external_data" on all local cores.
spark_builder = SparkSession.builder.appName("BNPL_external_data").master("local[*]")
for setting_key, setting_value in spark_settings.items():
    spark_builder = spark_builder.config(setting_key, setting_value)
spark = spark_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/11 18:03:54 WARN Utils: Your hostname, xuzhengs-MacBook-Pro-2.local, resolves to a loopback address: 127.0.0.1; using 10.12.248.242 instead (on interface en0)
25/09/11 18:03:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/11 18:03:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Build the raw 2021-2022 crime-rate table for the Australian states and
# territories and persist it as CSV.
# Each record is [State, Year, Offence, Rate]; a Rate of None marks a figure
# that was not published ("np") in the source.
# NOTE(review): presumably rates per 100,000 persons -- confirm against the source tables.
crime_data = [
    # NSW
    ["New South Wales", 2021, "Homicide", 1.0],
    ["New South Wales", 2021, "Murder", 0.7],
    ["New South Wales", 2021, "Attempted murder", 0.3],
    ["New South Wales", 2021, "Manslaughter", 0.0],
    ["New South Wales", 2021, "Assault", 798.9],
    ["New South Wales", 2021, "Sexual assault", 141.8],
    ["New South Wales", 2021, "Kidnapping", 2.6],
    ["New South Wales", 2022, "Homicide", 1.0],
    ["New South Wales", 2022, "Murder", 0.7],
    ["New South Wales", 2022, "Attempted murder", 0.1],
    ["New South Wales", 2022, "Manslaughter", 0.1],
    ["New South Wales", 2022, "Assault", 852.0],
    ["New South Wales", 2022, "Sexual assault", 152.2],
    ["New South Wales", 2022, "Kidnapping", 2.6],

    # VIC
    ["Victoria", 2021, "Homicide", 1.3],
    ["Victoria", 2021, "Murder", 0.7],
    ["Victoria", 2021, "Attempted murder", 0.3],
    ["Victoria", 2021, "Manslaughter", 0.3],
    ["Victoria", 2021, "Assault", None],  # np = not published
    ["Victoria", 2021, "Sexual assault", 94.1],
    ["Victoria", 2021, "Kidnapping", 2.2],
    ["Victoria", 2022, "Homicide", 1.3],
    ["Victoria", 2022, "Murder", 0.8],
    ["Victoria", 2022, "Attempted murder", 0.3],
    ["Victoria", 2022, "Manslaughter", 0.2],
    ["Victoria", 2022, "Assault", None],  # np = not published
    ["Victoria", 2022, "Sexual assault", 100.9],
    ["Victoria", 2022, "Kidnapping", 2.7],

    # Queensland
    ["Queensland", 2021, "Homicide", 1.6],
    ["Queensland", 2021, "Murder", 0.7],
    ["Queensland", 2021, "Attempted murder", 0.7],
    ["Queensland", 2021, "Manslaughter", 0.1],
    ["Queensland", 2021, "Assault", None],  # np = not published
    ["Queensland", 2021, "Sexual assault", 132.1],
    ["Queensland", 2021, "Kidnapping", 0.5],
    ["Queensland", 2022, "Homicide", 2.1],
    ["Queensland", 2022, "Murder", 1.1],
    ["Queensland", 2022, "Attempted murder", 0.9],
    ["Queensland", 2022, "Manslaughter", 0.0],
    ["Queensland", 2022, "Assault", 983.3],
    ["Queensland", 2022, "Sexual assault", 139.5],
    ["Queensland", 2022, "Kidnapping", 0.8],

    # South Australia
    ["South Australia", 2021, "Homicide", 1.9],
    ["South Australia", 2021, "Murder", 0.7],
    ["South Australia", 2021, "Attempted murder", 1.3],
    ["South Australia", 2021, "Manslaughter", 0.0],
    ["South Australia", 2021, "Assault", 1008.2],
    ["South Australia", 2021, "Sexual assault", 98.9],
    ["South Australia", 2021, "Kidnapping", 2.3],
    ["South Australia", 2022, "Homicide", 1.6],
    ["South Australia", 2022, "Murder", 0.7],
    ["South Australia", 2022, "Attempted murder", 0.9],
    ["South Australia", 2022, "Manslaughter", 0.0],
    ["South Australia", 2022, "Assault", 1083.8],
    ["South Australia", 2022, "Sexual assault", 99.7],
    ["South Australia", 2022, "Kidnapping", 2.3],

    # Western Australia
    ["Western Australia", 2021, "Homicide",1.7],
    ["Western Australia", 2021, "Murder",0.9],
    ["Western Australia", 2021, "Attempted murder",0.7],
    ["Western Australia", 2021, "Manslaughter",0.4],
    ["Western Australia", 2021, "Assault",1315.8],
    ["Western Australia", 2021, "Sexual assault",129.6],
    ["Western Australia", 2021, "Kidnapping",0.5],

    ["Western Australia", 2022, "Homicide",1.8],
    ["Western Australia", 2022, "Murder",0.9],
    ["Western Australia", 2022, "Attempted murder",0.4],
    ["Western Australia", 2022, "Manslaughter",0.3],
    ["Western Australia", 2022, "Assault",1389.7],
    ["Western Australia", 2022, "Sexual assault",113.3],
    ["Western Australia", 2022, "Kidnapping",0.9],

    # Tasmania
    ["Tasmania", 2021, "Homicide",1.9],
    ["Tasmania", 2021, "Murder",0.9],
    ["Tasmania", 2021, "Attempted murder",0.7],
    ["Tasmania", 2021, "Manslaughter",0.0],
    ["Tasmania", 2021, "Assault",629.9],
    ["Tasmania", 2021, "Sexual assault",86.9],
    ["Tasmania", 2021, "Kidnapping",0.0],

    ["Tasmania", 2022, "Homicide",2.1],
    ["Tasmania", 2022, "Murder",1.0],
    ["Tasmania", 2022, "Attempted murder",1.2],
    ["Tasmania", 2022, "Manslaughter",0.0],
    ["Tasmania", 2022, "Assault",661.7],
    ["Tasmania", 2022, "Sexual assault",100.8],
    ["Tasmania", 2022, "Kidnapping",0.5],

    # Northern Territory
    ["Northern Territory", 2021, "Homicide",5.2],
    ["Northern Territory", 2021, "Murder",1.2],
    ["Northern Territory", 2021, "Attempted murder",3.2],
    ["Northern Territory", 2021, "Manslaughter",1.2],
    ["Northern Territory", 2021, "Assault",3648.6],
    ["Northern Territory", 2021, "Sexual assault",143.1],
    ["Northern Territory", 2021, "Kidnapping",0.0],

    ["Northern Territory", 2022, "Homicide",6.0],
    ["Northern Territory", 2022, "Murder",2.4],
    ["Northern Territory", 2022, "Attempted murder",0.0],
    ["Northern Territory", 2022, "Manslaughter",2.4],
    ["Northern Territory", 2022, "Assault",4159.1],
    ["Northern Territory", 2022, "Sexual assault",144.4],
    ["Northern Territory", 2022, "Kidnapping",2.0],

    # ACT
    ["Australian Capital Territory", 2021, "Homicide",2.7],
    ["Australian Capital Territory", 2021, "Murder",0.7],
    ["Australian Capital Territory", 2021, "Attempted murder",0.7],
    ["Australian Capital Territory", 2021, "Manslaughter",0.7],
    ["Australian Capital Territory", 2021, "Assault",511.4],
    ["Australian Capital Territory", 2021, "Sexual assault",75.4],
    ["Australian Capital Territory", 2021, "Kidnapping",0.0],

    ["Australian Capital Territory", 2022, "Homicide",2.0],
    ["Australian Capital Territory", 2022, "Murder",1.3],
    ["Australian Capital Territory", 2022, "Attempted murder",0.7],
    ["Australian Capital Territory", 2022, "Manslaughter",0.0],
    ["Australian Capital Territory", 2022, "Assault",538.7],
    ["Australian Capital Territory", 2022, "Sexual assault",71.2],
    ["Australian Capital Territory", 2022, "Kidnapping", 0.7],
]

# Create the DataFrame (None rates become NaN in the float "Rate" column).
crime_df = pd.DataFrame(crime_data, columns=["State", "Year", "Offence", "Rate"])
crime_path = "external_data/raw_crime_rate.csv"

# to_csv does not create missing parent directories, so make sure the output
# folder exists; otherwise a fresh checkout fails on Restart & Run All.
crime_dir = os.path.dirname(crime_path)
if crime_dir:
    os.makedirs(crime_dir, exist_ok=True)

crime_df.to_csv(crime_path, index=False, encoding="utf-8")

print(f" Data saved to {crime_path}")

 Data saved to external_data/raw_crime_rate.csv


In [None]:
# Build a per-state, per-year composite crime index from the raw offence rates
# and save it to external_data/comprehensive_crime_rate.csv.

# Read the raw CSV.
crime_df = pd.read_csv("external_data/raw_crime_rate.csv")

# Keep only the offence types that feed the composite index.
selected_offences = ["Homicide", "Assault", "Sexual assault", "Kidnapping"]
crime_selected = crime_df[crime_df["Offence"].isin(selected_offences)].copy()

# Convert "np" / missing rates to 0.
# NOTE(review): unpublished Assault rates (VIC 2021/2022, QLD 2021 in the raw
# data) therefore count as 0 and carry the largest weight below, so Crime_Index
# is understated for those rows. Behaviour kept as-is; consider imputing or
# excluding those rows before comparing states.
crime_selected["Rate"] = pd.to_numeric(crime_selected["Rate"], errors="coerce").fillna(0)

# Pivot: one row per (State, Year), one column per offence category.
crime_pivot = crime_selected.pivot_table(
    index=["State", "Year"],
    columns="Offence",
    values="Rate",
    aggfunc="sum"
).reset_index()

# Make sure every selected offence has a column, filling absent ones with 0.
# The loop variable is deliberately NOT named `col`: that name is imported from
# pyspark.sql.functions at the top of the notebook, and rebinding it to a string
# here would break every later `col(...)` call in the kernel.
for offence in selected_offences:
    if offence not in crime_pivot.columns:
        crime_pivot[offence] = 0

# Composite crime index (weighted sum; weights can be adjusted as needed).
crime_pivot["Crime_Index"] = (
    0.5 * crime_pivot["Assault"] +
    0.2 * crime_pivot["Sexual assault"] +
    0.2 * crime_pivot["Homicide"] +
    0.1 * crime_pivot["Kidnapping"]
)

crime_pivot["Crime_Index"] = crime_pivot["Crime_Index"].round(1)

# Mapping from full state/territory name to its abbreviation.
state_mapping = {
    "New South Wales": "NSW",
    "Victoria": "VIC",
    "Queensland": "QLD",
    "South Australia": "SA",
    "Western Australia": "WA",
    "Tasmania": "TAS",
    "Northern Territory": "NT",
    "Australian Capital Territory": "ACT"
}

# Replace the State column with the abbreviations (names missing from the
# mapping would become NaN).
crime_pivot["State"] = crime_pivot["State"].map(state_mapping)

# Save the output.
crime_pivot.to_csv("external_data/comprehensive_crime_rate.csv", index=False, encoding="utf-8")

print(crime_pivot.head(20))

Offence State  Year  Assault  Homicide  Kidnapping  Sexual assault  \
0         ACT  2021    511.4       2.7         0.0            75.4   
1         ACT  2022    538.7       2.0         0.7            71.2   
2         NSW  2021    798.9       1.0         2.6           141.8   
3         NSW  2022    852.0       1.0         2.6           152.2   
4          NT  2021   3648.6       5.2         0.0           143.1   
5          NT  2022   4159.1       6.0         2.0           144.4   
6         QLD  2021      0.0       1.6         0.5           132.1   
7         QLD  2022    983.3       2.1         0.8           139.5   
8          SA  2021   1008.2       1.9         2.3            98.9   
9          SA  2022   1083.8       1.6         2.3            99.7   
10        TAS  2021    629.9       1.9         0.0            86.9   
11        TAS  2022    661.7       2.1         0.5           100.8   
12        VIC  2021      0.0       1.3         2.2            94.1   
13        VIC  2022 

25/09/12 10:29:03 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 188065 ms exceeds timeout 120000 ms
25/09/12 10:29:03 WARN SparkContext: Killing executors is not supported by current scheduler.
25/09/12 10:36:48 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:53)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:342)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:132)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$