# 代码来自 Groundbreaker
# 想联系我的话，花点心思不难找到我的联系方式。
# 如果今年题目变了、代码用不了了，可以在 GitHub 上找到我，我会尽快更新代码。

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import re

# Global matplotlib settings: select the SimHei font for sans-serif text and
# turn off the "axes.unicode_minus" option.
plt.rcParams.update(
    {
        "font.sans-serif": ["SimHei"],
        "axes.unicode_minus": False,
    }
)

In [None]:
# Load the raw posting dump; the context manager closes the file handle
# as soon as the text has been read.
# NOTE(review): no explicit encoding is given, so the platform default is
# used — confirm it matches how advertise.txt was saved.
with open("advertise.txt") as advertise_file:
    s = advertise_file.read()

# Remove standalone page-marker lines of the form "第<number>页".
s = re.sub(r"\n第\d+页\n", "\n", s)
# Collapse every run of consecutive newlines into a single one, so no empty
# line survives (replacing only "\n\n" pairs leaves one behind for runs of
# three or more).
s = re.sub(r"\n{2,}", "\n", s)
print(s)

In [None]:
# One record per line of the cleaned text; the fields of a record are
# separated by "|".
lines = s.strip().split("\n")
df = pd.DataFrame(
    columns=["Position", "Company", "Salary", "Location", "Date"],
    data=list(map(lambda record: record.split("|"), lines)),
)
# Bare expression: shows the frame when this is the last statement of a cell.
df

In [None]:
def salary_medium(salary_str):
    """Return the midpoint of a salary range such as ``"10K-20K"``.

    If the text contains ``"("``, only the part between the first and the
    second ``")"`` is parsed (e.g. ``"(x)3K-5K"`` is read as ``"3K-5K"``).
    The remaining text must be exactly two numbers separated by ``"-"``;
    a ``K`` suffix in either case is ignored.

    Args:
        salary_str: Salary text of one posting.

    Returns:
        The arithmetic mean of the two bounds as a float, in the same unit
        as the input numbers, or ``pd.NA`` when the text cannot be parsed
        (including non-string input).
    """
    try:
        if "(" in salary_str:
            # Drop the parenthesised prefix; raises IndexError if there is
            # no closing ")".
            salary_str = salary_str.split(")")[1]
        # Unpacking enforces exactly two bounds; float() rejects anything
        # that is not a number once the K/k suffix is removed.
        low, high = (
            float(bound.replace("K", "").replace("k", ""))
            for bound in salary_str.split("-")
        )
    except (TypeError, ValueError, IndexError, AttributeError):
        # Only parsing failures are treated as "no usable salary".
        return pd.NA
    return (low + high) / 2

# Midpoint of each posting's salary range; rows whose salary text cannot be
# parsed receive pd.NA.
df["Salary mid"] = df["Salary"].apply(salary_medium)

In [None]:
def extract_city(location):
    """Extract the city name (text ending in "市") from a location string.

    Two layouts are tried in order, and the first hit wins:
      1. the city is followed by a slash, e.g. "中山市/阜沙镇";
      2. the city comes right after a slash, e.g. "广东/中山市".

    Returns the matched city, or '其他' when neither layout is found.
    """
    layouts = (
        r"([^/]+市)/",  # city followed by "/"
        r"/([^/]+市)",  # city preceded by "/"
    )
    for layout in layouts:
        found = re.search(layout, location)
        if found is not None:
            return found.group(1)
    return '其他'


# Derive a city label for every posting from its "Location" text; locations
# that extract_city cannot match are labelled '其他'.
df["City"] = df["Location"].apply(
    extract_city
)

In [None]:
# Bar chart: mean midpoint salary per city, highest first, saved as a PNG.
plt.figure(dpi=150)
plt.title("各城市平均薪资")
(
    df.groupby("City")["Salary mid"]
    .mean()
    .sort_values(ascending=False)
    .plot(kind="bar")
)
plt.xlabel("城市")
plt.ylabel("平均薪资")
plt.tight_layout()
plt.savefig("各城市平均薪资.png")

In [None]:
# Pie chart of the number of postings per city, saved as a PNG.
plt.figure(figsize=(15, 15), dpi=150)
df["City"].value_counts().plot(
    kind="pie",
    autopct="%1.1f%%",  # percentage label with one decimal on each wedge
    startangle=120,
    wedgeprops={"width": 0.3},
)
plt.ylabel('')  # blank y-axis label
plt.title("招聘地域分布")
plt.tight_layout()
plt.savefig("招聘地域分布.png")

In [None]:
# Line chart: x = number of postings a company has, y = number of companies
# with that many postings. Saved as a PNG.
plt.figure(dpi=150)
# The first value_counts() counts postings per company; the second counts how
# many companies share each posting count. Order by the index (postings per
# company) so the line runs left to right along x — ordering by value would
# connect points out of x order wherever the counts are not strictly
# decreasing.
df["Company"].value_counts().value_counts().sort_index().plot(kind='line')
plt.title("企业岗位数")
plt.xlabel("岗位个数")
plt.ylabel("企业数")
plt.tight_layout()
plt.savefig("企业岗位数.png")

In [None]:
# Bar chart: for each posting count, how many distinct position titles have
# exactly that many postings; most common count first. Saved as a PNG.
plt.figure(dpi=150)
(
    df["Position"]
    .value_counts()  # postings per position title
    .value_counts()  # number of titles sharing each posting count
    .sort_values(ascending=False)
    .plot(kind="bar")
)
plt.title('单个职业岗位数')
plt.ylabel('职业数')
plt.xlabel('岗位数')
plt.tight_layout()
plt.savefig("单个职业岗位数.png")

In [None]:
# Histogram of midpoint salaries, saved as a PNG.
# Open a dedicated figure, as the other chart cells do, so the histogram is
# not drawn onto whichever figure happens to be current.
plt.figure(dpi=150)
# NOTE(review): missing salaries are replaced by 0 and written back to df, so
# they appear as 0 in the histogram and in any later use of the column —
# confirm this is intended.
df["Salary mid"] = df["Salary mid"].fillna(0)
sns.histplot(df["Salary mid"])
plt.title("薪资分布")
plt.xlabel("薪资（千元）")
plt.ylabel("个/千元区间")
plt.tight_layout()
plt.savefig("薪资分布.png")
