In [3]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv("clustered_jobs_with_0.9.csv")




In [6]:

# 检查工作地址列中是否存在缺失值，并删除这些行
df = df.dropna(subset=['工作地址'])

# 根据职位名称和招聘人数对数据进行聚合
job_counts = df.groupby('职位名称')['招聘人数'].sum().reset_index()

# 输出整体数据中招聘人数最多的前10个工作岗位
top_10_jobs = job_counts.sort_values(by='招聘人数', ascending=False).head(10)
print("整体数据中招聘人数最多的前10个工作岗位:")
print(top_10_jobs)

# 筛选工作地址在北京的数据
beijing_jobs = df[df['省']=='北京']

# 对北京的工作岗位进行聚合
beijing_job_counts = beijing_jobs.groupby('职位名称')['招聘人数'].sum().reset_index()

# 输出在北京数量最多的前10个工作岗位
top_10_beijing_jobs = beijing_job_counts.sort_values(by='招聘人数', ascending=False).head(10)
print("\n北京数据中招聘人数最多的前10个工作岗位:")
print(top_10_beijing_jobs)


整体数据中招聘人数最多的前10个工作岗位:
                职位名称   招聘人数
173         大数据开发工程师  201.0
22         .net开发工程师  177.0
129        python工程师  104.0
245            软件工程师   36.0
0               .NET   35.0
85            Python   26.0
54            Hadoop   25.0
60   Hadoop中级运维开发工程师   20.0
157          后端开发工程师   16.0
206            开发工程师   14.0

北京数据中招聘人数最多的前10个工作岗位:
                   职位名称  招聘人数
70             大数据开发工程师  95.0
43            python工程师  86.0
25               Python  23.0
9                Hadoop  14.0
13      Hadoop中级运维开发工程师  13.0
117               软件工程师  10.0
58              后端开发工程师   8.0
91                开发工程师   7.0
19   Java研发工程师-【用户画像方向】   6.0
33               RPA工程师   4.0


In [7]:
from datetime import datetime

def filter_and_sort_data(df, province, start_date=None, end_date=None, job_title=None):
    """
    Filter and sort job data based on province, time interval, and job title.
    
    Parameters:
    - df (DataFrame): The input DataFrame containing job data.
    - province (str): The province to filter by.
    - start_date (str, optional): The start date of the time interval in 'YYYY-MM-DD' format.
    - end_date (str, optional): The end date of the time interval in 'YYYY-MM-DD' format.
    - job_title (str, optional): The job title to filter by.
    
    Returns:
    - DataFrame: The filtered and sorted DataFrame.
    """
    
    # Filter by province
    filtered_df = df[df['工作地址'].str.contains(province)]
    
    # Filter by time interval if start_date and end_date are provided
    if start_date and end_date:
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
        end_date = datetime.strptime(end_date, '%Y-%m-%d')
        filtered_df = filtered_df[(filtered_df['更新日期'] >= start_date) & (filtered_df['更新日期'] <= end_date)]
    
    # Filter by job title if provided
    if job_title:
        filtered_df = filtered_df[filtered_df['职位名称'] == job_title]
    
    # Sort the data by recruitment numbers in descending order
    sorted_df = filtered_df.groupby('职位名称')['招聘人数'].sum().reset_index()
    sorted_df = sorted_df.sort_values(by='招聘人数', ascending=False)
    
    return sorted_df

# Example usage:
filtered_data = filter_and_sort_data(df, '北京', '2024-01-01', '2024-06-30', '大数据开发工程师')
print(filtered_data)


TypeError: '>=' not supported between instances of 'str' and 'datetime.datetime'