In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score

In [5]:
salary_df = pd.read_csv('Salary.csv')

## 1. 预测职业Model - 随机森林分类模型（Random Forest Classifier)

## 1.1 查看数据结构和预处理

In [6]:
# 查看数据基本信息
print(salary_df.info())

# 显示前几行数据
salary_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6684 entries, 0 to 6683
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6684 non-null   float64
 1   Gender               6684 non-null   object 
 2   Education Level      6684 non-null   int64  
 3   Job Title            6684 non-null   object 
 4   Years of Experience  6684 non-null   float64
 5   Salary               6684 non-null   float64
 6   Country              6684 non-null   object 
 7   Race                 6684 non-null   object 
 8   Senior               6684 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 470.1+ KB
None


Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0


### 选取特征和目标变量：

In [7]:
X = salary_df[['Gender', 'Race', 'Education Level']]
y = salary_df['Job Title']

### 编码类别变量（使用 One-Hot Encoding）

In [8]:
# 使用 ColumnTransformer 对类别列做 One-Hot 编码
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['Gender', 'Race', 'Education Level'])
    ])

# 构建完整 pipeline：预处理 + 分类器
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

##  1.2 划分训练集与测试集

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 1.3 训练模型

In [10]:
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Gender', 'Race',
                                                   'Education Level'])])),
                ('classifier', RandomForestClassifier(random_state=42))])

## 1.4 模型评估

In [26]:
# 预测
y_pred = model.predict(X_test)

# 准确率
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# 分类报告
print(classification_report(y_test, y_pred))

# 混淆矩阵
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Model Accuracy: 0.19
                                 precision    recall  f1-score   support

                     Accountant       0.00      0.00      0.00         1
       Administrative Assistant       0.00      0.00      0.00         1
             Back end Developer       0.00      0.00      0.00        56
               Business Analyst       0.00      0.00      0.00         2
 Business Development Associate       0.00      0.00      0.00         2
   Business Development Manager       0.00      0.00      0.00         2
  Business Intelligence Analyst       0.00      0.00      0.00         1
    Business Operations Analyst       0.00      0.00      0.00         1
      Content Marketing Manager       0.00      0.00      0.00        12
                     Copywriter       0.00      0.00      0.00         1
       Customer Service Manager       0.00      0.00      0.00         1
           Customer Service Rep       0.00      0.00      0.00         1
Customer Service Representati

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 1.5 使用模型预测新学生数据

In [27]:
# 新学生数据
new_student = pd.DataFrame({
    'Gender': ['Female'],
    'Race': ['White'],
    'Education Level': ["Master's"]
})

# 预测职业
predicted_job_title = model.predict(new_student)
print(f"Predicted Job Title: {predicted_job_title[0]}")

Predicted Job Title: Software Engineer


## 2. 预测薪酬 Model - 随机森林回归模型（Random Forest Regressor

### 2.1 查看数据结构和预处理

In [28]:
# 查看数据基本信息
print(salary_df.info())

# 显示前几行数据
salary_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6684 entries, 0 to 6683
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  6684 non-null   float64
 1   Gender               6684 non-null   object 
 2   Education Level      6684 non-null   int64  
 3   Job Title            6684 non-null   object 
 4   Years of Experience  6684 non-null   float64
 5   Salary               6684 non-null   float64
 6   Country              6684 non-null   object 
 7   Race                 6684 non-null   object 
 8   Senior               6684 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 470.1+ KB
None


Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Senior
0,32.0,Male,1,Software Engineer,5.0,90000.0,UK,White,0
1,28.0,Female,2,Data Analyst,3.0,65000.0,USA,Hispanic,0
2,45.0,Male,3,Manager,15.0,150000.0,Canada,White,1
3,36.0,Female,1,Sales Associate,7.0,60000.0,USA,Hispanic,0
4,52.0,Male,2,Director,20.0,200000.0,USA,Asian,0


### 选取特征和目标变量：

In [29]:
# 特征列
X = salary_df[['Age', 'Education Level', 'Job Title', 'Years of Experience']]
# 目标变量
y = salary_df['Salary']

### 2.2 预处理：编码分类变量 + 数值标准化

In [30]:
# 分类变量列（Education Level, Job Title）
categorical_features = ['Education Level', 'Job Title']

# 数值变量列（Age, Years of Experience）
numeric_features = ['Age', 'Years of Experience']

# 构建预处理器
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

### 2.3 构建模型流水线（Pipeline）

In [31]:
# 构建完整 pipeline：预处理 + 模型
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

### 2.4 划分训练集和测试集

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 2.5 训练模型

In [36]:
model.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['Age',
                                                   'Years of Experience']),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Education Level',
                                                   'Job Title'])])),
                ('regressor', RandomForestRegressor(random_state=42))])

### 2.6 评估模型表现

In [37]:
# 预测
y_pred = model.predict(X_test)

# 评估指标
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Squared Error: 84329476.44
Root Mean Squared Error: 9183.11
R² Score: 0.97


### 2.7 用模型预测新学生数据的 Salary

In [38]:
new_student = pd.DataFrame({
    'Age': [24],
    'Education Level': ["Master's"],
    'Job Title': ["Data Scientist"],
    'Years of Experience': [1]
})

predicted_salary = model.predict(new_student)
print(f"Predicted Salary: ${predicted_salary[0]:,.2f}")

Predicted Salary: $72,290.00
