In [120]:
import numpy as np
import pandas as pd
from tensorflow import keras

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

import xgboost as xgb
from sklearn.svm import SVR
from tensorflow.keras import layers
from sklearn.tree import  DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [121]:
seed = 42 
np.random.seed(42)

file_path = './ds_salaries.csv'

In [122]:
df = pd.read_csv(filepath_or_buffer=file_path)

df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [123]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [124]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [125]:
categorical_columns = df.columns[df.dtypes == 'object']

categorical_columns

Index(['experience_level', 'employment_type', 'job_title', 'salary_currency',
       'employee_residence', 'company_location', 'company_size'],
      dtype='object')

In [126]:
for col in categorical_columns:
    unique_values = df[col].unique()
    print(f"{col} : {unique_values}\n")

experience_level : ['SE' 'MI' 'EN' 'EX']

employment_type : ['FT' 'CT' 'FL' 'PT']

job_title : ['Principal Data Scientist' 'ML Engineer' 'Data Scientist'
 'Applied Scientist' 'Data Analyst' 'Data Modeler' 'Research Engineer'
 'Analytics Engineer' 'Business Intelligence Engineer'
 'Machine Learning Engineer' 'Data Strategist' 'Data Engineer'
 'Computer Vision Engineer' 'Data Quality Analyst'
 'Compliance Data Analyst' 'Data Architect'
 'Applied Machine Learning Engineer' 'AI Developer' 'Research Scientist'
 'Data Analytics Manager' 'Business Data Analyst' 'Applied Data Scientist'
 'Staff Data Analyst' 'ETL Engineer' 'Data DevOps Engineer' 'Head of Data'
 'Data Science Manager' 'Data Manager' 'Machine Learning Researcher'
 'Big Data Engineer' 'Data Specialist' 'Lead Data Analyst'
 'BI Data Engineer' 'Director of Data Science'
 'Machine Learning Scientist' 'MLOps Engineer' 'AI Scientist'
 'Autonomous Vehicle Technician' 'Applied Machine Learning Scientist'
 'Lead Data Scientist' 'Cloud Da

In [127]:
df.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


In [128]:
data_values = df.work_year.value_counts()

fig = go.Figure(data=go.Pie(
    labels=data_values.index, 
    values=data_values.values, 
    hole=0.4, 
    textinfo='label+percent',
    insidetextorientation='radial',
    marker=dict(
        colors=px.colors.sequential.RdBu, 
        line=dict(
            color='honeydew', 
            width=2
        )
    ),
))

fig.update_layout(
    title="数据在各个年份的分布",
    annotations=[dict(text="年份", showarrow=False, font_size=20)],
    height=600
)

fig.show()

In [129]:
mean_salary = df.groupby('work_year')['salary_in_usd'].mean()

fig = make_subplots()

bar_trace = go.Bar(
    x=mean_salary.index,
    y=mean_salary.values,
    name="平均工资",
    marker=dict(color='steelblue') 
)
fig.add_trace(bar_trace)

line_trace = go.Scatter(
    x=mean_salary.index,
    y=mean_salary.values,
    name="平均工资",
    mode="lines+markers", 
    line=dict(color='darkorange'),
    marker=dict(symbol='circle-open', size=15)
)
fig.add_trace(line_trace)

fig.update_layout(
    title="平均工资在各个年份的分布",
    xaxis_title="年份",
    yaxis_title="平均工资",
    height=600
)

fig.show()

In [130]:
fig = px.box(
    data_frame=df, 
    x='work_year', 
    y='salary_in_usd', 
    color='work_year', 
    points='all', 
    height=600, 
    notched=True
)

fig.update_layout(
    title="工作年份与薪酬（美元）的关系",
    xaxis_title="工作年份",
    yaxis_title="薪酬（美元）"
)

fig.show()


In [131]:
data_values = df.experience_level.value_counts()


pie_fig = go.Figure(data=go.Pie(
    labels=data_values.index, 
    values=data_values.values, 
    hole=0.4, 
    textinfo='label+percent',
    insidetextorientation='radial',
    marker=dict(
        colors=px.colors.sequential.RdBu, 
        line=dict(
            color='honeydew', 
            width=2
        )
    )
))

pie_fig.update_layout(
    title="按工作经验划分数据分布",
    annotations=[dict(text="工作经验", showarrow=False, font_size=12)],
    height=600
)

pie_fig.show()

bar_fig = go.Figure(data=go.Bar(
    x=data_values.index,
    y=data_values.values,
    text=data_values.values,
    textposition='outside',
    marker=dict(
        color=px.colors.sequential.RdBu,
        line_color='white',
        line_width=2.5,
    )
)
)

bar_fig.update_layout(
    title="按工作经验划分数据分布",
    font_size=15,
    height=600
)

bar_fig.show()

In [132]:
data_values = df.work_year.value_counts()

fig = go.Figure(data=go.Pie(
    labels=data_values.index, 
    values=data_values.values, 
    hole=0.4, 
    textinfo='label+percent',
    insidetextorientation='radial',
    marker=dict(
        colors=px.colors.sequential.RdBu, 
        line=dict(
            color='honeydew', 
            width=2
        )
    ),
))

fig.update_layout(
    title="数据在各个年份的分布",
    annotations=[dict(text="年份", showarrow=False, font_size=20)],
    height=600
)


fig.show()

In [133]:
fig = px.histogram(
    data_frame=df,
    x='experience_level',
    facet_col='work_year',
    nbins=7,
    text_auto=True,
    labels={'experience_level': '不同的雇佣类型', 'count': '数量'},
    title='不同年份的雇佣类型的分布'
)

fig.update_traces(
    textposition='auto',
    marker_color='brown',
)

fig.update_layout(yaxis_title='数量')

fig.show()

In [134]:
count_values = df.experience_level.value_counts()
salary_values = df.groupby('experience_level').mean(
    numeric_only=True)['salary_in_usd']

fig = make_subplots(rows=1, cols=2, specs=[
                    [{'type': 'domain'}, {'type': 'domain'}]])

count_trace = go.Pie(
    labels=count_values.index,
    values=count_values.values,
    pull=[0, 0, 0, 0.2],
    name="数量"
)

salary_trace = go.Pie(
    labels=salary_values.index,
    values=salary_values.values,
    pull=[0, 0.2, 0, 0],
    name="平均工资"
)


fig.add_trace(count_trace, row=1, col=1)
fig.add_trace(salary_trace, row=1, col=2)

# Update the traces
fig.update_traces(
    textinfo='label+percent',
    hole=0.4,
    marker=dict(
        colors=px.colors.sequential.RdBu,
        line_color='lavender',
        line_width=2.5
    )
)

fig.update_layout(
    title_text="根据雇佣类型划分的平均工资分布",
    height=600,
    annotations=[
        dict(
            text="数量",
            font_size=17,
            showarrow=False,
            x=0.17
        ),
        dict(
            text="平均工资",
            font_size=20,
            showarrow=False,
            x=0.83
        )
    ]
)

fig.show()

fig = make_subplots()

bar_trace = go.Bar(
    x=salary_values.index,
    y=salary_values.values,
    name="平均工资(柱状图)",
    text=salary_values.values,
    marker=dict(
        color='brown',
        line_color='white',
        line_width=2.5,
    )
)

fig.add_trace(bar_trace)

fig.update_layout(
    title="根据雇佣类型划分的平均工资分布",
    xaxis_title="雇佣类型",
    yaxis_title="平均工资（美元）",
    height=600
)

fig.show()

In [135]:
count_values = df.experience_level.value_counts()
salary_values = df.groupby('experience_level').mean(
    numeric_only=True)['salary_in_usd']

fig = make_subplots(rows=1, cols=2, specs=[
                    [{'type': 'domain'}, {'type': 'domain'}]])

count_trace = go.Pie(
    labels=count_values.index,
    values=count_values.values,
    pull=[0, 0, 0, 0.2],
    name="数量"
)

salary_trace = go.Pie(
    labels=salary_values.index,
    values=salary_values.values,
    pull=[0, 0.2, 0, 0],
    name="平均工资"
)

fig.add_trace(count_trace, row=1, col=1)
fig.add_trace(salary_trace, row=1, col=2)

fig.update_traces(
    textinfo='label+percent',
    hole=0.4,
    marker=dict(
        colors=px.colors.sequential.RdBu,
        line_color='lavender',
        line_width=2.5
    )
)

fig.update_layout(
    title_text="根据工作经验划分的数据分布",
    height=600,
    annotations=[
        dict(
            text="数量",
            font_size=17,
            showarrow=False,
            x=0.17
        ),
        dict(
            text="平均工资",
            font_size=20,
            showarrow=False,
            x=0.83
        )
    ]
)

fig.show()

fig = make_subplots()

bar_trace = go.Bar(
    x=salary_values.index,
    y=salary_values.values,
    name="平均工资(柱状图)",
    text=salary_values.values,
    marker=dict(
        color='brown',
        line_color='white',
        line_width=2.5,
    )
)

fig.add_trace(bar_trace)

fig.update_layout(
    title="根据雇佣类型划分的平均工资分布",
    xaxis_title="雇佣类型",
    yaxis_title="平均工资（美元）",
    height=600
)

fig.show()

In [136]:
fig = px.box(df, x='experience_level', y='salary_in_usd', color='experience_level', facet_col='work_year')

fig.update_layout(
    title="根据工作经验划分的薪酬（美元）分布",
    yaxis_title="薪酬（美元）",
    height=600
)

fig.update_xaxes(title_text="工作经验")

fig.show()

In [137]:
fig = px.violin(df, x='experience_level', y='salary_in_usd', color='experience_level', facet_col='work_year', points='all')

fig.update_layout(
    title="根据工作经验划分的薪酬（美元）分布",
    yaxis_title="薪酬（美元）",
    height=600
)

fig.update_xaxes(title_text="工作经验")
fig.show()

In [138]:
fig = px.sunburst(df, path=['work_year', 'experience_level'], values='salary_in_usd')

fig.update_layout(
    title='按照工作经验和工作年份划分的总薪酬分布', 
    height=600  
)

fig.show()

In [139]:
count_values = df.employment_type.value_counts()
salary_values = df.groupby('employment_type').mean(
    numeric_only=True)['salary_in_usd']


fig = make_subplots(rows=1, cols=2, specs=[
                    [{'type': 'domain'}, {'type': 'domain'}]])

count_trace = go.Pie(
    labels=count_values.index,
    values=count_values.values,
    name="Value Counts"
)

salary_trace = go.Pie(
    labels=salary_values.index,
    values=salary_values.values,
    name="Mean Salary"
)

fig.add_trace(count_trace, row=1, col=1)
fig.add_trace(salary_trace, row=1, col=2)


fig.update_traces(
    textinfo='label+percent',
    hole=0.4,
    marker=dict(
        colors=px.colors.sequential.RdBu,
        line_color='lavender',
        line_width=2.5
    )
)

fig.update_layout(
    title_text="根据工作经验划分的数据分布",
    height=600,
    annotations=[
        dict(
            text="数量",
            font_size=17,
            showarrow=False,
            x=0.17
        ),
        dict(
            text="平均薪酬",
            font_size=20,
            showarrow=False,
            x=0.83
        )
    ]
)

fig.show()


fig = make_subplots()

bar_trace = go.Bar(
    x=salary_values.index,
    y=salary_values.values,
    name="Mean Salary (Bar)",
    text=salary_values.values,
    marker=dict(
        color='brown',
        line_color='white',
        line_width=2.5,
    )
)

fig.add_trace(bar_trace)
 

fig.update_layout(
    title="不同工作经验的平均工资分布",
    xaxis_title="工作经验",
    yaxis_title="平均薪酬（美元）",
    height=600
)


fig.show()

In [140]:
box_plot = px.box(
    data_frame=df,
    x='employment_type',
    y='salary_in_usd',
    points='outliers',
    title="不同的工作岗位的薪酬分布",
    color='employment_type',
    notched=True
)

box_plot.update_layout(
    xaxis_title="工作岗位类型",
    yaxis_title="薪酬（美元）",
    height=600
)

box_plot.show()

facet_box_plot = px.box(
    data_frame=df,
    x='employment_type',
    y='salary_in_usd',
    points='outliers',
    title="不同的工作岗位的薪酬分布",
    color='employment_type',
    facet_col='experience_level'
)

facet_box_plot.update_layout(
    xaxis_title="工作岗位类型",
    yaxis_title="薪酬（美元）",
    height=600
)

facet_box_plot.update_xaxes(title_text="工作岗位类型")
facet_box_plot.show()

violin_plot = px.violin(
    data_frame=df,
    x='employment_type',
    y='salary_in_usd',
    points='all',
    title="不同的工作岗位的薪酬分布",
    color='employment_type'
)

violin_plot.update_layout(
    xaxis_title="工作岗位类型",
    yaxis_title="薪酬（美元）",
    height=600
)

violin_plot.show()


In [141]:
box_plot = px.box(
    data_frame=df,
    x='work_year',
    y='salary_in_usd',
    color='employment_type',
    title="不同的工作岗位类型的薪酬分布",
    
    labels={'work_year': '年份', 'salary_in_usd': '薪酬（美元）', 'employment_type': 'Employment Type'}
)

box_plot.update_layout(
    legend_title="工作岗位类型",
    height=600
)

box_plot.show()


In [142]:
fig = px.sunburst(df, path=['experience_level', 'employment_type', 'work_year'], values='salary_in_usd')

fig.update_layout(
    title='按照工作年份，工作岗位类型和工作经验水平划分的总薪酬分布',
    height=600 )

fig.show()

In [143]:
count_values = df.company_size.value_counts()
salary_values = df.groupby('company_size').mean(
    numeric_only=True)['salary_in_usd']

fig = make_subplots(rows=1, cols=2, specs=[
                    [{'type': 'domain'}, {'type': 'domain'}]])

count_trace = go.Pie(
    labels=count_values.index,
    values=count_values.values,
    name="Value Counts"
)

salary_trace = go.Pie(
    labels=salary_values.index,
    values=salary_values.values,
    name="Mean Salary"
)

fig.add_trace(count_trace, row=1, col=1)
fig.add_trace(salary_trace, row=1, col=2)

fig.update_traces(
    textinfo='label+percent',
    hole=0.4,
    marker=dict(
        colors=px.colors.sequential.RdBu,
        line_color='lavender',
        line_width=2.5
    )
)
fig.update_layout(
    title_text="根据不同公司规模划分的分布",
    height=600,
    annotations=[
        dict(
            text="数量",
            font_size=17,
            showarrow=False,
            x=0.17
        ),
        dict(
            text="平均薪酬",
            font_size=20,
            showarrow=False,
            x=0.83
        )
    ]
)

fig.show()

fig = make_subplots()

bar_trace = go.Bar(
    x=salary_values.index,
    y=salary_values.values,
    name="平均薪酬 (柱状图)",
    text=salary_values.values,
    marker=dict(
        color='brown',
        line_color='white',
        line_width=2.5,
    )
)

fig.add_trace(bar_trace)


fig.update_layout(
    title="根据不同公司规模划分的平均薪酬分布",
    xaxis_title="工作经验",
    yaxis_title="平均薪酬（美元）",
    height=600
)

fig.show()

In [144]:
histogram = px.histogram(
    data_frame=df,
    x='salary_currency',
    y='salary_in_usd',
    text_auto=True,
    color='company_size',
    title='按照薪酬币种划分的薪酬分布',
    labels={'salary_currency': 'Salary Currency', 'salary_in_usd': 'Salary (USD)'}
)

histogram.update_layout(
    showlegend=True,
    height=600
)

histogram.show()

swarm_plot = px.strip(
    data_frame=df,
    x='salary_currency',
    y='salary_in_usd',
    color='company_size',
    title='按照薪酬币种划分的薪酬分布',
    labels={'salary_currency': '不同货币', 'salary_in_usd': '薪酬（美元）'},
    hover_data={'salary_in_usd': ':$.2f'}
)

swarm_plot.update_layout(
    showlegend=True,
    height=600
)

swarm_plot.show()


In [145]:
location_counts = (df['company_location'] == df['employee_residence']).value_counts()

pie_chart = px.pie(
    names=['Matching Locations', 'Non-Matching Locations'],
    values=location_counts.values,
    title='公司所在地和员工居住地的匹配情况',
    color_discrete_sequence=px.colors.sequential.RdBu,
    height=600,
    hole=0.4,
)

pie_chart.update_traces(
    insidetextorientation='radial',
    textinfo='label+percent',
    marker=dict(
        line=dict(
            color='honeydew',
            width=2
        )
    )
)

pie_chart.show()


In [146]:
remote_counts = df['remote_ratio'].value_counts()

fig = px.pie(
    names=remote_counts.index,
    values=remote_counts.values,
    color_discrete_sequence=px.colors.sequential.RdBu,
    hole=0.4,
    title="远程工作比例分布",
    height=600
)

fig.update_traces(
    textinfo='label+percent',
    marker=dict(
        line=dict(
            color='honeydew',
            width=2
        )
    )
)

fig.show()

In [147]:
histogram = px.histogram(
    data_frame=df,
    x='remote_ratio',
    color='work_year',
    barmode='group',
    title='工作年份和远程工作比例分布',
    labels={'remote_ratio': '远程工作比例', 'work_year': '工作年份'},
    height=500
)
histogram.update_yaxes(title_text='数量')
histogram.update_layout(showlegend=True)
histogram.show()

box_plot = px.box(
    data_frame=df,
    x='remote_ratio',
    y='salary_in_usd',
    color='work_year',
    title='工作年限和远程工作比例对薪酬的影响',
    labels={'remote_ratio': '远程工作比例', 'salary_in_usd': '薪酬 (美元)', 'work_year': '工作年份'},
    height=500
)
box_plot.show()

sunburst = px.sunburst(
    data_frame=df,
    path=['remote_ratio', 'work_year'],
    title='远程比例和工作年份的层次结构',
    height=500
)
sunburst.show()

In [148]:
processed_data = df.copy()

processed_data.drop(columns=['work_year', 'job_title', 'salary_currency', 'salary', 'employee_residence', 'company_location'], inplace=True)

processed_data.columns = ['Exp_level', "Employment_Type", 'Salary', "Remote_Ratio", 'Cmp_Size']

processed_data.tail()

Unnamed: 0,Exp_level,Employment_Type,Salary,Remote_Ratio,Cmp_Size
3750,SE,FT,412000,100,L
3751,MI,FT,151000,100,L
3752,EN,FT,105000,100,S
3753,EN,CT,100000,100,L
3754,SE,FT,94665,50,L


In [149]:
categorical_cols = processed_data.select_dtypes(include=['object']).columns

cat_encoders = {}

for col in categorical_cols:
    encoder = OrdinalEncoder()
    processed_data[col] = encoder.fit_transform(processed_data[[col]].to_numpy())
    cat_encoders[col] = encoder
    
processed_data.head()

Unnamed: 0,Exp_level,Employment_Type,Salary,Remote_Ratio,Cmp_Size
0,3.0,2.0,85847,100,0.0
1,2.0,0.0,30000,100,2.0
2,2.0,0.0,25500,100,2.0
3,3.0,2.0,175000,100,1.0
4,3.0,2.0,120000,100,1.0


In [150]:
scaler = StandardScaler()

mirror_data = processed_data.copy()
target = mirror_data.pop('Salary').to_numpy()
features = mirror_data.to_numpy()

scalled_data = scaler.fit_transform(features)

In [151]:
linear_corr = processed_data.corr(method='pearson').round(2)

corr_heatmap = px.imshow(linear_corr,
                         text_auto=True,
                         labels=dict(x="Features", y="Features",
                                     color="Correlation"),
                         color_continuous_scale=px.colors.sequential.RdBu,
                         title="线性相关（热图）",
                         height=600
                         )

corr_heatmap.update_layout(
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_title="特征",
    yaxis_title="特征",
    coloraxis_colorbar_title="相关性"
)

corr_heatmap.show()

In [152]:
spearman_corr = processed_data.corr(method='spearman').round(2)

corr_heatmap = px.imshow(spearman_corr,
                         text_auto=True,
                         labels=dict(x="Features", y="Features",
                                     color="Correlation"),
                         color_continuous_scale=px.colors.sequential.RdBu,
                         title="Spearman 相关性（热图）",
                         height=600
                         )

corr_heatmap.update_layout(
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    xaxis_title="特征",
    yaxis_title="特征",
    coloraxis_colorbar_title="相关性"
)

corr_heatmap.show()

In [153]:
X_train, X_test, y_train, y_test = train_test_split(
    scalled_data, 
    target, 
    train_size=0.8, 
    test_size=0.2, 
    random_state=42
)

performances = pd.DataFrame(columns=["Model_Name", "MSE", "RMSE", "MAE", "R2_Score"])

In [154]:
def compute_performance(model, test_data):
 
    X, y = test_data
    
    model_predictions = model.predict(X)
    
    mse = mean_squared_error(y, model_predictions)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, model_predictions)
    r2 = r2_score(y, model_predictions)
    return mse, rmse, mae, r2

In [155]:
lr_model = LinearRegression()

lr_model.fit(X_train, y_train)

lr_predictions = lr_model.predict(X_test)

lr_mse, lr_rmse, lr_mae, lr_r2 = compute_performance(model=lr_model, test_data=[X_test, y_test])

performances.loc[0] = ["Linear Regression", lr_mse, lr_rmse, lr_mae, lr_r2]

print(f"Mean Squared Error: {lr_mse:.2f}")
print(f"Root Mean Squared Error: {lr_rmse:.2f}")
print(f"Mean Absolute Error: {lr_mae:.2f}")
print(f"R-squared: {lr_r2:.2f}")


Mean Squared Error: 3463457997.60
Root Mean Squared Error: 58851.15
Mean Absolute Error: 45998.33
R-squared: 0.12


In [156]:
svr_model = SVR()

svr_model.fit(X_train, y_train)

svr_predictions = svr_model.predict(X_test)

svr_mse, svr_rmse, svr_mae, svr_r2 = compute_performance(model=svr_model, test_data=[X_test, y_test])

performances.loc[1] = ["SVR", svr_mse, svr_rmse, svr_mae, svr_r2]

print(f"Mean Squared Error: {svr_mse:.2f}")
print(f"Root Mean Squared Error: {svr_rmse:.2f}")
print(f"Mean Absolute Error: {svr_mae:.2f}")
print(f"R-squared: {svr_r2:.2f}")


Mean Squared Error: 3944415638.60
Root Mean Squared Error: 62804.58
Mean Absolute Error: 48552.75
R-squared: 0.00


In [157]:
dtr_model = DecisionTreeRegressor()

dtr_model.fit(X_train, y_train)

dtr_predictions = dtr_model.predict(X_test)

dtr_mse, dtr_rmse, dtr_mae, dtr_r2 = compute_performance(model=dtr_model, test_data=[X_test, y_test])

performances.loc[2] = ["DTR", dtr_mse, dtr_rmse, dtr_mae, dtr_r2]

print(f"Mean Squared Error: {dtr_mse:.2f}")
print(f"Root Mean Squared Error: {dtr_rmse:.2f}")
print(f"Mean Absolute Error: {dtr_mae:.2f}")
print(f"R-squared: {dtr_r2:.2f}")

Mean Squared Error: 3171655830.63
Root Mean Squared Error: 56317.46
Mean Absolute Error: 43745.52
R-squared: 0.20


In [158]:
rfr_model = RandomForestRegressor()

rfr_model.fit(X_train, y_train)

rfr_predictions = rfr_model.predict(X_test)

rfr_mse, rfr_rmse, rfr_mae, rfr_r2 = compute_performance(model=rfr_model, test_data=[X_test, y_test])

performances.loc[3] = ["RFR", rfr_mse, rfr_rmse, rfr_mae, rfr_r2]

print(f"Mean Squared Error: {rfr_mse:.2f}")
print(f"Root Mean Squared Error: {rfr_rmse:.2f}")
print(f"Mean Absolute Error: {rfr_mae:.2f}")
print(f"R-squared: {rfr_r2:.2f}")

Mean Squared Error: 3166080618.21
Root Mean Squared Error: 56267.94
Mean Absolute Error: 43703.84
R-squared: 0.20


In [159]:
xgb_model = xgb.XGBRegressor()

xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(X_test)

xgb_mse, xgb_rmse, xgb_mae, xgb_r2 = compute_performance(model=xgb_model, test_data=[X_test, y_test])

performances.loc[4] = ["XGB", xgb_mse, xgb_rmse, xgb_mae, xgb_r2]

print(f"Mean Squared Error: {xgb_mse:.2f}")
print(f"Root Mean Squared Error: {xgb_rmse:.2f}")
print(f"Mean Absolute Error: {xgb_mae:.2f}")
print(f"R-squared: {xgb_r2:.2f}")


Mean Squared Error: 3170733542.79
Root Mean Squared Error: 56309.27
Mean Absolute Error: 43716.80
R-squared: 0.20


In [160]:
fig_mse = px.bar(performances, x='Model_Name', y='MSE', title='均方误差比较')

fig_mse.update_layout(
    xaxis_title='模型',
    yaxis_title='均方误差',
    showlegend=False
)

fig_mse.show()

fig_rmse = px.bar(performances, x='Model_Name', y='RMSE', title='均方根误差比较')

fig_rmse.update_layout(
    xaxis_title='模型',
    yaxis_title='均方根误差',
    showlegend=False
)

fig_rmse.show()

fig_mae = px.bar(performances, x='Model_Name', y='MAE', title='平均绝对误差比较')

fig_mae.update_layout(
    xaxis_title='模型',
    yaxis_title='平均绝对误差',
    showlegend=False
)

fig_mae.show()

fig_r2 = px.bar(performances, x='Model_Name', y='R2_Score', title='R 平方比较')

fig_r2.update_layout(
    xaxis_title='模型',
    yaxis_title='R 平方',
    showlegend=False
)

fig_r2.show()
