# 匯入套件

In [9]:
#基本套件
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei'] 
plt.rcParams['axes.unicode_minus'] = False

#plotly畫圖套件
import plotly.express as px
import plotly.graph_objs as go

#機器學習套件
import numpy as np
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split



# 資料前處理

## 讀檔&設定index

In [10]:
# Load the prepared dataset (read as UTF-8) and use the '統計期' column as the
# row index; the index name is then cleared.
df = pd.read_csv(
    './dataset/processed_data/97-111 A1事件預測資料集.csv',
    encoding='utf-8',
).set_index('統計期')
df.index.name = None


## 數據&特徵 可視化

In [53]:
# Column groups; the columns of one group are shown together in the chart.
accident_num = ['肇事總件數(件)','A1類發生件數(件)','A2類發生件數(件)']
accident_people = ['A1類死亡人數(人)','A1類受傷人數(人)','A2類受傷人數(人)']
car_num = ['總計車輛數', '大客車', '大貨車', '小客車', '小貨車', '特種車', '機車-重型', '機車-輕型']
a1_index = ['A1類發生件數(件)','A1類死亡人數(人)','A1類受傷人數(人)']

# One entry per dropdown view: (button label, figure title, columns).
# Traces are added in this same order, which the visibility masks rely on.
views = [
    ("事故件數", "97-111 交通事故件數(件)", accident_num),
    ("事故傷亡人數", "97-111 交通事故傷亡人數(人)", accident_people),
    ("機動車輛數", "97-111 機動車輛數(輛)", car_num),
    # This entry used to reuse the label "事故件數", which made two dropdown
    # items indistinguishable.
    ("A1類指標", "97-111 A1類交通事故指數(件or人)", a1_index),
]

# Create the figure and add one line trace per column; only the first view
# is visible initially.
fig = go.Figure()
for position, (_, _, columns) in enumerate(views):
    for column in columns:
        fig.add_trace(
            go.Scatter(x=df.index, y=df[column], mode='lines', name=column,
                       visible=(position == 0)))

# Build one dropdown button per view. Each button shows the traces of its own
# view, hides all others, updates the title and clears annotations.
buttons = []
for selected, (label, title, _) in enumerate(views):
    mask = []
    for position, (_, _, columns) in enumerate(views):
        mask.extend([position == selected] * len(columns))
    buttons.append(
        dict(
            label=label,
            method="update",
            args=[{"visible": mask}, {"title": title, "annotations": []}],
        )
    )

# Layout: initial title, x axis with range slider and quick range buttons,
# and the dropdown that switches between views.
fig.update_layout(
    title=views[0][1],  # title of the initially visible view
    xaxis=dict(title="時間(月)", rangeslider_visible=True, rangeselector=dict(
        buttons=[
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(count=2, label="2y", step="year", stepmode="backward"),
            dict(count=3, label="3y", step="year", stepmode="backward"),
            dict(step="all"),
        ]
    )),
    updatemenus=[
        dict(buttons=buttons)
    ]
)

# Display the chart.
fig.show()

# Save the chart as a standalone HTML file.
fig.write_html("./dataset/graph/97-111.3 交通事故各指標視覺圖.html")


# 資料拆分&特徵工程

## 預測任務設定

看一下我們目前資料集中有的特徵

In [None]:
# Summarise the dataset: number of rows, number of columns, and column labels.
print(f'資料集的長度為: {len(df)}\n目前有的特徵數量為: {len(df.columns)}\n特徵為: {df.columns}')

任務目標是需要預測出'A1類發生件數(件)', 故y(label) = 'A1類發生件數(件)'

資料集中剩下的特徵就為 X(feature)  
但由於'肇事總件數(件)'與'A2類發生件數(件)'的數值過於接近，故移除其中相對較難取得真值的'肇事總件數(件)'

In [None]:
# Label: the 'A1類發生件數(件)' column.
y = df.loc[:, 'A1類發生件數(件)']
# Features: every column from position 3 onward.
X = df.iloc[:, 3:]

# Report the shapes of the label and the feature table.
for caption, data in (('label 的維度信息: ', y), ('feature 的維度信息: ', X)):
    print(caption, data.shape)

In [None]:

# Rows before this position (the first 70% of the rows) form the training
# split; the remaining rows form the test split.
split_at = int(0.7 * len(df))
X_train, X_test = X[:split_at], X[split_at:]
y_train, y_test = y[:split_at], y[split_at:]

# Report the shape of every split.
for caption, part in (
    ('X_train 的維度信息: ', X_train),
    ('X_test  的維度信息: ', X_test),
    ('y_train 的維度信息: ', y_train),
    ('y_test  的維度信息: ', y_test),
):
    print(caption, part.shape)

In [None]:
# Standardise the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the test split.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train 標準化後的箱型圖

In [54]:
# X_train is a NumPy array after scaling, so wrap it in a DataFrame that
# carries the original feature names before plotting.
X_train_df = pd.DataFrame(X_train, columns=X.columns)

# Unscaled rows of the training split. The "before" view used to plot all of
# X (training and test rows) although its title refers to X_train.
X_train_raw = X.iloc[:len(X_train_df)]

fig = go.Figure()

# Boxes for the standardised features (visible initially).
for column in X_train_df.columns:
    fig.add_trace(go.Box(y=X_train_df[column], name=column, visible=True))

# Boxes for the same features before standardisation (hidden initially).
for column in X_train_raw.columns:
    fig.add_trace(go.Box(y=X_train_raw[column], name=column, visible=False))

# Dropdown buttons toggling between the two groups of traces.
n_features = len(X_train_df.columns)
buttons = [
    dict(
        label="標準化後",
        method="update",
        args=[{"visible": [True] * n_features + [False] * n_features},
              {"title": "X_train(各參數)標準化後的箱型圖", "annotations": []}]
    ),
    dict(
        label="標準化前",
        method="update",
        args=[{"visible": [False] * n_features + [True] * n_features},
              {"title": "X_train(各參數)標準化前的箱型圖", "annotations": []}]
    )]

fig.update_layout(title='X_train(各參數)標準化後的箱型圖', updatemenus=[dict(buttons=buttons)])
fig.show()

# Save the chart as a standalone HTML file.
fig.write_html("./dataset/graph/X_train標準化前後對比的箱型圖.html")

NameError: name 'X_train' is not defined

# 模型建立

In [None]:
# Fit an ordinary linear regression on the training split, then predict the
# test split.
model = linear_model.LinearRegression()
model = model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# 預測 & 評估模型

In [None]:
# Columns of the result table: ground truth and the model's predictions.
result_columns = {
    'true value': y_test,
    'LinearRegression predict': y_predict,
}

# Create the table on the test-split index, then fill in each column.
result_df = pd.DataFrame(index=df.index[int(0.7 * len(df)):], columns=list(result_columns))
for column_name, values in result_columns.items():
    result_df[column_name] = values

In [None]:
# Fitted coefficients of the model.
print(f'Coefficients: {model.coef_}\n')

# Test-set metrics: mean squared error and R2 (an R2 of 1 is a perfect prediction).
test_mse = mean_squared_error(y_test, y_predict)
test_r2 = r2_score(y_test, y_predict)
print(f"Mean squared error: {test_mse}")
print(f'R2 score: {test_r2}')

In [None]:
# Box plot with one box per column of result_df (true values and predictions).
# The previous call passed `x=index`, but no name `index` is defined, so the
# cell raised NameError before drawing anything.
fig = px.box(result_df)
fig.show()

# 預測可視化

In [None]:
## 计算中间位置
# x_middle = (max(time_date) - min(time_date)) / 2 + min(time_date)
# y_middle = (max(max(y_test), max(y_predict)) - min(min(y_test), min(y_predict))) / 2 + min(min(y_test), min(y_predict))

# 或者指定右上角位置
# x_right_top = max(time_date)

# y_right_top = max(max(y_test), max(y_predict))

In [7]:
#基本套件
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
plt.rcParams['font.sans-serif'] = ['Microsoft JhengHei'] 
plt.rcParams['axes.unicode_minus'] = False

#plotly畫圖套件
import plotly.express as px
import plotly.graph_objs as go

#機器學習套件
import numpy as np
from dash import Dash, dcc, html, Input, Output
from sklearn import linear_model, tree, neighbors
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score

app = Dash(__name__)

# Regressor classes selectable in the dropdown, keyed by their display name.
# ("Multi-LinearRegression" was previously misspelled "Muti-LinearRegression".)
models = {'Multi-LinearRegression': linear_model.LinearRegression,
          'Decision Tree': tree.DecisionTreeRegressor,
          'k-NN': neighbors.KNeighborsRegressor}

# Page layout: heading, model dropdown and the graph filled in by the callback.
app.layout = html.Div([
    html.H4("預測A1類發生件數(件)"),
    html.P("Select model:"),
    dcc.Dropdown(
        id='dropdown',
        # Derive the options from `models` so the two can never disagree.
        options=list(models),
        value='Decision Tree',  # option selected by default
        clearable=False
    ),
    dcc.Graph(id='graph'),
])


@app.callback(
    Output('graph', 'figure'),
    Input('dropdown', 'value')
)
def train_and_display(name):
    """Train the selected model and return a truth-vs-prediction figure.

    Dash callback: triggered by the dropdown value, its return value becomes
    the ``figure`` of the ``graph`` component.

    Args:
        name: Key into the module-level ``models`` mapping; the mapped class
            is instantiated with default arguments.

    Returns:
        A plotly ``Figure`` with the test-split true values and predictions,
        annotated with the MSE and R2 score.

    Side effects:
        Re-reads the dataset CSV and overwrites
        ``./dataset/graph/預測A1類發生件數_時間序列圖.html`` on every call.
    """
    # Load the dataset (read as UTF-8), index it by '統計期' and clear the index name.
    df = pd.read_csv('./dataset/processed_data/97-111 A1事件預測資料集.csv', encoding='utf-8')
    df.set_index('統計期', inplace=True)
    df.index.name = None

    # Label and features (every column from position 3 onward).
    y = df['A1類發生件數(件)']
    X = df.iloc[:, 3:]

    # First 70% of the rows for training, the rest for testing.
    split_at = int(0.7 * len(df))
    X_train, X_test = X[:split_at], X[split_at:]
    y_train, y_test = y[:split_at], y[split_at:]
    time_date = df.index[split_at:]

    # Standardise using statistics from the training split only.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    model = models[name]()
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

    mse = mean_squared_error(y_test, y_predict)
    r2 = r2_score(y_test, y_predict)

    # True values and predictions over the test period.
    fig = go.Figure([
        go.Scatter(x=time_date, y=y_test,
                   name='真值', mode='lines+markers'),
        go.Scatter(x=time_date, y=y_predict,
                   name='預測值', mode='lines+markers'),
    ])

    fig.update_layout(
        title='A1類發生件數(件)',
        xaxis=dict(title="時間(月)", rangeslider_visible=True, rangeselector=dict(
            buttons=[
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(count=2, label="2y", step="year", stepmode="backward"),
                dict(count=3, label="3y", step="year", stepmode="backward"),
                dict(step="all"),
            ])
        ),
        # Metrics box placed at the top-right of the plotted data.
        annotations=[
            dict(
                # Use the last plotted x value. max(time_date) compares
                # labels, which need not be the right-most point when the
                # index holds strings. Assumes rows are in plotting order.
                x=time_date[-1],
                y=max(max(y_test), max(y_predict)),
                xanchor='right',
                yanchor='top',
                text=f'MSE: {mse:.2f} <br>R<sup>2</sup> Score: {r2:.2f}',
                showarrow=False,
                # rgba: white with 0.8 opacity for both background and border.
                bgcolor='rgba(255, 255, 255, 0.8)',
                bordercolor='rgba(255, 255, 255, 0.8)',
                borderwidth=2,
                borderpad=4
            )
        ]
    )

    fig.write_html("./dataset/graph/預測A1類發生件數_時間序列圖.html")

    return fig

# Start the development server. Newer Dash releases provide `app.run` and
# deprecate (later remove) `run_server`; use `run` when it exists and fall
# back to `run_server` on older installations.
start_server = getattr(app, "run", None) or app.run_server
start_server(debug=True, port=8051)

a
b


In [None]:

# Label each coefficient by its sign so the bars can be coloured accordingly.
colors = ['Positive' if c > 0 else 'Negative' for c in model.coef_]

fig = px.bar(
    x=X.columns, y=model.coef_, color=colors,
    # Pin each label to a colour. With a plain colour sequence the colours
    # depended on which sign happened to appear first.
    color_discrete_map={'Positive': 'red', 'Negative': 'blue'},
    labels=dict(x='Feature', y='Linear coefficient'),
    # The old title ("... predicting petal width") did not describe this
    # model, whose target is 'A1類發生件數(件)'.
    title='Weight of each feature for predicting A1類發生件數(件)'
)
fig.show()

# 完整code

In [5]:
#基本套件
import pandas as pd

#plotly畫圖套件
import plotly.express as px
import plotly.graph_objs as go

#機器學習套件
import numpy as np
import xgboost as xgb
from sklearn import linear_model
from sklearn import tree
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score

#定義函式

#設定一個資料處理的function
def data_process():
    """Load the dataset and return standardised train/test splits.

    Reads the CSV (as UTF-8), indexes it by '統計期', takes
    'A1類發生件數(件)' as the label and every column from position 3 onward
    as features, splits the rows 70/30 in file order, and standardises the
    features with a scaler fitted on the training rows only.

    Returns:
        Tuple ``(time_date, X_train, X_test, y_train, y_test)`` where
        ``time_date`` is the index of the test rows, the ``X_*`` values are
        the scaler's transformed outputs and the ``y_*`` values are pandas
        Series.
    """
    frame = pd.read_csv(
        './dataset/processed_data/97-111 A1事件預測資料集.csv',
        encoding='utf-8',
    ).set_index('統計期')
    frame.index.name = None

    # Label and feature columns.
    target = frame['A1類發生件數(件)']
    features = frame.iloc[:, 3:]

    # Position separating the training rows from the test rows.
    cut = int(0.7 * len(frame))
    X_train, X_test = features[:cut], features[cut:]
    y_train, y_test = target[:cut], target[cut:]

    # Standardise with statistics from the training rows only.
    scaler = preprocessing.StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Index of the test rows, kept for plotting and for the result table.
    time_date = frame.index[cut:]

    return time_date, X_train, X_test, y_train, y_test

#設定一個train model 跟 display結果的function
def train_display(model_name, model_instance, time_date, X_train, X_test, y_train, y_test):
    """Fit a model, plot truth vs. prediction and export the chart as HTML.

    Args:
        model_name: Name used in the figure title, the result column
            (``'<model_name> predict'``) and the output file name.
        model_instance: Estimator object providing ``fit`` and ``predict``.
        time_date: x values (index labels) of the test rows.
        X_train, X_test: Feature matrices for training and testing.
        y_train, y_test: Labels for training and testing.

    Returns:
        Tuple ``(fig, result_df)``: the plotly figure and the module-level
        result table.

    Side effects:
        Adds a prediction column to the module-level ``result_df`` and
        writes ``./dataset/graph/<model_name>.html``, with a model-selection
        drop-down inserted at the top of the page body.
    """
    from bs4 import BeautifulSoup as Soup

    # Train the model and predict the test rows.
    model = model_instance
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

    # Evaluation metrics on the test rows.
    mse = mean_squared_error(y_test, y_predict)
    r2 = r2_score(y_test, y_predict)

    # Store the predictions in the shared (module-level) result table.
    result_df['{} predict'.format(model_name)] = y_predict

    # Time-series chart of true values and predictions.
    fig = go.Figure([
        go.Scatter(x=time_date, y=y_test,
                   name='真值', mode='lines+markers'),
        go.Scatter(x=time_date, y=y_predict,
                   name='預測值', mode='lines+markers'),
    ])
    fig.update_layout(
        title='{}_A1類發生件數(件)'.format(model_name),
        # x axis with a range slider and quick range buttons.
        xaxis=dict(title="時間(月)", rangeslider_visible=True, rangeselector=dict(
            buttons=[
                dict(count=6, label="6m", step="month", stepmode="backward"),
                dict(count=1, label="1y", step="year", stepmode="backward"),
                dict(count=2, label="2y", step="year", stepmode="backward"),
                dict(count=3, label="3y", step="year", stepmode="backward"),
                dict(step="all"),
            ])
        ),
        # Box showing MSE and R2 at the top-right of the plotted data.
        annotations=[
            dict(
                # Use the last plotted x value. max(time_date) compares
                # labels, which need not be the right-most point when the
                # index holds strings. Assumes rows are in plotting order.
                x=time_date[-1],
                y=max(max(y_test), max(y_predict)),
                xanchor='right',
                yanchor='top',
                # <sup>2</sup> renders a superscript; .2f keeps two decimals.
                text=f'MSE: {mse:.2f} <br>R<sup>2</sup> Score: {r2:.2f}',
                showarrow=False,
                # rgba: white with 0.8 opacity for both background and border.
                bgcolor='rgba(255, 255, 255, 0.8)',
                bordercolor='rgba(255, 255, 255, 0.8)',
                borderwidth=2,  # border width of 2 px
                borderpad=4     # 4 px padding between text and border
            )
        ]
    )

    html_path = "./dataset/graph/{}.html".format(model_name)
    fig.write_html(html_path)

    # Re-open the exported page to inject the model selector. Read it with an
    # explicit encoding (the page contains non-ASCII text) and a context
    # manager so the handle is always closed. Name the parser explicitly so
    # the result does not depend on which parsers are installed.
    with open(html_path, "r", encoding="utf-8") as source:
        soup = Soup(source.read(), "html.parser")

    append_html = Soup("""<select id="abc" style="width:auto; font-size:18px">
                            <option value="./Decision_Tree.html">Decision_Tree</option>
                            <option value="./XGB_Regression.html">XGB_Regression</option>
                            <option value="./Multi-LinearRegression.html">Multi-LinearRegression</option> 
                        </select>
                        <script>
                            var path = window.location.pathname;
                            var page = path.split("/").pop();
                            const abc = document.getElementById("abc");
                            abc.value = "./" + page;
                            console.log(page);
                            abc.addEventListener("change",function(){
                                var url = this.value;
                                window.location = url;
                            })
                        </script>""", "html.parser")
    soup.body.insert(0, append_html)

    # Open with "w" so the file is truncated first. The previous "r+" mode
    # overwrote in place and could leave stale trailing bytes behind.
    with open(html_path, "w", encoding="utf-8") as target:
        target.write(str(soup))

    return fig, result_df



#設定要使用哪些模型
# Estimators to compare, keyed by the name used for result columns and
# output file names.
models = {
    'Multi-LinearRegression': linear_model.LinearRegression(),
    'Decision_Tree': tree.DecisionTreeRegressor(max_depth=5),
    'XGB_Regression': xgb.XGBRegressor(),
}

# Prepare the train/test data once; every model reuses the same splits.
time_date, X_train, X_test, y_train, y_test = data_process()

# Result table on the test index, seeded with the true values;
# train_display adds one prediction column per model.
result_df = pd.DataFrame({'true value': y_test}, index=time_date)

# Train, plot and export each model in turn.
split_data = (time_date, X_train, X_test, y_train, y_test)
for model_name, model_instance in models.items():
    train_display(model_name, model_instance, *split_data)

# Persist the true values and all predictions.
result_df.to_csv('./dataset/processed_data/A1事件預測結果資料集.csv', encoding='utf-8')