In [None]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.models import load_model

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
import joblib

import numpy as np
import pandas as pd
import os

In [None]:
# !pip list --format=freeze > requirements.txt


In [None]:
# Load the 2024 hourly weather-station export and derive an hour-level key.
weather_csv_path = "./data/C0Z100_2024.csv"
weather_donghua_df = pd.read_csv(weather_csv_path)
print(weather_donghua_df.columns)

# The first (unnamed) column holds the timestamp as "MM/DD/YYYY HH:MM".
parsed_timestamps = pd.to_datetime(
    weather_donghua_df['Unnamed: 0'], format='%m/%d/%Y %H:%M'
)
weather_donghua_df['datetime'] = parsed_timestamps
# YYYYMMDDHH string used later as the join key against the sensor data.
weather_donghua_df['formatted_datetime'] = parsed_timestamps.dt.strftime('%Y%m%d%H')

print(weather_donghua_df[['Unnamed: 0', 'formatted_datetime']])
print(weather_donghua_df.columns)



In [None]:
# Index the weather records by their YYYYMMDDHH key:
# {formatted_datetime: {column_name: value, ...}, ...}
weather_data_dict = weather_donghua_df.set_index('formatted_datetime').T.to_dict()

# Preview only the first few entries (the previous `[:]` slice dumped every
# record, contradicting the intent of a quick sanity check).
WEATHER_PREVIEW_ROWS = 5
for key, value in list(weather_data_dict.items())[:WEATHER_PREVIEW_ROWS]:
    print(f"Key: {key}")
    print(f"Value: {value}")
    print()

# Spot-check one known key/column, then report the size instead of every key.
print(weather_data_dict['2024010101']['TxSoil0cm'])
print(f"{len(weather_data_dict)} weather records indexed")

In [None]:
# File names of the additional training data; consumed by the merge cell below.
additional_data_dir = './36_TrainingData_Additional_V2'
list_of_files = os.listdir(additional_data_dir)
print(list_of_files)

In [None]:
import os
import pandas as pd

# Create the destination folder for the merged per-device files if needed.
output_folder = './complete_Training_Data/'
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"目標文件夾 {output_folder} 已創建")

# For each base training file, append every additional file whose name starts
# with the same device prefix (text before the first underscore), then save.
for doc_name in os.listdir('./TrainingData'):
    df = pd.read_csv(f'./TrainingData/{doc_name}')
    print("df", len(df))
    device_name = doc_name.partition('_')[0]

    same_device_files = [
        candidate for candidate in list_of_files
        if candidate.partition('_')[0] == device_name
    ]
    for add_doc_name in same_device_files:
        add_df = pd.read_csv(f'./36_TrainingData_Additional_V2/{add_doc_name}')
        print("add_df", len(add_df))
        df = pd.concat([df, add_df])
        print("df", len(df))

    save_name = f'{device_name}_complete_Train.csv'
    print("saving to:", save_name)
    df.to_csv(f'{output_folder}{save_name}', index=False)


###  觀察資料


In [None]:

# Load the merged training data of devices L1 through L17 and stack them.
dataframes = []
for device_number in range(1, 18):
    filename = f'./complete_Training_Data/L{device_number}_complete_Train.csv'
    df = pd.read_csv(filename, encoding='utf-8')
    print(len(df))
    dataframes.append(df)

# Single frame with a fresh 0..n-1 index.
SourceData = pd.concat(dataframes, ignore_index=True)
print(len(SourceData))

In [None]:
print(SourceData)
print(SourceData.columns)

# Parse the raw timestamp strings ("YYYY-MM-DD HH:MM:SS.ffffff").
parsed_datetime = pd.to_datetime(SourceData['DateTime'], format='%Y-%m-%d %H:%M:%S.%f')
SourceData['DateTime'] = parsed_datetime

# Hour-level key in YYYYMMDDHH form, matching the weather dictionary keys.
SourceData['FormattedDateTime'] = parsed_datetime.dt.strftime('%Y%m%d%H')

# Check the result.
print(SourceData[['DateTime', 'FormattedDateTime']])

In [None]:
# Attach the hourly weather-station readings to every sensor row.
#
# Rows are matched on the YYYYMMDDHH key in `FormattedDateTime`. Rows whose
# key is absent from `weather_data_dict` keep whatever value they already had
# (NaN when the column is new). This replaces a per-row loop that issued nine
# scalar `.loc` writes and two prints for every row.
print(len(SourceData))

WEATHER_COLUMNS = [
    'WSGust', 'WDGust', 'GloblRad', 'Precp',
    'TxSoil0cm', 'TxSoil10cm', 'TxSoil20cm', 'TxSoil50cm', 'TxSoil100cm',
]

# One row per weather key; selecting the columns raises KeyError early if the
# weather export lacks any of them.
weather_lookup = pd.DataFrame.from_dict(weather_data_dict, orient='index')[WEATHER_COLUMNS]

hour_keys = SourceData['FormattedDateTime'].astype(str)
matched = hour_keys.isin(weather_lookup.index)

for column in WEATHER_COLUMNS:
    mapped = hour_keys.map(weather_lookup[column])
    if column in SourceData.columns:
        # Do not overwrite existing values on rows without a weather match.
        mapped = mapped.where(matched, SourceData[column])
    SourceData[column] = mapped

# Report each missing key once instead of once per row.
for missing_key in sorted(set(hour_keys[~matched])):
    print(f'{missing_key} not found in weather_data_dict')

count = len(SourceData)
print(f'Processed {count} rows; {int(matched.sum())} matched a weather record')

# Save the final result.
SourceData.to_csv('full_information_sources_data.csv', index=False)


In [None]:
# Reload the weather-enriched sensor data written by the previous cell.
SourceData = pd.read_csv('full_information_sources_data.csv', encoding='utf-8')

In [None]:
# Quick look at the first rows of the reloaded data.
print(SourceData.head())

In [None]:
# # 顯示相關性排序(Linear)
# import pandas as pd

# # 計算相關係數
# corr_matrix = SourceData.corr()
# sunlight_corr = corr_matrix["Sunlight(Lux)"].sort_values(ascending=False)

# # 顯示相關性排序
# print(sunlight_corr)


### 接下來把 complete_Training_Data文件夾 輸入到微氣候數據處理程序 產出 CompleteAVG 和 CompleteIncompleteAVG 兩個文件夾

In [None]:
import pandas as pd

# Column names applied to every CompleteAVG file when reading it.
column_names = [
    "DateTime", "WindSpeed(m/s)", "Pressure(hpa)",
    "Temperature(°C)", "Humidity(%)", "Sunlight(Lux)", "Power(mW)",
    "WSGust", "WDGust", "GloblRad", "Precp",
    "TxSoil0cm", "TxSoil10cm", "TxSoil20cm", "TxSoil50cm", "TxSoil100cm"
]

# Read the averaged data of locations 01..17 (file names use two-digit codes).
dataframes = []
for i in range(1, 18):
    location_code = f'{i:02d}'
    filename = f'./CompleteAVG/AvgDATA_{location_code}.csv'

    df = pd.read_csv(filename, encoding='utf-8', names=column_names)
    print(len(df))
    dataframes.append(df)

# Stack all locations into one frame with a fresh index.
Avg_SourceData = pd.concat(dataframes, ignore_index=True)
print(len(Avg_SourceData))


In [None]:
# print(weather_donghua_df.head())

In [None]:
# Rebuild the weather lookup keyed by YYYYMMDDHH:
# {formatted_datetime: {column_name: value, ...}, ...}
weather_data_dict = weather_donghua_df.set_index('formatted_datetime').T.to_dict()

# Preview only the first few entries (the previous `[:]` slice dumped every
# record, contradicting the intent of a quick sanity check).
WEATHER_PREVIEW_ROWS = 5
for key, value in list(weather_data_dict.items())[:WEATHER_PREVIEW_ROWS]:
    print(f"Key: {key}")
    print(f"Value: {value}")
    print()

# Spot-check one known key/column, then report the size instead of every key.
print(weather_data_dict['2024010101']['TxSoil0cm'])
print(f"{len(weather_data_dict)} weather records indexed")

In [None]:
# Inspect the first rows and the column names of the averaged data.
print(Avg_SourceData.head())
print(Avg_SourceData.columns)

In [None]:
# Attach the hourly weather-station readings to every averaged sensor row.
#
# The first 10 characters of the DateTime code (YYYYMMDDHH) are the weather
# key. Rows without a matching weather record keep the values read from the
# file. This replaces a per-row loop that issued nine scalar `.loc` writes and
# several prints for every row.
WEATHER_COLUMNS = [
    'WSGust', 'WDGust', 'GloblRad', 'Precp',
    'TxSoil0cm', 'TxSoil10cm', 'TxSoil20cm', 'TxSoil50cm', 'TxSoil100cm',
]

# One row per weather key; selecting the columns raises KeyError early if the
# weather export lacks any of them.
weather_lookup = pd.DataFrame.from_dict(weather_data_dict, orient='index')[WEATHER_COLUMNS]

hour_keys = Avg_SourceData['DateTime'].astype(str).str[:10]
matched = hour_keys.isin(weather_lookup.index)

for column in WEATHER_COLUMNS:
    mapped = hour_keys.map(weather_lookup[column])
    if column in Avg_SourceData.columns:
        # Do not overwrite existing values on rows without a weather match.
        mapped = mapped.where(matched, Avg_SourceData[column])
    Avg_SourceData[column] = mapped

# Report each missing key once instead of once per row.
for missing_key in sorted(set(hour_keys[~matched])):
    print(f'{missing_key} not found in weather_data_dict')

count = len(Avg_SourceData)
print(f'Processed {count} rows; {int(matched.sum())} matched a weather record')

# Save the final result.
Avg_SourceData.to_csv('full_information_Avg_sources_data.csv', index=False)


In [None]:
# Reload the weather-enriched averaged data, forcing DateTime to an integer code.
Avg_SourceData = pd.read_csv('full_information_Avg_sources_data.csv', encoding='utf-8', dtype={"DateTime": int})
print(Avg_SourceData.head())

In [None]:
# Rank every column by its linear correlation with Sunlight(Lux).
import pandas as pd

# Pairwise correlation coefficients.
corr_matrix = Avg_SourceData.corr()
sunlight_corr = corr_matrix.loc[:, "Sunlight(Lux)"].sort_values(ascending=False)

# Show the ranking, strongest positive correlation first.
print(sunlight_corr)


In [None]:
# Count how many rows have a record for the same time slot and location on the
# previous calendar day.
#
# A DateTime code is read as YYYYMMDD + 6 trailing characters (the slot and
# location part). The previous day is computed with real date arithmetic:
# subtracting 1 from the day digits produced day "00" on the first day of a
# month, so those rows could never match. Membership is tested with `isin`
# rather than scanning a Python list once per row.
Time_list = Avg_SourceData['DateTime'].tolist()
print(len(Time_list))

code_text = Avg_SourceData['DateTime'].astype(str)
previous_day = (
    pd.to_datetime(code_text.str[:8], format='%Y%m%d') - pd.Timedelta(days=1)
).dt.strftime('%Y%m%d')
previous_codes = (previous_day + code_text.str[8:14]).astype('int64')

count = int(previous_codes.isin(Avg_SourceData['DateTime']).sum())
print(count)

In [None]:
# Build a lookup {DateTime code: {column: value}} from the averaged data.
result = Avg_SourceData.set_index("DateTime").T.to_dict()
print(result.keys())
# Spot-check one hard-coded key; raises KeyError if that code is not present.
print(result[20240101090001])
print(Avg_SourceData.head())

In [None]:
# Add "<column>_yes" features: the value of each column for the same time slot
# and location on the previous calendar day, or -999 when no such record exists.
#
# Changes versus the row-by-row version:
#   * the previous day is computed with date arithmetic (subtracting 1 from the
#     day digits produced day "00" on the first of a month, so those rows were
#     always marked -999);
#   * the lookup uses an indexed frame instead of `code in list` per row;
#   * each new column is assigned once instead of one scalar `.loc` per cell.
Time_list = Avg_SourceData['DateTime'].tolist()
print(len(Time_list))

YESTERDAY_SOURCE_COLUMNS = [
    'WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)',
    'Sunlight(Lux)', 'Power(mW)',
    'WSGust', 'WDGust', 'GloblRad', 'Precp',
    'TxSoil0cm', 'TxSoil10cm', 'TxSoil20cm', 'TxSoil50cm', 'TxSoil100cm',
]

# DateTime code layout used here: YYYYMMDD followed by 6 slot/location chars.
code_text = Avg_SourceData['DateTime'].astype(str)
previous_day = (
    pd.to_datetime(code_text.str[:8], format='%Y%m%d') - pd.Timedelta(days=1)
).dt.strftime('%Y%m%d')
previous_codes = (previous_day + code_text.str[8:14]).astype('int64')

# keep='last' mirrors a dict built from the frame, where later duplicates win.
yesterday_lookup = (
    Avg_SourceData
    .drop_duplicates('DateTime', keep='last')
    .set_index('DateTime')[YESTERDAY_SOURCE_COLUMNS]
)
found = previous_codes.isin(yesterday_lookup.index).to_numpy()
yesterday_values = yesterday_lookup.reindex(previous_codes.to_numpy())

for column in YESTERDAY_SOURCE_COLUMNS:
    Avg_SourceData[f'{column}_yes'] = np.where(
        found, yesterday_values[column].to_numpy(), -999
    )

count = int(found.sum())
print(f'{len(Avg_SourceData) - count} rows have no previous-day record (marked -999)')
print(count)

In [None]:
# Persist the averaged data including the previous-day ("_yes") columns.
print(Avg_SourceData.head())
Avg_SourceData.to_csv('full_information_Avg_sources_data_with_yes.csv', index=False)

In [None]:
# Reload the data with previous-day columns and keep only rows whose
# Precp_yes is not the -999 "missing" marker.
with_yes_csv = 'full_information_Avg_sources_data_with_yes.csv'
New_Avg_SourceData = pd.read_csv(with_yes_csv, encoding='utf-8', dtype={"DateTime": int})
print(len(New_Avg_SourceData))
has_previous_day = New_Avg_SourceData['Precp_yes'].ne(-999)
New_Avg_SourceData = New_Avg_SourceData[has_previous_day]
print(len(New_Avg_SourceData))

In [None]:
# Rank every column by its linear correlation with Sunlight(Lux).
import pandas as pd

# Pairwise correlation coefficients.
corr_matrix = New_Avg_SourceData.corr()
sunlight_corr = corr_matrix.loc[:, "Sunlight(Lux)"].sort_values(ascending=False)

# Show the ranking, strongest positive correlation first.
print(sunlight_corr)


In [None]:
# Feature matrix: everything except the key and the same-time sensor readings.
same_time_columns = [
    'DateTime', 'Power(mW)', 'Sunlight(Lux)', 'Temperature(°C)',
    'WindSpeed(m/s)', 'Pressure(hpa)', 'Humidity(%)',
]
X = New_Avg_SourceData.drop(columns=same_time_columns).values

# Target: Sunlight(Lux).
y = New_Avg_SourceData['Sunlight(Lux)'].values

In [None]:
# Reload the unfiltered data with previous-day columns and index it by
# DateTime code: {code: {column: value}}.
New_Avg_SourceData = pd.read_csv(
    'full_information_Avg_sources_data_with_yes.csv',
    encoding='utf-8',
    dtype={"DateTime": int},
)
indexed_by_code = New_Avg_SourceData.set_index("DateTime")
result = indexed_by_code.T.to_dict()
print(result.keys())

In [None]:
# Display the record stored under one hard-coded DateTime code (KeyError if absent).
result[20240102090001]

In [None]:
import os

# Load every CSV in ./CompleteIncompleteAVG and stack them into one frame.
# Frames are collected in a list and concatenated once; concatenating inside
# the loop re-copies all previously loaded rows on every iteration.
incomplete_dir = './CompleteIncompleteAVG'
incomplete_frames = []
for i in os.listdir(incomplete_dir):
    if i.endswith('.csv'):
        print(i)
        df = pd.read_csv(f'{incomplete_dir}/{i}', encoding='utf-8')
        print(len(df))
        incomplete_frames.append(df)

# Fall back to an empty frame when the folder holds no CSV files
# (pd.concat raises on an empty list).
if incomplete_frames:
    incomplete_data = pd.concat(incomplete_frames, ignore_index=True)
else:
    incomplete_data = pd.DataFrame()
print(len(incomplete_data))

In [None]:
# Quick look at the first rows of the incomplete-day data.
print(incomplete_data.head())

In [None]:
# Index the incomplete-day records by their "Serial" column: {serial: {column: value}}.
incomplete_data_dict = incomplete_data.set_index("Serial").T.to_dict()

In [None]:
# List every serial key available in the incomplete-day lookup.
print(incomplete_data_dict.keys())

In [None]:
# Add lagged sunlight features: Sunlight(Lux) of the same location 10, 20 and
# 30 minutes earlier (columns "_last_min", "_last_2min", "_last_3min").
#
# Each lag is looked up in `result` first, then in `incomplete_data_dict`, and
# is -999 when neither holds the record. Previously only the 10-minute lag
# used the `incomplete_data_dict` fallback, while the test-set feature builder
# also uses it for the 20-minute lag; the lookup is now the same for all lags
# so training and inference features are built the same way.

def _lagged_code(code_text, steps):
    """Return the integer code `steps` * 10 minutes before `code_text`.

    `code_text` is read as YYYYMMDD + HH + MM + 2-char location. Returns None
    when the lag would fall before 00:00 of the same day (the original string
    arithmetic produced a malformed code in that case).
    """
    total_minutes = int(code_text[8:10]) * 60 + int(code_text[10:12]) - 10 * steps
    if total_minutes < 0:
        return None
    lag_hour, lag_minute = divmod(total_minutes, 60)
    return int(f'{code_text[:8]}{lag_hour:02d}{lag_minute:02d}{code_text[12:14]}')


def _lagged_sunlight(code):
    """Look up Sunlight(Lux) for `code`; -999 marks a missing record."""
    if code is None:
        return -999
    if code in result:
        return result[code]['Sunlight(Lux)']
    if code in incomplete_data_dict:
        return incomplete_data_dict[code]['Sunlight(Lux)']
    return -999


_code_texts = New_Avg_SourceData['DateTime'].astype(str).str[:14]
for _steps, _column in (
    (1, 'Sunlight(Lux)_last_min'),
    (2, 'Sunlight(Lux)_last_2min'),
    (3, 'Sunlight(Lux)_last_3min'),
):
    # One list per column, assigned once (no per-row .loc writes or prints).
    New_Avg_SourceData[_column] = [
        _lagged_sunlight(_lagged_code(text, _steps)) for text in _code_texts
    ]
        
    


In [None]:
# Inspect the first rows after adding the lagged sunlight columns.
print(New_Avg_SourceData.head())

In [None]:
# Persist the data including previous-day and lagged-sunlight columns.
New_Avg_SourceData.to_csv('full_information_Avg_sources_data_with_yes_last_min_last2_min_3.csv', index=False)

In [None]:
# Reload the fully enriched data and drop rows lacking a 10-minute lag value
# (marked -999).
lagged_csv = 'full_information_Avg_sources_data_with_yes_last_min_last2_min_3.csv'
Final_Avg_SourceData = pd.read_csv(lagged_csv, encoding='utf-8', dtype={"DateTime": int})
print(len(Final_Avg_SourceData))
has_last_min = Final_Avg_SourceData['Sunlight(Lux)_last_min'].ne(-999)
Final_Avg_SourceData = Final_Avg_SourceData[has_last_min]
print(len(Final_Avg_SourceData))

# Downstream cells keep using the New_Avg_SourceData name.
New_Avg_SourceData = Final_Avg_SourceData

In [None]:
# Derive per-row location and time features from the DateTime code
# (read as YYYY MM DD HH MM + 2-digit location).
#
# The 17-branch if/elif chains are replaced by lookup tables and the columns
# are computed vectorially on a new frame, which also avoids per-row `.loc`
# writes on a filtered slice.

# Height value assigned to each location code.
HEIGHT_BY_LOCATION = {
    **dict.fromkeys([1, 2, 3, 4, 5, 6, 7, 13, 14], 5),
    **dict.fromkeys([8, 9], 3),
    **dict.fromkeys([10, 11, 12, 15, 16], 1),
    17: 2,
}
# Direction in degrees assigned to each location code.
DIRECTION_BY_LOCATION = {
    1: 181, 2: 175, 3: 180, 4: 161, 5: 208, 6: 208, 7: 172, 8: 219, 9: 151,
    10: 223, 11: 131, 12: 298, 13: 249, 14: 197, 15: 127, 16: 82, 17: 0,
}

_codes = New_Avg_SourceData['DateTime'].astype(str)
_location = _codes.str[12:14].astype(int)
_hour = _codes.str[8:10].astype(int)
_minute = _codes.str[10:12].astype(int)
_month = _codes.str[4:6].astype(int)

_unknown_locations = sorted(set(_location.unique()) - set(DIRECTION_BY_LOCATION))
if _unknown_locations:
    raise ValueError(f"Invalid location code: {_unknown_locations}")

_direction_rad = np.radians(_location.map(DIRECTION_BY_LOCATION))

New_Avg_SourceData = New_Avg_SourceData.assign(
    Location=_location,
    Height=_location.map(HEIGHT_BY_LOCATION),
    # Index of the 10-minute slot counted from 09:00.
    Time=(_hour - 9) * 6 + _minute / 10,
    Minute=_minute,
    Month=_month,
    # Direction encoded as a unit vector so 0° and 360° coincide.
    Direction_x=np.cos(_direction_rad),
    Direction_y=np.sin(_direction_rad),
)
    
    
    


In [None]:
# List the columns available after feature engineering.
print(New_Avg_SourceData.columns)

In [None]:
# Rank every column by its linear correlation with Sunlight(Lux).
import pandas as pd

# Pairwise correlation coefficients.
corr_matrix = New_Avg_SourceData.corr()
sunlight_corr = corr_matrix.loc[:, "Sunlight(Lux)"].sort_values(ascending=False)

# Show the ranking, strongest positive correlation first.
print(sunlight_corr)


In [None]:
# Inspect the derived Location column.
print(New_Avg_SourceData['Location'])

In [None]:
# Alias (not a copy) of the engineered frame, used below to build X and y.
New_Avg_SourceData_1 = New_Avg_SourceData

print(len(New_Avg_SourceData_1))


In [None]:
# Ordered feature list for the sunlight model. The order matters: the test-set
# feature builder appends values in exactly this sequence.
_weather_columns = [
    'WSGust', 'WDGust', 'GloblRad', 'Precp',
    'TxSoil0cm', 'TxSoil10cm', 'TxSoil20cm', 'TxSoil50cm', 'TxSoil100cm',
]
_sensor_columns = [
    'WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)',
    'Sunlight(Lux)', 'Power(mW)',
]
selected_clns = (
    _weather_columns
    # Previous-day values: sensor readings first, then weather readings.
    + [f'{name}_yes' for name in _sensor_columns + _weather_columns]
    + ['Sunlight(Lux)_last_min', 'Sunlight(Lux)_last_2min']
    + ['Time', 'Minute', 'Direction_x', 'Direction_y', 'Location', 'Height']
)
        

In [None]:
# Show the selected feature names.
print(selected_clns)

In [None]:
# Target and feature matrix for the sunlight model; columns follow selected_clns.
y = New_Avg_SourceData_1['Sunlight(Lux)'].values
X = New_Avg_SourceData_1[selected_clns].values
# Number of features, then number of samples.
print(len(X[0]))
print(len(X))


In [None]:
# Train the sunlight regressor (LightGBM) on min-max scaled features/target.
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.linear_model import Ridge

# Scale all features and the target to [0, 1]. Both scalers are fitted on the
# full data set before the split and are reused later at prediction time.
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))
# Split into training and validation sets. NOTE: test_size=0.0001 holds out
# only 0.01% of the rows, so the validation score below rests on very few samples.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.0001, random_state=42)

model_sunlight = lgb.LGBMRegressor(n_estimators=2500,reg_alpha = 0.8, random_state=42,objective='regression',metric='rmse')

# Alternative kept for reference: model_sunlight = Ridge(alpha=1.0)
model_sunlight.fit(X_train, y_train)

# R² on the training set.
r2_train = model_sunlight.score(X_train, y_train)
print(f"Training R squared: {r2_train}")

# R² on the validation set.
r2_val = model_sunlight.score(X_val, y_val)
print(f"Validation R squared: {r2_val}")


In [None]:
# Report validation errors of the sunlight model in original (unscaled) units.
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

y_pred = scaler_y.inverse_transform(model_sunlight.predict(X_val).reshape(-1, 1))
# Keep the unscaled targets under their own name: overwriting `y_val` made the
# cell non-idempotent (a second run inverse-transformed already-unscaled values).
y_val_unscaled = scaler_y.inverse_transform(y_val.reshape(-1, 1))

validation_mse = mean_squared_error(y_val_unscaled, y_pred)
print("mean absolute error:",mean_absolute_error(y_val_unscaled,y_pred))
print("mean squared error:",validation_mse)
print("root mean squared error:",np.sqrt(validation_mse))


In [None]:
# Reload the fully enriched data for the power model and keep only rows that
# have previous-day power (Power(mW)_yes is not the -999 marker).
power_source_csv = 'full_information_Avg_sources_data_with_yes_last_min_last2_min_3.csv'
New_Avg_SourceData = pd.read_csv(power_source_csv, encoding='utf-8', dtype={"DateTime": int})
print(len(New_Avg_SourceData))
has_previous_power = New_Avg_SourceData['Power(mW)_yes'].ne(-999)
New_Avg_SourceData = New_Avg_SourceData[has_previous_power]
print(len(New_Avg_SourceData))

In [None]:
# Power-model features: drop the key, the target, the same-time sensor readings
# other than sunlight, and the 30-minute lag.
power_excluded_columns = [
    'DateTime', 'Power(mW)', 'Temperature(°C)', 'WindSpeed(m/s)',
    'Pressure(hpa)', 'Humidity(%)', 'Sunlight(Lux)_last_3min',
]
X_f = New_Avg_SourceData.drop(columns=power_excluded_columns).values
# Target: Power(mW).
y_f = New_Avg_SourceData['Power(mW)'].values

In [None]:
# Number of power-model features, then their column names in order.
print(len(X_f[0]))
print(New_Avg_SourceData.drop(columns=['DateTime','Power(mW)','Temperature(°C)','WindSpeed(m/s)','Pressure(hpa)','Humidity(%)','Sunlight(Lux)_last_3min']).columns)

In [None]:
# Rank every column by its linear correlation with Power(mW).
corr_matrix = New_Avg_SourceData.corr()
power_corr = corr_matrix.loc[:, "Power(mW)"].sort_values(ascending=False)
print(power_corr)

In [None]:
# Create separate min-max scalers for the power-model features and target.
scaler_X_power = MinMaxScaler()
scaler_y_power = MinMaxScaler()

# Fit the scalers on the training data and scale it to [0, 1].
X_scaled = scaler_X_power.fit_transform(X_f)
y_scaled = scaler_y_power.fit_transform(y_f.reshape(-1, 1))


In [None]:
# Inspect the scaled power-model feature matrix.
print(X_scaled)

In [None]:
# Train the power regressor (LightGBM) on the scaled power features/target.
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# X_scaled / y_scaled come from the scaling cell above.

# Split into training and validation sets. NOTE: test_size=0.001 holds out
# only 0.1% of the rows, so the validation score rests on few samples.
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled ,test_size=0.001, random_state=42)

# Alternatives tried previously (kept for reference):
# model = DecisionTreeRegressor(random_state=42)
# model_power = RandomForestRegressor(n_estimators=100, random_state=42)
# model_power = XGBRegressor(
#     n_estimators=400,
#     random_state=42,
#     max_depth=7,
# )
# `lgb` is the lightgbm alias imported in the sunlight-model training cell.
model_power = lgb.LGBMRegressor(n_estimators=2500, random_state=42,objective='regression',metric='rmse')

model_power.fit(X_train, y_train)

# R² on the training set.
r2_train = model_power.score(X_train, y_train)
print(f"Training R squared: {r2_train}")

# R² on the validation set.
r2_val = model_power.score(X_val, y_val)
print(f"Validation R squared: {r2_val}")


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, r2_score
import numpy as np

# Scorer that evaluates each fold with R².
r2_scorer = make_scorer(r2_score)

# 5-fold cross-validation of the power model (change `cv` to use more folds).
cv_scores = cross_val_score(model_power, X_scaled, y_scaled, scoring=r2_scorer, cv=5)

# Per-fold scores followed by their mean.
mean_cv_score = np.mean(cv_scores)
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Mean R² score: {mean_cv_score:.4f}")


In [None]:
# Compare actual and predicted power on the validation set, in original units.
y_val_pred = model_power.predict(X_val)

print("\nSample of Actual vs Predicted values (Validation set):")
print(y_val_pred)

# Inverse-transform both vectors once instead of one element at a time.
actual_power = scaler_y_power.inverse_transform(np.asarray(y_val).reshape(-1, 1)).ravel()
predicted_power = scaler_y_power.inverse_transform(y_val_pred.reshape(-1, 1)).ravel()

for actual, predicted in zip(actual_power, predicted_power):
    print(f"Actual: {actual}, Predicted: {predicted}")

# Absolute error per validation row.
diff = np.abs(actual_power - predicted_power)

# Report the true mean absolute error. The previous `sum(diff)//len(diff)`
# floored the value and printed it as a nested 1x1 array.
if len(diff):
    print(f"Mean absolute error: {diff.mean()}")
else:
    print("Validation set is empty; no error to report")

In [None]:
# Reload the data with previous-day columns and keep only rows whose
# Precp_yes is not the -999 "missing" marker.
with_yes_csv = 'full_information_Avg_sources_data_with_yes.csv'
New_Avg_SourceData = pd.read_csv(with_yes_csv, encoding='utf-8', dtype={"DateTime": int})
print(len(New_Avg_SourceData))
has_previous_day = New_Avg_SourceData['Precp_yes'].ne(-999)
New_Avg_SourceData = New_Avg_SourceData[has_previous_day]
print(len(New_Avg_SourceData))

In [None]:
# Reload the data with previous-day columns WITHOUT filtering out the rows
# marked -999 (the filter is intentionally left disabled here).
with_yes_csv = 'full_information_Avg_sources_data_with_yes.csv'
New_Avg_SourceData = pd.read_csv(with_yes_csv, encoding='utf-8', dtype={"DateTime": int})
print(len(New_Avg_SourceData))
# New_Avg_SourceData = New_Avg_SourceData[New_Avg_SourceData['Precp_yes'] != -999]
# print(len(New_Avg_SourceData))

In [None]:
# New_Avg_SourceData.head()

In [None]:
# NOTE: rebinds X to the remaining column labels (an Index), not a feature matrix.
same_time_columns = [
    'DateTime', 'Power(mW)', 'Sunlight(Lux)', 'Temperature(°C)',
    'WindSpeed(m/s)', 'Pressure(hpa)', 'Humidity(%)',
]
X = New_Avg_SourceData.drop(columns=same_time_columns).columns
print(X)

In [None]:
# Reload the fully enriched data (previous-day and lagged-sunlight columns), unfiltered.
Final_Avg_SourceData = pd.read_csv('full_information_Avg_sources_data_with_yes_last_min_last2_min_3.csv', encoding='utf-8', dtype={"DateTime": int})


In [None]:
# Lookup {DateTime code: {column: value}} used by the test-set feature builder.
result = Final_Avg_SourceData.set_index("DateTime").T.to_dict()
len(result)

In [None]:
# Number and names of the sunlight-model features.
print(len(selected_clns))
print(selected_clns)

In [None]:
# Build the sunlight-model feature rows for every serial in the submission
# template. Feature order follows `selected_clns`.
#
# Fixes versus the previous version of this cell:
#   * removed `New_Avg_SourceData.loc[index, 'Location'] = ...`, a copy-paste
#     leftover that wrote into the training frame using a stale `index`
#     variable from an earlier cell;
#   * lagged sunlight now reads 'Sunlight(Lux)' of the lagged record. It used
#     to read that record's own '_last_min' / '_last_2min' column, i.e. a value
#     one or two steps older than the feature the model was trained on, and
#     disagreed with the `incomplete_data_dict` fallback in the same branch;
#   * the previous-day key uses date arithmetic (day - 1 gave day "00" on the
#     first of a month);
#   * removed a stray tuple-of-strings expression statement;
#   * rows are collected in a list and the DataFrame is built once.
DataName = os.getcwd()+r'/36_TestSet_SubmissionTemplate/upload(no answer).csv'

SourceData = pd.read_csv(DataName, encoding='utf-8')
target = ['序號']

EXquestion = SourceData[target].values

inputs = []  # reference data
PredictOutput = []  # predicted weather parameters
PredictPower = []  # predicted power output
Serial_list = []  # serial numbers

count = 0
data = []
sunlight_tmp_dict = {}
columns = selected_clns

_TEST_WEATHER_KEYS = [
    'WSGust', 'WDGust', 'GloblRad', 'Precp',
    'TxSoil0cm', 'TxSoil10cm', 'TxSoil20cm', 'TxSoil50cm', 'TxSoil100cm',
]
_TEST_SENSOR_KEYS = [
    'WindSpeed(m/s)', 'Pressure(hpa)', 'Temperature(°C)', 'Humidity(%)',
    'Sunlight(Lux)', 'Power(mW)',
]
# Height value and direction (degrees) assigned to each location code.
_TEST_HEIGHT_BY_LOCATION = {
    **dict.fromkeys([1, 2, 3, 4, 5, 6, 7, 13, 14], 5),
    **dict.fromkeys([8, 9], 3),
    **dict.fromkeys([10, 11, 12, 15, 16], 1),
    17: 2,
}
_TEST_DIRECTION_BY_LOCATION = {
    1: 181, 2: 175, 3: 180, 4: 161, 5: 208, 6: 208, 7: 172, 8: 219, 9: 151,
    10: 223, 11: 131, 12: 298, 13: 249, 14: 197, 15: 127, 16: 82, 17: 0,
}


def _previous_day_code(code_text):
    """Return the integer code of the same slot/location one calendar day earlier."""
    previous_day = pd.to_datetime(code_text[:8], format='%Y%m%d') - pd.Timedelta(days=1)
    return int(previous_day.strftime('%Y%m%d') + code_text[8:14])


def _question_lag_code(code_text, steps):
    """Return the integer code `steps` * 10 minutes earlier, or None before 00:00."""
    total_minutes = int(code_text[8:10]) * 60 + int(code_text[10:12]) - 10 * steps
    if total_minutes < 0:
        return None
    lag_hour, lag_minute = divmod(total_minutes, 60)
    return int(f'{code_text[:8]}{lag_hour:02d}{lag_minute:02d}{code_text[12:14]}')


def _question_lag_sunlight(code):
    """Sunlight(Lux) for `code` from `result`, else `incomplete_data_dict`, else -999."""
    if code is None:
        return -999
    if code in result:
        return result[code]['Sunlight(Lux)']
    if code in incomplete_data_dict:
        return incomplete_data_dict[code]['Sunlight(Lux)']
    return -999


feature_rows = []
for qs in EXquestion:
    # Serial layout used here: YYYYMMDD + HH + MM + 2-digit location.
    code = str(qs[0])[:14]

    # 1) Same-hour weather-station readings (KeyError if the hour is missing).
    weather_now = weather_data_dict[code[:10]]
    data = [weather_now[name] for name in _TEST_WEATHER_KEYS]

    # 2) Previous-day readings for the same slot and location ("_yes" features).
    new_code = _previous_day_code(code)
    if new_code not in result:
        raise Exception(f'{new_code} not found in result_dict')
    yesterday_record = result[new_code]
    data.extend(yesterday_record[name] for name in _TEST_SENSOR_KEYS + _TEST_WEATHER_KEYS)

    # 3) Sunlight 10 and 20 minutes earlier (-999 when unavailable; the
    #    autoregressive prediction cell overwrites these with predictions).
    data.append(_question_lag_sunlight(_question_lag_code(code, 1)))
    data.append(_question_lag_sunlight(_question_lag_code(code, 2)))

    # 4) Time and location features.
    location_code = code[12:14]
    int_location_code = int(location_code)
    if int_location_code not in _TEST_DIRECTION_BY_LOCATION:
        raise ValueError(f"Invalid location code: {location_code}")
    height = _TEST_HEIGHT_BY_LOCATION[int_location_code]
    direction = _TEST_DIRECTION_BY_LOCATION[int_location_code]
    hour = int(code[8:10])
    minute = int(code[10:12])

    # Order: 'Time', 'Minute', 'Direction_x', 'Direction_y', 'Location', 'Height'
    data.append((hour - 9) * 6 + minute / 10)
    data.append(minute)
    data.append(np.cos(np.radians(direction)))
    data.append(np.sin(np.radians(direction)))
    data.append(int_location_code)
    data.append(height)

    feature_rows.append(data)

# One row per question, columns in `selected_clns` order.
result_df = pd.DataFrame(feature_rows, columns=columns)
print(result_df.shape)
        
        
    
    
    
    
    
    
  




In [None]:
# Number of values in the first feature row (raises IndexError if result_df is empty).
len(result_df.iloc[0])

In [None]:
# Dump the raw feature matrix built for the test set.
# print(result_df.head(100))
print(result_df.values)

In [None]:
import numpy as np

# Recursive sunlight prediction: rows are predicted one at a time, and each
# prediction is fed back into the lag slots of the following rows.
# result_df is the input feature frame (one row per timestamp to predict).
X_sunlight = result_df.values
y_sunlight_pred_list = []

# Rows are processed in sequences of 48. The first row of a sequence keeps the
# lag values already present in the data; the second row has one earlier
# prediction available, later rows have two.
ROWS_PER_SEQUENCE = 48

# Positions of the two feedback slots, counted from the end of a feature row.
# NOTE(review): presumably the lag-1 / lag-2 sunlight features — confirm
# against the column order used when the model was trained.
n_features = X_sunlight.shape[1]
lag1_col = n_features - 8
lag2_col = n_features - 7

for i in range(len(X_sunlight)):
    # `features` is a view into X_sunlight, so the writes below also update
    # X_sunlight in place (unchanged from the original behavior).
    # NOTE(review): if result_df.values returned a view, result_df is updated
    # too, and the power-prediction cell may rely on that — confirm.
    features = X_sunlight[i]
    step_in_sequence = i % ROWS_PER_SEQUENCE

    if step_in_sequence >= 1:
        features[lag1_col] = y_sunlight_pred_list[-1]
    if step_in_sequence >= 2:
        features[lag2_col] = y_sunlight_pred_list[-2]

    # Scale the single row with the scaler fitted at training time.
    X_scaled = scaler_X.transform(np.array(features).reshape(1, -1))

    # Predict in scaled space, then map back to the original units.
    y_pred_scaled = model_sunlight.predict(X_scaled)
    y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))

    y_sunlight_pred_list.append(y_pred[0][0])

# Collect all predictions into a NumPy array for the following cells.
y_sunlight_pred = np.array(y_sunlight_pred_list)
print(y_sunlight_pred)


In [None]:
# Sanity check: there should be one sunlight prediction per row of result_df.
print(len(y_sunlight_pred))

In [None]:
print(result_df.columns)
# Put the predicted sunlight in the first column. DataFrame.insert raises
# ValueError when the column already exists, so on a re-run of this cell the
# existing column is overwritten instead of being inserted a second time.
if 'Sunlight(Lux)' in result_df.columns:
    result_df['Sunlight(Lux)'] = y_sunlight_pred
else:
    result_df.insert(0, 'Sunlight(Lux)', y_sunlight_pred)
print(result_df.columns)
print(result_df.head())


In [None]:
# Inspect the first 50 rows of result_df.
print(result_df.head(50))

In [None]:
# Columns that are not fed to the power model; every remaining column of
# result_df is used as a power feature.
non_power_columns = ['Time','Minute','Direction_x','Direction_y','Location','Height']
X_power = result_df.drop(columns=non_power_columns).values

# Scale the features to [0, 1] with a MinMaxScaler.
# NOTE(review): the scaler is fitted on the inference data here rather than
# reusing the one fitted during training — confirm this is intended, since it
# can shift the inputs relative to what model_power was trained on.
scaler_X_power = MinMaxScaler()
X_power_scaled = scaler_X_power.fit_transform(X_power)

# Predict power in scaled space and map it back to the original units.
y_power_pred_scaled = model_power.predict(X_power_scaled)
y_power_pred = scaler_y_power.inverse_transform(y_power_pred_scaled.reshape(-1, 1))

# Negative predictions are replaced by 0.
y_power_pred[y_power_pred < 0] = 0

# Attach the predictions to the result frame.
result_df['Predicted_Power'] = y_power_pred


In [None]:
# Show the first 50 rows (still unrounded), then round the predicted power
# to two decimal places.
print(result_df.head(50))
result_df['Predicted_Power'] = result_df['Predicted_Power'].apply(lambda x: round(x, 2))

In [None]:
# Inspect the first 50 rows again, now with the rounded predictions.
print(result_df.head(50))

In [None]:
# Print the question entries and the predicted power values for a visual comparison.
print(EXquestion)
print(result_df['Predicted_Power'].values)

In [None]:
# diff = 0
# for i in range(len(SourceData)):
#     # print(EXquestion[i][0], result_df['Predicted_Power'].values[i])
#     # print(SourceData.iloc[i]['DateTime'], SourceData.iloc[i]['Power(mW)'])
#     print(SourceData.iloc[i]['Power(mW)'],result_df['Predicted_Power'].values[i])
#     diff += abs(SourceData.iloc[i]['Power(mW)'] - result_df['Predicted_Power'].values[i])
    
# print(diff/len(SourceData))


In [None]:
# Write the full result frame (features and predictions) to CSV without the index.
# NOTE(review): the file name is spelled 'ouput'; left unchanged in case other
# code reads this path.
result_df.to_csv('data_ouput_check.csv', index=False)

In [None]:
# The first element of every question entry is its serial number.
Serial_list = [qs[0] for qs in EXquestion]
print(Serial_list)

# Pair each serial number with its predicted power and write the frame to CSV.
upload_columns = {
    '序號': Serial_list,
    '答案': result_df['Predicted_Power'].values,
}
df = pd.DataFrame(upload_columns)
df.to_csv('upload_check.csv', index=False)
print('Output CSV File Saved')