In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from datetime import datetime

from scipy.spatial import cKDTree
from scipy.optimize import curve_fit
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings

warnings.filterwarnings("ignore")

In [4]:
# Load the four input tables: training frame, drifter release points,
# and the 2016 water / wind velocity fields.
data, target, water, wind = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/total_add_velocity_weighted_average.csv",
        "data/target.csv",
        "data/water_velocity2016.csv",
        "data/wind_velocity2016.csv",
    )
)

# Build the prediction model

In [8]:
# Independent copy of columns 4..9 of the training table (velocity features and displacement targets).
reg = data.iloc[:, 4:10].copy()

In [29]:
# Display the modelling frame to check the selected columns.
reg

Unnamed: 0,water_u,water_v,wind_u,wind_v,d_long,d_lati
0,0.089673,0.041876,4.087482,-1.265760,0.090988,0.070000
1,0.095552,0.053962,3.988419,-1.597433,0.112015,0.041000
2,0.113870,0.135307,1.290056,0.333414,0.009995,-0.028999
3,0.259294,0.156671,5.640256,8.854036,0.102005,0.050999
4,0.368874,0.064946,13.505250,7.603525,0.086990,-0.024002
...,...,...,...,...,...,...
15809,0.076116,-0.035941,3.090491,1.099326,0.029999,-0.000999
15810,0.075121,-0.036927,1.551420,2.108921,-0.041992,-0.029999
15811,0.081807,-0.019462,1.289979,4.271356,-0.057007,0.008999
15812,0.081604,-0.005729,0.537444,4.900105,0.072998,0.054001


In [9]:
# Model inputs: water and wind velocity components.
features = ["water_u", "water_v", "wind_u", "wind_v"]
# Regression targets: displacement in longitude and in latitude.
target_d_long, target_d_lati = "d_long", "d_lati"

In [10]:
# One shared 80/20 split so both targets are evaluated on the same held-out rows.
(
    X_train,
    X_test,
    y_train_long,
    y_test_long,
    y_train_lati,
    y_test_lati,
) = train_test_split(
    reg[features],
    reg[target_d_long],
    reg[target_d_lati],
    test_size=0.2,
    random_state=42,
)

In [11]:
# Two independent regressors with default hyper-parameters, one per displacement component.
model_d_long, model_d_lati = lgb.LGBMRegressor(), lgb.LGBMRegressor()

In [12]:
# Fit each regressor on the shared training features against its own displacement target.
model_d_long.fit(X_train, y_train_long)
model_d_lati.fit(X_train, y_train_lati)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000261 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 12651, number of used features: 4
[LightGBM] [Info] Start training from score 0.015105
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1020
[LightGBM] [Info] Number of data points in the train set: 12651, number of used features: 4
[LightGBM] [Info] Start training from score 0.006790


In [13]:
# Held-out predictions for both displacement components.
predictions_d_long, predictions_d_lati = (
    fitted_model.predict(X_test) for fitted_model in (model_d_long, model_d_lati)
)

In [14]:
# Root-mean-squared error on the held-out rows, one value per displacement component.
mse_d_long = mean_squared_error(y_test_long, predictions_d_long)
mse_d_lati = mean_squared_error(y_test_lati, predictions_d_lati)
rmse_d_long = math.sqrt(mse_d_long)
rmse_d_lati = math.sqrt(mse_d_lati)

print(f'RMSE for d_long: {rmse_d_long}')
print(f'RMSE for d_lati: {rmse_d_lati}')

RMSE for d_long: 0.043965370044968005
RMSE for d_lati: 0.03455150916900853


# Preprocessing target data

In [15]:
# Display the drifter release records loaded from data/target.csv.
target

Unnamed: 0,year,month,day,hour,minute,longitude_degree,longitude_minute,latitude_degree,latitude_minute,drifter,time,longitude,latitude
0,2016,3,13,11,49,127,4.31,32,30.442,1,71.5,127.071833,32.507367
1,2016,3,15,20,15,126,36.949,33,45.559,2,74.0,126.615817,33.759317
2,2016,3,19,12,1,129,16.916,34,56.978,3,77.75,129.281933,34.949633
3,2016,3,19,12,38,129,12.744,34,59.2,4,77.75,129.2124,34.986667
4,2016,7,15,15,40,127,4.468,32,30.976,5,195.75,127.074467,32.516267
5,2016,7,18,12,48,129,17.834,34,55.305,6,198.75,129.297233,34.92175
6,2016,7,20,1,49,129,23.897,37,33.155,7,200.25,129.398283,37.552583


In [16]:
# Derive, for every release record in `target`:
#   drifter   - 1-based id in row order
#   time      - fractional day index relative to 2016-01-01, on a 0.25-day grid
#   longitude - decimal degrees (degrees + minutes / 60)
#   latitude  - decimal degrees (degrees + minutes / 60)
# Columns are addressed by name and computed column-wise, so the result does not
# depend on column order or on the number of rows in `target`.
start_date = datetime(2016, 1, 1)

# Whole days elapsed between the reference date and each observation date.
obs_date = pd.to_datetime(target[["year", "month", "day"]])
elapsed_days = (obs_date - start_date).dt.days

# Hour -> quarter-day slot: hours 0-5 -> 0.25, 6-11 -> 0.5, 12-17 -> 0.75, 18-23 -> 1.0.
quarter_day = (target["hour"] // 6 + 1) * 0.25

target["drifter"] = range(1, len(target) + 1)
# Same arithmetic as before: (days + slot) - 1.
target["time"] = elapsed_days + quarter_day - 1
target["longitude"] = target["longitude_degree"] + target["longitude_minute"] / 60
target["latitude"] = target["latitude_degree"] + target["latitude_minute"] / 60
target

Unnamed: 0,year,month,day,hour,minute,longitude_degree,longitude_minute,latitude_degree,latitude_minute,drifter,time,longitude,latitude
0,2016,3,13,11,49,127,4.31,32,30.442,1,71.5,127.071833,32.507367
1,2016,3,15,20,15,126,36.949,33,45.559,2,74.0,126.615817,33.759317
2,2016,3,19,12,1,129,16.916,34,56.978,3,77.75,129.281933,34.949633
3,2016,3,19,12,38,129,12.744,34,59.2,4,77.75,129.2124,34.986667
4,2016,7,15,15,40,127,4.468,32,30.976,5,195.75,127.074467,32.516267
5,2016,7,18,12,48,129,17.834,34,55.305,6,198.75,129.297233,34.92175
6,2016,7,20,1,49,129,23.897,37,33.155,7,200.25,129.398283,37.552583


In [17]:
# Keep only the drifter id, time stamp and decimal position. Selecting by name does not
# depend on column order, and the explicit copy makes `main` an independent frame so the
# columns added to it later are not written into a slice of `target`.
main = target[["drifter", "time", "longitude", "latitude"]].copy()
main

Unnamed: 0,drifter,time,longitude,latitude
0,1,71.5,127.071833,32.507367
1,2,74.0,126.615817,33.759317
2,3,77.75,129.281933,34.949633
3,4,77.75,129.2124,34.986667
4,5,195.75,127.074467,32.516267
5,6,198.75,129.297233,34.92175
6,7,200.25,129.398283,37.552583


In [18]:
# Replace every missing value in both velocity tables with 0.
water, wind = water.fillna(0), wind.fillna(0)

In [19]:
# Add zero-filled placeholder columns for the features and displacements, in this order.
for placeholder_col in ("water_u", "water_v", "wind_u", "wind_v", "d_long", "d_lati"):
    main[placeholder_col] = 0

In [20]:
# Unique (longitude, latitude) vertices of the water table, in order of first appearance.
grid = water[['longitude', 'latitude']].drop_duplicates().values

# Spatial index over those vertices for nearest-neighbour queries.
kdtree = cKDTree(grid)


def find_closest_vertices(row):
    """Return the indices (rows of the array `kdtree` was built from) of the 4 vertices nearest to `row`.

    `row` must provide 'longitude' and 'latitude'. The tree is read from the
    module-level name `kdtree` at call time.
    """
    query_point = np.array([row['longitude'], row['latitude']])
    nearest = kdtree.query(query_point, k=4)[1]
    return nearest


# Store, for every drifter, the 4 nearest water-grid vertex indices.
main['closest_vertices'] = main.apply(find_closest_vertices, axis=1)

In [21]:
# Interpolate the water velocity at each drifter's starting point from its 4 nearest
# water-grid vertices. Columns of `main` are read by name so the cell gives the same
# result no matter which helper columns have been appended to `main` since.
u = []
v = []
for i in range(len(main)):
    origin = main.iloc[i]
    # Water rows recorded at this drifter's time stamp.
    # NOTE(review): vertex indices come from the de-duplicated grid and are used as row
    # positions in this slice; this assumes every time slice lists the grid in the same order.
    x = water.loc[water["time"] == origin["time"]]
    w_u = []
    w_v = []
    w_d = []
    for j in origin["closest_vertices"]:
        # Columns 1 and 2 of `water` are used as the vertex position; the last two as (u, v).
        d = ((origin["longitude"] - x.iloc[j, 1])**2 + (origin["latitude"] - x.iloc[j, 2])**2) ** 0.5
        w_u.append(x.iloc[j, -2])
        w_v.append(x.iloc[j, -1])
        w_d.append(d)

    # Weighted average.
    # NOTE(review): the weight is the distance itself, so farther vertices count more.
    # Inverse-distance weighting would use 1/d. Left unchanged to stay consistent with
    # however the training features were produced; confirm which is intended.
    total_weight = sum(w_d)
    weighted_u = sum(u_k * d_k for u_k, d_k in zip(w_u, w_d)) / total_weight
    weighted_v = sum(v_k * d_k for v_k, d_k in zip(w_v, w_d)) / total_weight

    u.append(weighted_u)
    v.append(weighted_v)

main["water_u"] = u
main["water_v"] = v

In [22]:
# Unique (longitude, latitude) vertices of the wind table, in order of first appearance.
grid2 = wind[['longitude', 'latitude']].drop_duplicates().values

# Spatial index over the wind vertices; this rebinds the module-level `kdtree`.
kdtree = cKDTree(grid2)


def find_closest_vertices(row):
    """Return the indices (rows of the array `kdtree` was built from) of the 4 vertices nearest to `row`.

    `row` must provide 'longitude' and 'latitude'. The tree is read from the
    module-level name `kdtree` at call time.
    """
    query_point = np.array([row['longitude'], row['latitude']])
    nearest = kdtree.query(query_point, k=4)[1]
    return nearest


# Store, for every drifter, the 4 nearest wind-grid vertex indices.
main['closest_vertices2'] = main.apply(find_closest_vertices, axis=1)

In [23]:
# Interpolate the wind velocity at each drifter's starting point from its 4 nearest
# wind-grid vertices. Columns of `main` are read by name so the cell gives the same
# result no matter which helper columns have been appended to `main` since.
u = []
v = []
for i in range(len(main)):
    origin = main.iloc[i]
    # Wind rows recorded at this drifter's time stamp.
    # NOTE(review): vertex indices come from the de-duplicated grid and are used as row
    # positions in this slice; this assumes every time slice lists the grid in the same order.
    x = wind.loc[wind["time"] == origin["time"]]
    w_u = []
    w_v = []
    w_d = []
    for j in origin["closest_vertices2"]:
        # Columns 1 and 2 of `wind` are used as the vertex position; the last two as (u, v).
        d = ((origin["longitude"] - x.iloc[j, 1])**2 + (origin["latitude"] - x.iloc[j, 2])**2) ** 0.5
        w_u.append(x.iloc[j, -2])
        w_v.append(x.iloc[j, -1])
        w_d.append(d)

    # Weighted average.
    # NOTE(review): the weight is the distance itself, so farther vertices count more.
    # Inverse-distance weighting would use 1/d. Left unchanged to stay consistent with
    # however the training features were produced; confirm which is intended.
    total_weight = sum(w_d)
    weighted_u = sum(u_k * d_k for u_k, d_k in zip(w_u, w_d)) / total_weight
    weighted_v = sum(v_k * d_k for v_k, d_k in zip(w_v, w_d)) / total_weight

    u.append(weighted_u)
    v.append(weighted_v)

main["wind_u"] = u
main["wind_v"] = v

In [20]:
# Drop the two nearest-vertex helper columns by name. Unlike a positional `[:, :-2]`
# slice, re-running this cell cannot strip further columns.
main = main.drop(columns=["closest_vertices", "closest_vertices2"], errors="ignore")
main

Unnamed: 0,drifter,time,longitude,latitude,water_u,water_v,wind_u,wind_v,d_long,d_lati
0,1,71.5,127.071833,32.507367,-0.011915,0.225438,-3.647036,0.083368,0,0
1,2,74.0,126.615817,33.759317,0.009959,0.054703,-1.908922,-4.001212,0,0
2,3,77.75,129.281933,34.949633,0.104336,0.17252,2.419362,-3.2501,0,0
3,4,77.75,129.2124,34.986667,0.119752,0.199445,2.470042,-3.347662,0,0
4,5,195.75,127.074467,32.516267,-0.016639,0.14389,-0.781448,-1.16722,0,0
5,6,198.75,129.297233,34.92175,0.168963,0.236222,3.850836,1.386833,0,0
6,7,200.25,129.398283,37.552583,0.049902,0.074426,-4.100182,2.380681,0,0


In [24]:
# Expand every drifter into a forecast track of 400 rows: 100 days at 4 steps per day,
# i.e. one row every 0.25 day. Only the first row of each track keeps the observed
# position and velocities; the rest are zeroed and filled in by the simulation.
n_steps = 100 * 4
time_step = 0.25
unknown_cols = ['longitude', 'latitude', 'water_u', 'water_v',
                'wind_u', 'wind_v', 'd_long', 'd_lati']

# Collect the rows first and build the frame once; concatenating inside the loop
# re-copies the growing frame on every iteration.
track_rows = []
for i in range(len(main)):
    for j in range(n_steps):
        row = main.iloc[i].copy()
        row['time'] += j * time_step

        if j > 0:
            row[unknown_cols] = 0

        track_rows.append(row)

new_main = pd.DataFrame(track_rows).reset_index(drop=True)

In [25]:
# One freshly indexed frame per drifter id 1..7.
(
    drifter_1,
    drifter_2,
    drifter_3,
    drifter_4,
    drifter_5,
    drifter_6,
    drifter_7,
) = (
    new_main.loc[new_main["drifter"] == drifter_id].reset_index(drop=True)
    for drifter_id in range(1, 8)
)

In [26]:
# Display the expanded track of the first drifter.
drifter_1

Unnamed: 0,drifter,time,longitude,latitude,water_u,water_v,wind_u,wind_v,d_long,d_lati,closest_vertices,closest_vertices2
0,1,71.5,127.071833,32.507367,-0.011915,0.225438,-3.647036,0.083368,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
1,1,71.75,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
2,1,72.0,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
3,1,72.25,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
4,1,72.5,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
...,...,...,...,...,...,...,...,...,...,...,...,...
395,1,170.25,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
396,1,170.5,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
397,1,170.75,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"
398,1,171.0,0,0,0,0,0,0,0,0,"[933, 841, 932, 840]","[7741, 7649, 7740, 7648]"


In [27]:
# Tracks to simulate, in drifter-id order.
drifter_lst = [
    drifter_1,
    drifter_2,
    drifter_3,
    drifter_4,
    drifter_5,
    drifter_6,
    drifter_7,
]
# Empty accumulator for the simulated tracks.
result = pd.DataFrame()

In [28]:
# Step every drifter forward in time: predict the displacement from the current
# velocities, move the drifter, then interpolate water AND wind at the new position and
# time so that the next prediction sees all four features.
#
# Changes from the previous version of this cell:
#   * wind is interpolated inside the stepping loop (it used to be filled in only after
#     every prediction had been made, so all steps but the first were predicted with wind = 0);
#   * interpolation uses the position and time stamp of the row being filled, matching
#     how the starting row was computed;
#   * cells are written with `.at[row, column_name]` instead of chained positional
#     assignment, and predictions are stored as floats rather than length-1 arrays;
#   * KD-trees and per-time velocity slices are built once, not once per step;
#   * the module-level names `data` and `features` are no longer overwritten.

feature_cols = ["water_u", "water_v", "wind_u", "wind_v"]  # order used to train the models
numeric_cols = ["time", "longitude", "latitude", *feature_cols, "d_long", "d_lati"]


def _build_tree(field):
    """Return a KD-tree over the unique (longitude, latitude) vertices of `field`."""
    return cKDTree(field[["longitude", "latitude"]].drop_duplicates().values)


def _rows_by_time(field, times):
    """Map each time stamp in `times` that occurs in `field` to the rows recorded at it."""
    wanted = field[field["time"].isin(times)]
    return {stamp: rows for stamp, rows in wanted.groupby("time")}


def _interpolate_velocity(rows_by_time, tree, stamp, lon, lat, k=4):
    """Return the weighted (u, v) at (lon, lat) from the k nearest vertices at time `stamp`.

    Raises:
        ValueError: if the field has no rows for `stamp`.
    """
    if stamp not in rows_by_time:
        raise ValueError(f"no velocity data for time {stamp}")
    rows = rows_by_time[stamp]

    # NOTE(review): vertex indices come from the de-duplicated grid and are used as row
    # positions in the time slice; this assumes every slice lists the grid in the same order.
    _, nearest = tree.query(np.array([lon, lat]), k=k)

    u_vals, v_vals, weights = [], [], []
    for j in nearest:
        # Columns 1 and 2 of the field are used as the vertex position; the last two as (u, v).
        dist = ((lon - rows.iloc[j, 1]) ** 2 + (lat - rows.iloc[j, 2]) ** 2) ** 0.5
        u_vals.append(rows.iloc[j, -2])
        v_vals.append(rows.iloc[j, -1])
        weights.append(dist)

    # NOTE(review): weights are proportional to distance (not inverse distance), kept
    # as in the cells that computed the starting-row velocities; confirm the intent.
    total_weight = sum(weights)
    weighted_u = sum(u_k * w_k for u_k, w_k in zip(u_vals, weights)) / total_weight
    weighted_v = sum(v_k * w_k for v_k, w_k in zip(v_vals, weights)) / total_weight
    return weighted_u, weighted_v


# Loop-invariant lookups, built once for all drifters.
water_tree = _build_tree(water)
wind_tree = _build_tree(wind)
needed_times = sorted({float(stamp) for df in drifter_lst for stamp in df["time"]})
water_by_time = _rows_by_time(water, needed_times)
wind_by_time = _rows_by_time(wind, needed_times)

forecasts = []
for df in drifter_lst:
    # Work on a float-typed copy so the input frame is not mutated and float
    # predictions are not written into integer or object columns.
    track = df.reset_index(drop=True).astype({col: float for col in numeric_cols})

    for i in range(len(track) - 1):
        # Predict this step's displacement from the current velocities.
        model_input = track.loc[i, feature_cols].to_numpy(dtype=float).reshape(1, -1)
        step_d_long = float(model_d_long.predict(model_input)[0])
        step_d_lati = float(model_d_lati.predict(model_input)[0])
        track.at[i, "d_long"] = step_d_long
        track.at[i, "d_lati"] = step_d_lati

        # Move the drifter: the next position is the current one plus the displacement.
        next_lon = track.at[i, "longitude"] + step_d_long
        next_lat = track.at[i, "latitude"] + step_d_lati
        next_time = track.at[i + 1, "time"]
        track.at[i + 1, "longitude"] = next_lon
        track.at[i + 1, "latitude"] = next_lat

        # Velocities at the new position and time feed the next prediction.
        next_water_u, next_water_v = _interpolate_velocity(
            water_by_time, water_tree, next_time, next_lon, next_lat)
        next_wind_u, next_wind_v = _interpolate_velocity(
            wind_by_time, wind_tree, next_time, next_lon, next_lat)
        track.at[i + 1, "water_u"] = next_water_u
        track.at[i + 1, "water_v"] = next_water_v
        track.at[i + 1, "wind_u"] = next_wind_u
        track.at[i + 1, "wind_v"] = next_wind_v

    forecasts.append(track)

# Assemble once, so re-running this cell does not append duplicate tracks to `result`.
result = pd.concat(forecasts, ignore_index=True) if forecasts else pd.DataFrame()

KeyboardInterrupt: 

In [None]:
# Display the concatenated simulated tracks of all drifters.
result

In [None]:
# Summary statistics of the simulated tracks, as a quick sanity check.
result.describe()

In [None]:
# Write the simulated tracks to CSV; index=False leaves the row index out of the file.
result.to_csv("data/result_100days.csv", index = False)