In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw track table.
# pandas' default missing-value markers include the literal string 'NA'.
# In IBTrACS that string is a real code (the North Atlantic basin, and the
# "no sub-basin" marker), so with the defaults those cells become NaN: the
# one-hot encoding later silently has no column for them and the final
# dropna() discards every row whose SUBBASIN is 'NA'.
# The marker list below is pandas' documented default set minus 'NA'.
df = pd.read_csv(
  '../data/ibtracs.csv',
  keep_default_na=False,
  na_values=[
    '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
    '1.#IND', '1.#QNAN', '<NA>', 'N/A', 'NULL', 'NaN', 'None', 'n/a',
    'nan', 'null',
  ],
)

# DATA PREPROCESSING

In [3]:
# Columns retained from the raw table; later cells work on this subset only.
KEYS = [
  # storm identity, classification and timestamp
  'STORM ID',
  'SEASON',
  'BASIN',
  'SUBBASIN',
  'ISO TIME',
  'NATURE',
  # position
  'LAT',
  'LON',
  # relation to land
  'DIST2LAND',
  'LANDFALL',
  # motion and category
  'STORM DIR',
  'STORM SPEED',
  'USA SSHS',
]

In [4]:
# Keep only the records that report both a heading and a forward speed,
# normalise the padded timestamp header, and narrow the frame to KEYS.
mask = df[['STORM DIR', 'STORM SPEED']].notna().all(axis=1)
df = (
  df.rename(columns={'ISO_TIME_________': 'ISO TIME'})
    .loc[mask, KEYS]
    .copy()
)

## Features thời gian

In [5]:
# Some 'ISO TIME' cells carry only a time of day. Rebuild a full timestamp
# for those by borrowing the most recent YYYY-MM-DD seen above them.
# NOTE(review): the forward-fill runs over the whole frame (not per storm) and
# after rows were filtered out upstream; if a row carrying the date was
# dropped, the time-only rows below it inherit an earlier date — confirm.
iso_text = df['ISO TIME']
full_dates = iso_text.str.extract(r'(\d{4}-\d{2}-\d{2})')[0].ffill()
time_only_mask = ~iso_text.str.contains(r'\d{4}-\d{2}-\d{2}', na=False)
rebuilt = full_dates[time_only_mask] + ' ' + iso_text[time_only_mask]
df.loc[time_only_mask, 'ISO TIME'] = rebuilt
df['ISO TIME'] = pd.to_datetime(df['ISO TIME'])

In [6]:
# Calendar features taken from the parsed timestamp.
stamp = df['ISO TIME'].dt
df['MONTH'] = stamp.month
df['HOUR'] = stamp.hour
df['DAY OF YEAR'] = stamp.dayofyear

# Position of each record within its storm, scaled by 3.
# NOTE(review): this treats consecutive records as exactly 3 hours apart;
# it is derived from row order, not from 'ISO TIME' — confirm spacing.
record_rank = df.groupby('STORM ID').cumcount()
df['TIME FROM GENESIS'] = record_rank * 3

## Features cho chuyển động bão

In [7]:
# Decompose the motion vector: speed * sin(heading) and speed * cos(heading),
# with the heading converted from degrees to radians first.
rad = np.pi / 180.0
heading_rad = df['STORM DIR'] * rad
forward_speed = df['STORM SPEED']
df['MOVE X'] = forward_speed * np.sin(heading_rad)
df['MOVE Y'] = forward_speed * np.cos(heading_rad)

In [8]:
# Per-storm change in speed and heading between consecutive records, divided
# by 3 (the record spacing in hours assumed throughout this notebook). The
# first record of each storm has no predecessor and gets 0.
storm_groups = df.groupby('STORM ID')
speed_delta = storm_groups['STORM SPEED'].diff().fillna(0)
heading_delta = storm_groups['STORM DIR'].diff().fillna(0)

# Headings are angles in degrees, so a raw difference is wrong across the
# 0/360 boundary (350 -> 10 would read as -340 instead of +20). Differences
# larger than half a turn are wrapped back into [-180, 180); all other values
# are left untouched so they stay bit-identical to the plain subtraction.
wrapped_delta = (heading_delta + 180) % 360 - 180
heading_delta = heading_delta.where(heading_delta.abs() <= 180, wrapped_delta)

df['ACCELERATION'] = speed_delta / 3
df['TURN RATE'] = heading_delta / 3

## Lag features cho từng cơn bão

In [9]:
# Previous-record values of position and motion, shifted within each storm
# so that a lag never reaches into a different storm.
lags = [1, 2, 3]  # 3, 6, 9 hours ago
for lag in lags:
  previous = df.groupby('STORM ID')[['LAT', 'LON', 'STORM SPEED', 'STORM DIR']].shift(lag)
  df[f'LAT LAG {lag}'] = previous['LAT']
  df[f'LON LAG {lag}'] = previous['LON']
  df[f'SPEED LAG {lag}'] = previous['STORM SPEED']
  df[f'DIR LAG {lag}'] = previous['STORM DIR']

## Features cho sự thay đổi vị trí

In [10]:
# Change in position since the previous record of the same storm.
df['LAT CHANGE'] = df['LAT'] - df['LAT LAG 1']

# Longitude is an angle: a track crossing the antimeridian (e.g. 179 -> -179)
# would otherwise show a jump of -358 degrees instead of +2. Only differences
# beyond half a turn are wrapped into [-180, 180); ordinary values are kept
# exactly as the plain subtraction produces them.
lon_delta = df['LON'] - df['LON LAG 1']
lon_delta_wrapped = (lon_delta + 180) % 360 - 180
df['LON CHANGE'] = lon_delta.where(lon_delta.abs() <= 180, lon_delta_wrapped)

In [11]:
def haversine_distance(lat1, lon1, lat2, lon2, radius=6371):
  """Great-circle distance between two points using the haversine formula.

  Args:
    lat1, lon1: Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2: Latitude and longitude of the second point, in decimal degrees.
    radius: Sphere radius; the result is expressed in the same unit.
      Defaults to 6371 (Earth radius in km).

  Returns:
    The distance along the sphere's surface. Inputs may be scalars, NumPy
    arrays or pandas Series; the computation is element-wise and NaN inputs
    yield NaN outputs.
  """
  lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
  half_dlat = (lat2 - lat1) / 2
  half_dlon = (lon2 - lon1) / 2
  a = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2
  # Floating-point rounding can push `a` marginally outside [0, 1] (notably
  # for near-antipodal points), which would make arcsin return NaN.
  a = np.clip(a, 0.0, 1.0)
  return 2 * radius * np.arcsin(np.sqrt(a))

In [12]:
# Great-circle distance from the previous record of the same storm to the
# current one, and that distance divided by 3 (the record spacing in hours
# assumed throughout this notebook).
previous_lat, previous_lon = df['LAT LAG 1'], df['LON LAG 1']
df['DISTANCE'] = haversine_distance(previous_lat, previous_lon, df['LAT'], df['LON'])
df['REAL SPEED'] = df['DISTANCE'] / 3

## Features địa lý

In [13]:
# Absolute latitude, i.e. angular distance from the equator in degrees.
df['DIST EQUATOR'] = df['LAT'].abs()
# 1 when latitude is strictly positive, 0 otherwise (a latitude of 0 maps to 0).
df['NORTH HEMISPHERE'] = df['LAT'].gt(0).astype(int)
# 1 when the distance-to-land value is below 100, 0 otherwise.
df['NEAR LAND'] = df['DIST2LAND'].lt(100).astype(int)
# Forward speed kept only for near-land records (zero elsewhere).
df['STORM SPEED NEAR LAND'] = df['STORM SPEED'].mul(df['NEAR LAND'])

## Features xu hướng

In [14]:
# Trailing per-storm statistics over the last 3 and 6 records
# (the original note labels these windows 9h and 18h).
# NOTE(review): the standard deviation is taken on raw heading degrees, so a
# track oscillating around north (e.g. 355 / 5) shows a large spread — confirm
# this is acceptable for the model.
for window in (3, 6):
  speed_windows = df.groupby('STORM ID')['STORM SPEED'].rolling(
    window=window, min_periods=1
  )
  # The grouped rolling result is indexed by (storm id, original row label);
  # dropping the first level re-aligns it with df.
  df[f'SPEED MEAN {window}'] = speed_windows.mean().droplevel(0)

  heading_windows = df.groupby('STORM ID')['STORM DIR'].rolling(
    window=window, min_periods=1
  )
  df[f'DIR STD {window}'] = heading_windows.std().droplevel(0)

## Encoding categorical variables

In [15]:
# One-hot encode the categorical columns; each indicator column is named
# '<source column>_<value>' and the source columns are removed.
categorical_columns = ['BASIN', 'NATURE']
df = pd.get_dummies(
  df,
  columns=categorical_columns,
  prefix=categorical_columns,
)

## Target variables

In [16]:
# Targets: the storm's own position a fixed number of records ahead.
# Each label maps to a step count (2, 4, 8 records for 6H, 12H, 24H).
prediction_horizons = {'6H': 2, '12H': 4, '24H': 8}
for horizon, steps in prediction_horizons.items():
  # Negative shift pulls future rows back; grouping keeps it inside one storm.
  future_position = df.groupby('STORM ID')[['LAT', 'LON']].shift(-steps)
  df[f'NEXT LAT {horizon}'] = future_position['LAT']
  df[f'NEXT LON {horizon}'] = future_position['LON']

## Data cleaning

In [17]:
# Discard every record with a missing value in any column. This removes the
# leading records of each storm (no lag values) and the trailing ones
# (no future target), along with any row missing a raw field.
df = df.dropna(axis=0, how='any')

# Lưu file CSV

In [18]:
# Persist the engineered dataset; the row index is not written.
df.to_csv(path_or_buf='../data/ibtracs_dataset.csv', index=False)