<a href="https://colab.research.google.com/github/Hanguyen-6715/Data-science-in-Astrophysics/blob/main/Enrich_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def _basic_fea(df):
    """Add basic temperature and precipitation features to ``df`` in place.

    Columns read: ``tempmax``, ``tempmin``, ``temp``, ``precip``.

    Columns written:
      - ``diurnal_temp_range``: ``tempmax - tempmin``.
      - ``average_temp``: midpoint of ``tempmax`` and ``tempmin``.
      - ``temp_fahrenheit``: ``temp`` converted from Celsius to Fahrenheit.
      - ``precip_days_gt_5mm``: number of rows whose ``precip`` exceeds 5;
        the same scalar count is written to every row.

    Returns the same DataFrame object.
    """
    df['diurnal_temp_range'] = df['tempmax'] - df['tempmin']
    # Mean of the two extremes is (max + min) / 2; subtracting would give
    # half the diurnal range instead.
    df['average_temp'] = (df['tempmax'] + df['tempmin']) / 2
    df['temp_fahrenheit'] = df['temp'] * 9/5 + 32  # Celsius -> Fahrenheit

    # Scalar count over the whole frame, broadcast to all rows.
    df['precip_days_gt_5mm'] = (df['precip'] > 5).sum()
    return df


# Heat index


In [None]:
def _heat_index(RH, T):
    """
    Compute the heat index from the Rothfusz regression plus its two
    humidity adjustments.

    RH: relative humidity in % (scalar)
    T: air temperature in fahrenheit (scalar)
    unit HI: F

    NOTE(review): the regression is documented by the NWS as not appropriate
    when the resulting heat index is below about 80 F; no low-temperature
    fallback is applied here.
    """
    # Basic heat index (regression polynomial in T and RH)
    HI = -42.379 + 2.04901523*T + 10.14333127*RH - 0.22475541*T*RH - \
            0.00683783*T*T - 0.05481717*RH*RH + 0.00122874*T*T*RH + \
            0.00085282*T*RH*RH - 0.00000199*T*T*RH*RH

    # Low-humidity adjustment (subtracted). It is only defined for 80-112 F:
    # above 112 F the term under the square root turns negative and the
    # result would become NaN.
    if (RH < 13) and (80 <= T <= 112):
        ADJUSTMENT = ((13-RH)/4) * np.sqrt((17 - np.abs(T-95.)) / 17)
        HI = HI - ADJUSTMENT

    # High-humidity adjustment (added).
    if (RH > 85) and (80 <= T <= 87):
        ADJUSTMENT = ((RH-85)/10) * ((87-T)/5)
        HI = HI + ADJUSTMENT

    return HI


In [None]:
def _level_HI(df):
    """
    Label each row of ``df`` with a heat-index risk level in column ``level_HI``.

    Reads column ``heat_index`` (unit: F) and writes, in place:
    + safe: below 80
    + caution [80, 90): Fatigue possible with prolonged exposure and/or physical activity
    + extreme caution [90, 103): Heat stroke, heat cramps, or heat exhaustion possible
    + danger [103, 125): Heat cramps or heat exhaustion likely, and heat stroke possible
    + extreme danger [125, ...): Heat stroke highly likely

    Rows whose ``heat_index`` is missing match no band and are left unlabeled.
    Returns the same DataFrame.
    """
    hi = df['heat_index']

    df.loc[hi < 80, 'level_HI'] = 'safe'
    df.loc[hi.between(80, 90, inclusive='left'), 'level_HI'] = 'caution'
    df.loc[hi.between(90, 103, inclusive='left'), 'level_HI'] = 'extreme caution'
    # Upper edge is 125 so the band meets 'extreme danger' with no gap;
    # stopping at 124 left values in [124, 125) without any label.
    df.loc[hi.between(103, 125, inclusive='left'), 'level_HI'] = 'danger'
    df.loc[hi >= 125, 'level_HI'] = 'extreme danger'

    return df


# Wind chill

In [None]:
def _windchill(WS, T):
    """
    Wind-chill temperature from air temperature and wind speed.

    T: air temperature, celsius
    WS: wind speed, km/h
    Returns the wind-chill value computed by the formula below.
    """
    # The wind enters the formula only through WS raised to the power 0.16.
    wind_factor = pow(WS, 0.16)
    return 13.127 + 0.6215*T - 11.362*wind_factor + 0.396*T*wind_factor


In [None]:
def _wind_level(x):
    """Map a wind speed to its wind level (0-18).

    unit: km/h

    Each level starts at the lower bound listed below and extends up to the
    next level's lower bound, so fractional speeds that fall between two
    integer ranges (e.g. 5.5 km/h) are assigned to the lower level instead
    of being left unclassified. Whole-number speeds map exactly as in the
    integer range table (0 -> 0, 1-5 -> 1, 6-11 -> 2, ..., 221+ -> 18).

    Returns the level as an int, or None when ``x`` is negative or NaN.
    """
    # Lowest speed (km/h) of each level; the position in the tuple is the level.
    lower_bounds = (
        0, 1, 6, 12, 20, 29, 39, 50, 62, 75,
        89, 103, 118, 134, 150, 167, 184, 202, 221,
    )

    # Bounds are ascending, so the number of bounds reached identifies the
    # level. NaN and negative speeds reach none of them.
    reached = sum(1 for low in lower_bounds if x >= low)
    if reached == 0:
        return None
    return reached - 1

In [None]:
def categorize_wind_level(df, wind_level_col='wind_level', output_col='ten_cap_bao'):
    """
    Translate numeric wind levels into storm-category labels.

    For every row whose ``wind_level_col`` value appears in one of the level
    groups below, the matching label is written to ``output_col`` in place.
    Rows with any other value are not assigned. Returns the same DataFrame.
    """
    # (levels, label) pairs, applied top to bottom.
    categories = (
        ([0, 1, 2, 3], 'L'),
        ([4], 'vung ap thap'),
        ([5], 'vung ap thap duoc cap so hieu'),
        ([6, 7], 'ap thap nhiet doi'),
        ([8, 9], 'bao'),
        ([10, 11], 'bao manh'),
        ([12, 13, 14, 15], 'bao rat manh'),
        ([16, 17, 18], 'sieu bao'),
    )

    for levels, label in categories:
        matches = df[wind_level_col].isin(levels)
        df.loc[matches, output_col] = label

    return df

# Dew point

In [None]:
def _dew_point(RH, T):
    """
    Dew point from relative humidity and air temperature.

    RH - Humidity: %
    T: Celsius
    Returns the dew point computed from the two constants below
    (the second constant is expressed in Celsius).
    """
    coef_a, coef_b = 17.625, 243.04  # coef_b is in Celsius
    # Intermediate term combining the humidity fraction and the temperature.
    gamma = np.log(RH/100) + coef_a*T/(coef_b+T)
    return (coef_b * gamma) / (coef_a - gamma)


# Nang Nong

In [5]:
def _level_nangnong(df):
    """Classify hot-weather severity from the daily maximum temperature.

    Reads column ``tempmax`` and writes, in place:
      - ``level_nangnong``: 'nang nong' for [35, 37), 'nang nong gay gat'
        for [37, 39), 'nang nong dac biet gay gat' for 39 and above.
        Rows below 35 (or with missing ``tempmax``) are not assigned.
      - ``is_T35``: 1 when ``tempmax`` >= 35, else 0.
      - ``is_T37``: 1 when ``tempmax`` >= 37, else 0.

    Returns the same DataFrame.
    """
    tmax = df['tempmax']

    # Every band is taken from the frame passed in; left-closed intervals
    # keep the three bands from overlapping at 37 and 39.
    df.loc[tmax.between(35, 37, inclusive='left'), 'level_nangnong'] = 'nang nong'
    df.loc[tmax.between(37, 39, inclusive='left'), 'level_nangnong'] = 'nang nong gay gat'
    df.loc[tmax >= 39, 'level_nangnong'] = 'nang nong dac biet gay gat'

    # Threshold flags as 0/1 integers (missing values compare False -> 0).
    df['is_T35'] = (tmax >= 35).astype(int)
    df['is_T37'] = (tmax >= 37).astype(int)
    return df


In [6]:
def _area_T35(group):
    """Return 'dien rong' or 'cuc bo' for one group of rows.

    The group is 'dien rong' when the sum of its ``is_T35`` flags is at
    least half the number of distinct ``locationId`` values in the group;
    otherwise it is 'cuc bo'.
    """
    flagged = group['is_T35'].sum()
    half_of_locations = group['locationId'].nunique() / 2
    return 'dien rong' if flagged >= half_of_locations else 'cuc bo'


def _area_T37(group):
    """Return the severe-heat extent label for one group of rows.

    Uses the group's ``area_nangnong`` value (taken from its first row) and
    the sums of the ``is_T35`` and ``is_T37`` flags:

      - 'dien rong': 'gay gat dien rong' when the ``is_T37`` sum is at least
        half the ``is_T35`` sum, otherwise 'gay gat cuc bo'.
      - 'cuc bo': None when the ``is_T37`` sum is at least half the
        ``is_T35`` sum (local heat is never reported as widespread severe
        heat), otherwise 'gay gat cuc bo'.
      - any other value: None.

    NOTE(review): 'gay gat cuc bo' is also returned when the ``is_T37`` sum
    is zero - confirm that this is the intended rule.
    """
    # All rows of a group share the same extent label; read it from the first.
    extent = group['area_nangnong'].iloc[0]

    hot_total = group['is_T35'].sum()
    severe_total = group['is_T37'].sum()
    # At least half of the rows flagged >= 35 are also flagged >= 37.
    severe_majority = severe_total >= (hot_total / 2)

    if extent == 'dien rong':
        return 'gay gat dien rong' if severe_majority else 'gay gat cuc bo'

    if extent == 'cuc bo' and not severe_majority:
        return 'gay gat cuc bo'

    # Either a local group with a severe majority, or an unknown extent label.
    return None

In [7]:
def _heatwave(group):
    """
    Flag heat-wave rows within one group.

    Heat wave: widespread heat ('dien rong' in ``area_nangnong``) occurring
    on 2 or more consecutive rows of the group.

    Adds two columns to a copy of ``group`` and returns that copy:
      - ``heatwave_period``: run id that increases each time the
        widespread/not-widespread state changes from one row to the next.
      - ``dot_nang_nong``: True for rows that are widespread and belong to a
        run of at least 2 rows.

    NOTE(review): consecutive rows are assumed to be consecutive days in
    chronological order - confirm the caller sorts by date.
    """
    # Work on a copy: functions passed to groupby.apply must not mutate the
    # object they receive.
    group = group.copy()

    is_widespread = group['area_nangnong'] == 'dien rong'

    # A new run starts wherever the 0/1 state differs from the previous row
    # (the first row always starts a run because its diff is NaN).
    run_id = is_widespread.astype(int).diff().ne(0).cumsum()
    run_length = run_id.groupby(run_id).transform('size')

    group['heatwave_period'] = run_id
    group['dot_nang_nong'] = (run_length >= 2) & is_widespread

    return group


# Precip

In [None]:
def _level_precip(df, precip_col='precip', output_col='level_precip'):
    """
    Categorize the level of precipitation.

    unit: mm/12h

    Reads ``precip_col`` and writes a label to ``output_col`` in place, using
    left-closed bands:
      - below 0.3: 'mua nho, k dang ke'
      - [0.3, 3): 'mua nho'
      - [3, 8): 'mua'
      - [8, 25): 'mua vua'
      - [25, 50): 'mua to'
      - 50 and above: 'mua rat to'

    Rows with a missing value match no band and are left unlabeled.
    Returns the same DataFrame.
    """
    amount = df[precip_col]

    df.loc[amount < 0.3, output_col] = 'mua nho, k dang ke'
    df.loc[amount.between(0.3, 3, inclusive='left'), output_col] = 'mua nho'
    df.loc[amount.between(3, 8, inclusive='left'), output_col] = 'mua'
    df.loc[amount.between(8, 25, inclusive='left'), output_col] = 'mua vua'
    df.loc[amount.between(25, 50, inclusive='left'), output_col] = 'mua to'
    # '>=' so that exactly 50, which the previous band excludes, is labeled.
    df.loc[amount >= 50, output_col] = 'mua rat to'

    return df

# Main

In [None]:
# Basic derived columns
df = _basic_fea(df)

# Apparent temperature (heat index), row by row from humidity and Fahrenheit temperature
df['heat_index'] = df.apply(
    lambda rec: _heat_index(rec['humidity'], rec['temp_fahrenheit']),
    axis=1,
)

# Risk level of the heat index
df = _level_HI(df)

# Wind chill, row by row from wind speed and temperature
df['windchill'] = df.apply(
    lambda rec: _windchill(rec['windspeed'], rec['temp']),
    axis=1,
)

# Wind level from wind speed
df['wind_level'] = df['windspeed'].apply(_wind_level)

# Storm category from wind level
df = categorize_wind_level(df, wind_level_col='wind_level', output_col='ten_cap_bao')

# Dew point, row by row from humidity and temperature
df['dew_point'] = df.apply(
    lambda rec: _dew_point(rec['humidity'], rec['temp']),
    axis=1,
)

# Precipitation level
df = _level_precip(df, 'precip', 'level_precip')

In [None]:
# Hot-weather ("nang nong") severity from the daily maximum temperature.
# The three assignments run in order, so a value sitting on a shared edge
# (37 or 39) ends up with the label of the later, hotter band.
# NOTE(review): these lines repeat the body of _level_nangnong defined earlier.
df.loc[df['tempmax'].between(35,37), 'level_nangnong'] = 'nang nong'
df.loc[df['tempmax'].between(37,39), 'level_nangnong'] = 'nang nong gay gat'
df.loc[df['tempmax'] >= 39, 'level_nangnong'] = 'nang nong dac biet gay gat'

# 0/1 flags for tempmax reaching 35 and 37
df['is_T35'] = df['tempmax'].apply(lambda x: 1 if x >= 35 else 0)
df['is_T37'] = df['tempmax'].apply(lambda x: 1 if x >= 37 else 0)


# Apply _area_T35 to each (locationNameLv1, date) group, then attach the
# result to every row of that group as 'area_nangnong'.
area_T35_values = df.groupby(['locationNameLv1', 'date']).apply(_area_T35).reset_index(name='area_nangnong')
df = df.merge(area_T35_values, on=['locationNameLv1', 'date'], how='left')

# Apply _area_T37 to the same groups (it reads the 'area_nangnong' column
# merged just above) and attach the result as 'area_level_nangnong'.
area_T37_values = df.groupby(['locationNameLv1', 'date']).apply(_area_T37).reset_index(name='area_level_nangnong')
df = df.merge(area_T37_values, on=['locationNameLv1', 'date'], how='left')

# Rows without a 'level_nangnong' label get no area labels either.
df.loc[df['level_nangnong'].isna(), 'area_nangnong'] = None
df.loc[df['level_nangnong'].isna(), 'area_level_nangnong'] = None

# Identify heat-wave periods per (locationNameLv1, locationId) group.
# NOTE(review): _heatwave compares each row with the previous one, so rows
# are presumably expected to be in date order within each group - confirm.
df1 = df.groupby(['locationNameLv1', 'locationId']).apply(_heatwave).reset_index(drop=True)