## 1. Library Import & Set Random State

In [1]:
import numpy as np
import random
import os

import pandas as pd
import xgboost as xgb
from imblearn.over_sampling import SMOTE

import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('font', family = 'Gulim')
mpl.rcParams['axes.unicode_minus']=False

import warnings
warnings.filterwarnings('ignore')

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

## 2. Define CSI Score function

In [2]:
def CSI(y_test, y_pred):
    matrix = {
        'H11': 0, 'H22': 0, 'H33': 0,
        'F12': 0, 'F13': 0, 'F14': 0,
        'F21': 0, 'F23': 0, 'F24': 0,
        'F31': 0, 'F32': 0, 'F34': 0,
        'F41': 0, 'F42': 0, 'F43': 0,
        'M14': 0, 'M24': 0, 'M34': 0
    }

    # 매트릭스 채우기
    for true, pred in zip(y_test, y_pred):
        if true == pred:
            if true == 1:
                matrix['H11'] += 1
            elif true == 2:
                matrix['H22'] += 1
            elif true == 3:
                matrix['H33'] += 1
        else:
            if true == 1:
                if pred == 2:
                    matrix['F12'] += 1
                elif pred == 3:
                    matrix['F13'] += 1
                elif pred == 4:
                    matrix['F14'] += 1
            elif true == 2:
                if pred == 1:
                    matrix['F21'] += 1
                elif pred == 3:
                    matrix['F23'] += 1
                elif pred == 4:
                    matrix['F24'] += 1
            elif true == 3:
                if pred == 1:
                    matrix['F31'] += 1
                elif pred == 2:
                    matrix['F32'] += 1
                elif pred == 4:
                    matrix['F34'] += 1
            elif true == 4:
                if pred == 1:
                    matrix['F41'] += 1
                elif pred == 2:
                    matrix['F42'] += 1
                elif pred == 3:
                    matrix['F43'] += 1

    # H, F, M 계산
    H = matrix['H11'] + matrix['H22'] + matrix['H33']
    F = (matrix['F12'] + matrix['F13'] + matrix['F14'] +
        matrix['F21'] + matrix['F23'] + matrix['F24'] +
        matrix['F31'] + matrix['F32'] + matrix['F34'] +
        matrix['F42'] + matrix['F43'])
    M = matrix['M14'] + matrix['M24'] + matrix['M34']

    # CSI 계산
    CSI = H / (H + F + M)
    print(f'CSI: {CSI}')

## 3. Data Pre-processing & Making Derived Variables

In [3]:
train = pd.read_csv('data/fog_train.csv')
test = pd.read_csv('data/fog_test.csv')

# change column names in train data
cols = train.columns[1:].to_list()
train = train[cols]
for i in range(len(cols)):
    cols[i] = cols[i].split('.')[-1]
train.columns = cols

# change column names in test data
cols = test.columns[1:].to_list()
test = test[cols]
for i in range(len(cols)):
    cols[i] = cols[i].split('.')[-1]
test.columns = cols

for col in train.columns:
    train.loc[train[col]== -99.9, col] = np.nan
    train.loc[train[col]==-99, col] = np.nan
for col in test.columns:
    test.loc[test[col]== -99.9, col] = np.nan
    test.loc[test[col]==-99, col] = np.nan

for i in range(len(train['year'].unique())):
    year = train['year'].unique()[i]
    train.loc[train['year'] == year, 'year'] = 2000 + i
test['year'] = 2003

# fill missing value
interpolate_columns = ['ws10_deg', 'ws10_ms', 'ta', 'hm', 'sun10', 'ts']
for loc in train['stn_id'].unique():
    for col in interpolate_columns:
        train.loc[train['stn_id'] == loc, col] = train.loc[train['stn_id'] == loc, col].interpolate(method = 'linear')
    train.loc[train['stn_id'] == loc, interpolate_columns] = train.loc[train['stn_id'] == loc, interpolate_columns].fillna(method = 'ffill')
    train.loc[train['stn_id'] == loc, interpolate_columns] = train.loc[train['stn_id'] == loc, interpolate_columns].fillna(method = 'bfill')
train['re'] = train['re'].fillna(0)

for loc in test['stn_id'].unique():
    for col in interpolate_columns:
        test.loc[test['stn_id'] == loc, col] = test.loc[test['stn_id'] == loc, col].interpolate(method = 'linear')
    test.loc[test['stn_id'] == loc, interpolate_columns] = test.loc[test['stn_id'] == loc, interpolate_columns].fillna(method = 'ffill')
    test.loc[test['stn_id'] == loc, interpolate_columns] = test.loc[test['stn_id'] == loc, interpolate_columns].fillna(method = 'bfill')
test['re'] = test['re'].fillna(0)

# make derived variables
train['dew'] = (train.hm/100)**(1/8)*(112+0.9+train.ta)+(0.1*train.ta)-112
test['dew'] = (test.hm/100)**(1/8)*(112+0.9+test.ta)+(0.1*test.ta)-112

train['taVSdew'] = train['ta'] - train['dew']
test['taVSdew'] = test['ta'] - test['dew']

train['tsVSta'] = train['ts'] - train['ta']
test['tsVSta'] = test['ts'] - test['ta']

train = train.dropna(subset = ['class', 'vis1'])
train = train.fillna(0)
test = test.fillna(0)

for stn in train['stn_id'].unique():
    stn_id = stn[0]
    train.loc[train['stn_id']==stn, 'stn_id'] = stn_id
    
for stn in test['stn_id'].unique():
    stn_id = stn[0]
    test.loc[test['stn_id']==stn, 'stn_id'] = stn_id

train.loc[train["stn_id"] == "A", "stn_id"] = 0
train.loc[train["stn_id"] == "B", "stn_id"] = 1
train.loc[train["stn_id"] == "C", "stn_id"] = 2
train.loc[train["stn_id"] == "D", "stn_id"] = 3
train.loc[train["stn_id"] == "E", "stn_id"] = 4
train["stn_id"] = train["stn_id"].astype(int)

test.loc[test["stn_id"] == "A", "stn_id"] = 0
test.loc[test["stn_id"] == "B", "stn_id"] = 1
test.loc[test["stn_id"] == "C", "stn_id"] = 2
test.loc[test["stn_id"] == "D", "stn_id"] = 3
test.loc[test["stn_id"] == "E", "stn_id"] = 4
test["stn_id"] = test["stn_id"].astype(int)

train.shape, test.shape

((3133943, 18), (262800, 17))

## 4. Method
### (1) Oversampling for classifying 4

In [4]:
train_for_find_4 = train.copy()
test_for_find_4 = test.copy()

train_for_find_4.loc[train_for_find_4["class"]!=4, "class"] = 0
train_for_find_4.loc[train_for_find_4["class"]==4, "class"] = 1

X = train_for_find_4.drop(["class", "vis1"], axis=1)
y = train_for_find_4["class"]

smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_res, y_res = smote.fit_resample(X, y)
y_res.value_counts()

class
1.0    3101809
0.0    1550904
Name: count, dtype: int64

### (2) Classification 4

In [5]:
model = xgb.XGBClassifier(random_state=42)
model.fit(X_res, y_res)

pred_for_val = model.predict(train.drop(["vis1", "class"], axis=1))
pred_for_val_df = pd.DataFrame(pred_for_val)

pred_for_4 = model.predict(test.drop("class", axis=1))
test["class"] = pred_for_4
test["class"].value_counts()

class
1    260646
0      2154
Name: count, dtype: int64

### (3) Making Derived Variable with "re" for classifying 1, 2, 3

In [6]:
test.loc[test["class"]==1, "class"] = 4
_idx = test.loc[test["class"] != 4].index

temp_train = pd.read_csv('data/fog_train.csv', encoding = 'euc-kr')
temp_test = pd.read_csv('data/fog_test.csv', encoding = 'euc-kr')
# change column names it train data
cols = temp_train.columns[1:].to_list()
temp_train = temp_train[cols]
for i in range(len(cols)):
    cols[i] = cols[i].split('.')[-1]
temp_train.columns = cols
# change column names in test data
cols = temp_test.columns[1:].to_list()
temp_test = temp_test[cols]
for i in range(len(cols)):
    cols[i] = cols[i].split('.')[-1]
temp_test.columns = cols

for i in range(len(train['year'].unique())):
    year = temp_train['year'].unique()[i]
    temp_train.loc[temp_train['year'] == year, 'year'] = 2000 + i
temp_test['year'] = 2003

temp_train['time'] = temp_train['year'].astype(int).astype(str) + '-' + temp_train['month'].astype(int).astype(str) + '-' + temp_train['day'].astype(int).astype(str) + ' ' + temp_train['time'].astype(int).astype(str) + ':' + temp_train['minute'].astype(int).astype(str)
temp_train['datetime'] = pd.to_datetime(temp_train['time'])

temp_test['time'] = temp_test['year'].astype(int).astype(str) + '-' + temp_test['month'].astype(int).astype(str) + '-' + temp_test['day'].astype(int).astype(str) + ' ' + temp_test['time'].astype(int).astype(str) + ':' + temp_test['minute'].astype(int).astype(str)
temp_test['datetime'] = pd.to_datetime(temp_test['time'])

temp_cols = ['datetime', 'stn_id', 're']
temp_train = temp_train[temp_cols]
temp_test = temp_test[temp_cols]

temp_train['re'] = temp_train['re'].fillna(0)
temp_test['re'] = temp_test['re'].fillna(0)

lag_hours = [3, 18, 21, 24]
for lag in lag_hours:
    temp_train[f're_{lag}'] = 0
    temp_test[f're_{lag}'] = 0

for stn in temp_train['stn_id'].unique():
    temp = temp_train.loc[temp_train['stn_id']==stn, 're'].tolist()
    for lag in lag_hours:
        temp_lag = []
        for i in range(len(temp)):
            if i - lag*6 < 0:
                temp_lag.append(max(temp[:i+1]))
            else:
                temp_lag.append(max(temp[i-lag:i+1]))
        temp_train.loc[temp_train['stn_id']==stn, f're_{lag}'] = temp_lag

for stn in temp_test['stn_id'].unique():
    temp = temp_test.loc[temp_test['stn_id']==stn, 're'].tolist()
    for lag in lag_hours:
        temp_lag = []
        for i in range(len(temp)):
            if i - lag*6 < 0:
                temp_lag.append(max(temp[:i+1]))
            else:
                temp_lag.append(max(temp[i-lag:i+1]))
        temp_test.loc[temp_test['stn_id']==stn, f're_{lag}'] = temp_lag

_train = train.copy()

for lag in lag_hours:
    _train[f're_{lag}'] = temp_train.loc[_train.index, f're_{lag}']
    test[f're_{lag}'] = temp_test.loc[test.index, f're_{lag}']
__train = _train.copy()

_train = _train.loc[_train["class"]!=4]
_train.loc[train["class"] == 1, "class"] =0
_train.loc[train["class"] == 2, "class"] =1
_train.loc[train["class"] == 3, "class"] =2
_train = _train.reset_index(drop=True)

X = _train.drop(["class", "vis1"], axis=1)
y = _train["class"].astype(int)

### (4) Classification 1, 2, 3

In [7]:
model = xgb.XGBClassifier(random_state=42)
model.fit(X,y)

val_idx = pred_for_val_df.loc[pred_for_val_df[0] == 0].index
pred_for_val_df += 3
valid_idx = train.index.intersection(val_idx)
val_train = __train.loc[valid_idx, :]
pred_for_val_multi = model.predict(val_train.drop(["vis1", "class"], axis=1))
pred_for_val_multi += 1
pred_for_val_df.loc[valid_idx, 0] = pred_for_val_multi

pred = model.predict(test.loc[_idx,:].drop("class", axis=1))
pred += 1
test.loc[_idx,"class"] = pred
test["class"].value_counts()

class
4    260646
2      1261
3       649
1       244
Name: count, dtype: int64

### (5) CSI Score

In [8]:
CSI(pred_for_val_df[0], train["class"])

CSI: 0.09631072162717733


### (6) Compare target data ratio

In [9]:
print(train["class"].value_counts() / len(train) * 100)
print()
print()
print(test["class"].value_counts() / len(test) * 100)

class
4.0    98.974646
3.0     0.388648
2.0     0.385712
1.0     0.250994
Name: count, dtype: float64


class
4    99.180365
2     0.479833
3     0.246956
1     0.092846
Name: count, dtype: float64


## 5. Make file for submit

In [10]:
sub = pd.read_csv('data/fog_test.csv')
sub["fog_test.class"] = test["class"]
sub.to_csv("240247.csv", index=False)