# Weather Big Data Contest 2021

### Yonsei University - Hyunjoo Kim, Jiwon Lim, Hyejin Eum

## Modeling (A part of training & validation process with 2011-2019 data)

The values of y are all recorded as 0 for 2020 data. The actual values were not provided.

Data from 2011-2019 was used for presenting a part of training & validation process.

In [1]:
import pandas as pd
import numpy as np
import warnings
import sklearn.metrics
from matplotlib import font_manager, rc
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from pandas import Series, DataFrame
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
np.set_printoptions(threshold=np.inf)
warnings.filterwarnings(action='ignore')

In [2]:
# import data and min-max normalizing
data = pd.read_csv('data_210138.csv')
data.iloc[:,6:193] = MinMaxScaler().fit_transform(data.iloc[:,6:193])

data_part = data.loc[data['date'] < '2020-01-01',]

In [3]:
data_train = data_part.loc[data_part['date'].str.slice(stop=4) != '2019']
data_val = data_part.loc[data_part['date'].str.slice(stop=4) == '2019']

In [4]:
### Training data
# undersampling day1
X0 = data_train.loc[data_train['X1day_yn']==0,]
X1 = data_train.loc[data_train['X1day_yn']==1,]
non, X0_samp = train_test_split(X0, random_state = 601, test_size=0.05, shuffle=True, stratify=X0['year'])
data1 = pd.concat([X0_samp, X1])
data1 = data1.drop(['year'], axis=1)

# undersampling day2
X0_2 = data_train.loc[data_train['X2day_yn']==0,]
X1_2 = data_train.loc[data_train['X2day_yn']==1,]
non, X0_samp2 = train_test_split(X0_2, random_state = 601, test_size=0.025, shuffle=True, stratify=X0_2['year'])
data2 = pd.concat([X0_samp2, X1_2])
data2 = data2.drop(['year'], axis=1) 

x1_t = data1.iloc[:,6:193] # training data for predicting the occurrence of landslide within 1day
y1_t = data1.iloc[:,3]
x2_t = data2.iloc[:,6:193] # train data for predicting the occurrence of landslide within 2days
y2_t = data2.iloc[:,5]


### Validation data
# undersampling day1
X0_v = data_val.loc[data_val['X1day_yn']==0,]
X1_v = data_val.loc[data_val['X1day_yn']==1,]
non_v, X0_v_samp = train_test_split(X0_v, random_state = 601, test_size=0.05, shuffle=True)
data1_v = pd.concat([X0_v_samp, X1_v])
data1_v = data1_v.drop(['year'], axis=1)

# undersampling day2
X0_v_2 = data_val.loc[data_val['X2day_yn']==0,]
X1_v_2 = data_val.loc[data_val['X2day_yn']==1,]
non_v, X0_v_samp2 = train_test_split(X0_v_2, random_state = 601, test_size=0.05, shuffle=True)
data2_v = pd.concat([X0_v_samp2, X1_v_2])
data2_v = data2_v.drop(['year'], axis=1)

x1_v = data1_v.iloc[:,6:193] # validation data for predicting the occurrence of landslide within 1day
y1_v = data1_v.iloc[:,3]
x2_v = data2_v.iloc[:,6:193] # validation data for predicting the occurrence of landslide within 2days
y2_v = data2_v.iloc[:,5]

In [5]:
x1_t.shape

(93316, 187)

In [6]:
x1_v.shape

(11717, 187)

### Lasso regression - an example of predicting the occurrence of a landslide within 24-hour

In [7]:
from sklearn.linear_model import LogisticRegression

In [10]:
logisticRegr1 = LogisticRegression(C = 0.1, penalty='l1', solver='saga')
logisticRegr1.fit(x1_t, y1_t)

logisticRegr1_val = logisticRegr1.predict_proba(x1_v)[:,1]
day1_lr = (logisticRegr1_val >= 0.5).astype(int)

In [12]:
# CSI
print(confusion_matrix(y1_v, day1_lr)) # 52/(4+44+52) = 0.52
print(accuracy_score(y1_v, day1_lr))

[[11617     4]
 [   44    52]]
0.9959033882393103


### LightGBM -  an example of predicting the occurrence of a landslide within 24-hour

In [13]:
import lightgbm as lgb

In [14]:
train_gbm1 = lgb.Dataset(x1_t, label=y1_t)
params = {'learning_rate':0.01,
          'max_depth':4,
          'boosting':'gbdt',
          'objective':'binary',
          'metric':'binary_logloss',
          'is_training_metric':True,
          'num_leaves':20,
          'feature_fraction':0.9,
          'bagging_fraction':0.7,
          'bagging_freq':10,
          'seed':601}
num_round = 100

# training
lgbm1 = lgb.train(params, train_gbm1, num_round)

# validation
lgbm1_val = lgbm1.predict(x1_v)

[LightGBM] [Info] Number of positive: 263, number of negative: 93053
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7080
[LightGBM] [Info] Number of data points in the train set: 93316, number of used features: 171
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002818 -> initscore=-5.868770
[LightGBM] [Info] Start training from score -5.868770


In [15]:
day1_lgbm = pd.Series(lgbm1_val).apply(lambda x: 1 if x>=0.5 else 0)

# CSI
print(confusion_matrix(y1_v, day1_lgbm)) # 5/(2+91+5) = 0.05
print(accuracy_score(y1_v, day1_lgbm))

[[11619     2]
 [   91     5]]
0.9920628147136639


### SVM - an example of predicting the occurrence of a landslide within 48-hour

In [16]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV

In [21]:
svm_param_grid = {'C': [0.05,0.1], 'gamma': [10,1],'kernel': ['rbf', 'poly', 'sigmoid']}

In [22]:
svm_grid = GridSearchCV(svm.SVC(), svm_param_grid, refit=True, verbose=2)
svm_grid.fit(x2_t,y2_t)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .......................C=0.05, gamma=10, kernel=rbf; total time=  34.7s
[CV] END .......................C=0.05, gamma=10, kernel=rbf; total time=  35.6s
[CV] END .......................C=0.05, gamma=10, kernel=rbf; total time=  37.6s
[CV] END .......................C=0.05, gamma=10, kernel=rbf; total time=  36.8s
[CV] END .......................C=0.05, gamma=10, kernel=rbf; total time=  37.6s
[CV] END ......................C=0.05, gamma=10, kernel=poly; total time=   3.0s
[CV] END ......................C=0.05, gamma=10, kernel=poly; total time=   2.9s
[CV] END ......................C=0.05, gamma=10, kernel=poly; total time=   3.1s
[CV] END ......................C=0.05, gamma=10, kernel=poly; total time=   2.7s
[CV] END ......................C=0.05, gamma=10, kernel=poly; total time=   2.9s
[CV] END ...................C=0.05, gamma=10, kernel=sigmoid; total time=   1.4s
[CV] END ...................C=0.05, gamma=10, ke

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.05, 0.1], 'gamma': [10, 1],
                         'kernel': ['rbf', 'poly', 'sigmoid']},
             verbose=2)

In [24]:
print(svm_grid.best_estimator_)

SVC(C=0.05, gamma=10)


In [25]:
day2_svm = svm_grid.predict(x2_v)
print(confusion_matrix(y2_v, day2_svm)) # 0
print(accuracy_score(y2_v, day2_svm))

[[11621     0]
 [   96     0]]
0.9918067764786208
