In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score

In [2]:
floods = pd.read_excel('Flood Events Final.xlsx')
floods = floods.drop(columns=['Unnamed: 0'])
floods.head()

Unnamed: 0,Date,Bernam River,Selangor River,Buloh River,Klang River,Langat River,Sepang River
0,2001-01-25,0,0,0,1,0,0
1,2001-02-19,1,0,0,0,0,0
2,2001-04-03,0,0,0,1,1,0
3,2001-04-09,0,0,0,1,1,0
4,2001-04-13,0,0,0,1,0,0


In [3]:
from datetime import date, timedelta

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

start_date = date(2001, 1, 1)
end_date = date(2011, 1, 1)
dates = []
for single_date in daterange(start_date, end_date):
    dates.append(single_date.strftime("%Y-%m-%d"))

nonFloods = pd.DataFrame({
    'Date':dates,
    'Bernam River':0,
    'Selangor River':0,
    'Buloh River':0,
    'Klang River':0,
    'Langat River':0,
    'Sepang River':0,
})

nonFloods['Date'] = pd.to_datetime(nonFloods['Date'], format='%Y-%m-%d')
nonFloods.shape

(3652, 7)

In [4]:
floods = floods.append(nonFloods, ignore_index=True)
floods = floods.groupby(['Date']).sum().reset_index()

  floods = floods.append(nonFloods, ignore_index=True)


In [19]:
floodsFixed = []
for i in range(3652):
    floodsFixed.append(np.array(floods.iloc[i]))

In [5]:
hqprecip = pd.read_csv('HQprecipitation Data.csv')

for i in range(3652):
    hqprecip['time'][i] = hqprecip['time'][i][:10]
    
hqprecip['time'] = pd.to_datetime(hqprecip['time'], format='%Y-%m-%d')
hqprecip.head()

Unnamed: 0,time,HqPrecips
0,2001-01-01,"[1.744999885559082, 1.899999976158142, 1.55999..."
1,2001-01-02,"[6.5149993896484375, 3.3349997997283936, 0.779..."
2,2001-01-03,"[1.619999885559082, 1.125, 5.389999866485596, ..."
3,2001-01-04,"[3.509999990463257, 2.6349997520446777, 2.3450..."
4,2001-01-05,"[0.6399999856948853, 0.6349999904632568, 0.140..."


In [6]:
column_list = []
for i in range(225):
    colName = 'HqPrecip_'+str(i)
    column_list.append(colName)
    
column = []
for i in range(3652):
    values = (hqprecip['HqPrecips'][i])[1:-1].split(', ')
    for j in range(225):
        values[j] = float(values[j])
    column.append(values)

HqSplit = pd.DataFrame(np.column_stack(list(zip(*column))), columns=column_list)

In [7]:
floods = floods.drop(columns=['Date'])

### Multi-Layered Perceptron Test

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(HqSplit, floods, random_state=42)
clf = MLPClassifier(random_state=42, max_iter=300).fit(X_train, y_train)
clf.score(X_test, y_test)

0.8762322015334063

In [9]:
from sklearn import metrics

predictions = clf.predict(X_test)
cm = metrics.multilabel_confusion_matrix(y_test,predictions)
print(cm)

[[[908   0]
  [  5   0]]

 [[899   5]
  [  9   0]]

 [[906   0]
  [  7   0]]

 [[842  23]
  [ 43   5]]

 [[873   9]
  [ 31   0]]

 [[911   0]
  [  2   0]]]


### Label Powerset Attempt

In [33]:
floods['powerLabel'] = floods.apply(lambda x: 32*x['Bernam River']+16*x["Selangor River"]+8*x['Buloh River']+4*x['Klang River']+2*x['Langat River']+1*x['Sepang River'],axis=1)

In [76]:
floods['powerLabel'].isna().sum()

0

In [62]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier

# classifier = LabelPowerset(
#     classifier = RandomForestClassifier(n_estimators=100)
# )

classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)

predicitons = classifier.predict(X_test)

In [63]:
cm = metrics.multilabel_confusion_matrix(y_test,predictions)
print(cm)

[[[17552     4]
  [   46   779]]

 [[17547     0]
  [    0   834]]

 [[17524    17]
  [    0   840]]

 [[17544    19]
  [    4   814]]

 [[17527     3]
  [    0   851]]

 [[17578     2]
  [    0   801]]

 [[17558     0]
  [    0   823]]

 [[17514     0]
  [    0   867]]

 [[17551     0]
  [    0   830]]

 [[17502     3]
  [    0   876]]

 [[17496     0]
  [    0   885]]

 [[17507     0]
  [    0   874]]

 [[17535     0]
  [    0   846]]

 [[17579     0]
  [    0   802]]

 [[17560     2]
  [    0   819]]

 [[17531     0]
  [    0   850]]

 [[17564     0]
  [    0   817]]

 [[17581     0]
  [    0   800]]

 [[17580     0]
  [    0   801]]

 [[17510     0]
  [    0   871]]

 [[17553     0]
  [    0   828]]

 [[17558     0]
  [    0   823]]]


In [38]:
import imblearn
oversample = imblearn.over_sampling.SMOTE()
HqSplit, floods = oversample.fit_resample(HqSplit, floods['powerLabel'])

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 2, n_neighbors = 6

In [119]:
floods['powerLabel'].value_counts()

0     3342
4      131
2       78
16      22
6       18
8       17
12       7
32       6
18       5
22       4
14       4
10       3
36       3
48       2
34       2
1        2
21       1
46       1
20       1
30       1
56       1
17       1
Name: powerLabel, dtype: int64

In [None]:
samplingDict = {0:0}

In [118]:
ROS = imblearn.over_sampling.RandomOverSampler(random_state=42)
X_res, y_res = ROS.fit_resample(HqSplit, floods['powerLabel'])

In [111]:
y_res = y_res.fillna(value=0)
y_res

0         0
1         0
2         0
3         0
4         0
         ..
73519    56
73520    56
73521    56
73522    56
73523    56
Name: powerLabel, Length: 73524, dtype: int64

In [112]:
def unpowerLabel(label):
    value = int(bin(int(label))[2:])
    values = [int(x) for x in str(value)]
    while len(values) != 6:
        values.insert(0,0)
    return values

In [113]:
y_unpowered = []
for i in range(len(y_res)):
    y_unpowered.append(unpowerLabel(y_res[i]))

### Multiple Logistic Regression

In [122]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

multiClf = MultiOutputClassifier(LogisticRegression(max_iter=10000)).fit(X_train,y_train)

In [128]:
multiPredictions = multiClf.predict_proba(X_test)
multiPredictions[0]

array([[2.27269884e-03, 9.97727301e-01],
       [1.00000000e+00, 1.35237192e-10],
       [1.81775802e-03, 9.98182242e-01],
       ...,
       [5.32373874e-05, 9.99946763e-01],
       [9.99942928e-01, 5.70720618e-05],
       [9.99989439e-01, 1.05610782e-05]])

In [131]:
multiScore = multiClf.score(X_test,np.array(y_test))
multiScore

0.9381970513029759

In [116]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_unpowered, random_state=42)
clf = MLPClassifier(random_state=42, max_iter=300).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9956476796692236

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, random_state=42)
clf = MLPClassifier(random_state=42, max_iter=300).fit(X_train, y_train)
clf.score(X_test, y_test)

0.9972797997932648

In [46]:
predictions = clf.predict(X_test)
cm = metrics.multilabel_confusion_matrix(y_test,predictions)
print(cm)

[[[17552     4]
  [   46   779]]

 [[17547     0]
  [    0   834]]

 [[17524    17]
  [    0   840]]

 [[17544    19]
  [    4   814]]

 [[17527     3]
  [    0   851]]

 [[17578     2]
  [    0   801]]

 [[17558     0]
  [    0   823]]

 [[17514     0]
  [    0   867]]

 [[17551     0]
  [    0   830]]

 [[17502     3]
  [    0   876]]

 [[17496     0]
  [    0   885]]

 [[17507     0]
  [    0   874]]

 [[17535     0]
  [    0   846]]

 [[17579     0]
  [    0   802]]

 [[17560     2]
  [    0   819]]

 [[17531     0]
  [    0   850]]

 [[17564     0]
  [    0   817]]

 [[17581     0]
  [    0   800]]

 [[17580     0]
  [    0   801]]

 [[17510     0]
  [    0   871]]

 [[17553     0]
  [    0   828]]

 [[17558     0]
  [    0   823]]]


In [47]:
test = pd.read_csv('HQprecipitation Data Test.csv')

In [49]:
for i in range(31):
    test['time'][i] = test['time'][i][:10]
    
test['time'] = pd.to_datetime(test['time'], format='%Y-%m-%d')
test.head()

Unnamed: 0,time,HqPrecips
0,2021-12-01,"[0.10499999672174454, 0.09999999403953552, 0.0..."
1,2021-12-02,"[2.5199999809265137, 2.4600002765655518, 2.230..."
2,2021-12-03,"[1.9449999332427979, 0.1549999862909317, 0.014..."
3,2021-12-04,"[0.019999999552965164, 0.17499999701976776, 0...."
4,2021-12-05,"[0.029999999329447746, 0.03500000014901161, 0...."


In [50]:
column_list = []
for i in range(225):
    colName = 'HqPrecip_'+str(i)
    column_list.append(colName)
    
column = []
for i in range(31):
    values = (test['HqPrecips'][i])[1:-1].split(', ')
    for j in range(225):
        values[j] = float(values[j])
    column.append(values)

testSplit = pd.DataFrame(np.column_stack(list(zip(*column))), columns=column_list)

In [132]:
testPreds = multiClf.predict(testSplit)
testPreds

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [133]:
floods

Unnamed: 0,Bernam River,Selangor River,Buloh River,Klang River,Langat River,Sepang River,powerLabel
0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...
3647,0,0,0,0,0,0,0
3648,0,0,0,0,0,0,0
3649,0,0,0,0,0,0,0
3650,0,0,0,0,0,0,0
