# Predictive Modelling Project 2 - River flow prediction
## Hayoung Kim, 01603259

### 1. Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from collections import Counter
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve
sns.set(style='white', context='notebook', palette='deep')
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
# Ignore warnings
import warnings    # To suppress warnings
warnings.filterwarnings("ignore")
from sklearn import metrics
from sklearn.model_selection import train_test_split

### 2. Load data

#### 2.1 Load train and Test set

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
#IDtest = test["id"]

### 3. Filling missing values in train dataset

In [3]:
traincol_list = train.columns.tolist()

See which rows having any missing value

In [4]:
NaN_list = list(train[train.isnull().any(axis=1)].index)
print(NaN_list)


[17, 93, 141, 276, 349, 371, 396, 422, 532, 599, 621, 761, 798, 847, 863, 996, 1077, 1103, 1143, 1275, 1336, 1473, 1761, 1812, 1925, 1939, 1991, 2155, 2239, 2501, 2566, 2857, 2902, 3010, 3054, 3122, 3135, 3150, 3260, 3265, 3317, 3355, 3425, 3655, 3746, 3978, 4065, 4235, 4343, 4402, 4489, 4517, 4572, 4607, 4659, 4668, 4676, 4724, 4886, 4898, 4971, 5005, 5063, 5102, 5177, 5217, 5234, 5349, 5357, 5366, 5392, 5410, 5437, 5512, 5523, 5572, 5619, 5669, 5713, 5841, 5852, 6077, 6136, 6143, 6182, 6252, 6261, 6493, 6499, 6512]


In [5]:
train.loc[5852][-9]

nan

Before filling missing values, the value was 'nan'.

##### Filling missing values

In [6]:
for NaNcol in traincol_list:
    train[NaNcol].fillna(value=train[NaNcol].mean(), inplace=True)

In [7]:
print(train.loc[5852][-9])


46.93219309936325


After filling missing values, the value is not 'nan' anymore. The missing value is replaced by the mean of the values in the same column.

### 4. Split the dataset

In [8]:
def data_split(X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [9]:
X = train.iloc[:,:-8]
Y = train.iloc[:,-8:]
Xtrain, Xtest, ytrain, ytest = data_split(X, Y)

### 5. Fit the train dataset to the regressor

#### 5.1 Choose the regressor

By a number of experiments, I found that 'ExtraTreesRegressor'(Multivariate Regressor) has the lowest RMSE value.

In [10]:

estimator = ExtraTreesRegressor(random_state=2, n_estimators=94, max_depth = 4000)
#estimator = RandomForestRegressor(random_state = random_state)
#estimator = KNeighborsRegressor()


#### 5.2 Parameter tuning for the selected regressor

I used 'RandomizedSearchCV' for parameter tuning.

The only difference between 'RandomizedSearchCV' and 'GridSearchCV' is in grid search we define the combinations and do training of the model whereas in RandomizedSearchCV the model selects the combinations randomly. Both are very effective ways of tuning the parameters that increase the model generalizability.

In [11]:
# Number of trees in the extratrees regressor
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)


{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [12]:
from sklearn.model_selection import RandomizedSearchCV
ext_random = RandomizedSearchCV(estimator = estimator, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=2, n_jobs = -1)


##### After checking the RMSE of the test dataset by split is low enough, I did training on the whole dataset.

In [13]:
# Fit the random search model
ext_random.fit(X,Y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 12.3min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0,
                                                 criterion='mse',
                                                 max_depth=4000,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=94, n_jobs=None,
                                                 oob_score=False,...
                   iid=

##### This is for the calculation of the RMSE when I use train and test dataset by split (80:20) for fitting the model.

In [14]:
'''from sklearn.metrics import mean_squared_error

y_pred = ext_random.predict(Xtest)

print(mean_squared_error(ytest, y_pred, squared=False))'''

'from sklearn.metrics import mean_squared_error\n\ny_pred = ext_random.predict(Xtest)\n\nprint(mean_squared_error(ytest, y_pred, squared=False))'

### 6. Load test dataset

In [15]:
test_df = pd.read_csv("test.csv")
test_df

Unnamed: 0,id,CHSI2__0,NASI2__0,EADM7__0,SCLM7__0,CLKM7__0,VALI2__0,NAPM7__0,DLDI4__0,CHSI2__-6,...,NAPM7__-48,DLDI4__-48,CHSI2__-60,NASI2__-60,EADM7__-60,SCLM7__-60,CLKM7__-60,VALI2__-60,NAPM7__-60,DLDI4__-60
0,0,122.0,3.45,112.0,54.9,40.6,16.00,46.2,32.3,125.0,...,46.2,29.8,133.0,3.66,121.0,57.0,48.0,17.10,46.2,29.3
1,1,151.0,3.36,162.0,61.7,86.6,22.20,46.2,51.9,150.0,...,46.2,55.8,134.0,3.27,139.0,62.2,59.7,15.10,46.2,51.3
2,2,116.0,3.41,113.0,54.9,42.0,16.20,44.5,33.9,116.0,...,46.2,30.7,128.0,3.62,121.0,56.2,36.4,16.20,46.2,30.1
3,3,107.0,3.27,105.0,53.7,31.8,11.00,40.3,35.4,107.0,...,43.7,34.1,113.0,3.29,110.0,55.8,42.2,12.50,43.7,34.3
4,4,78.5,3.28,85.5,44.9,28.5,6.10,42.2,25.4,78.7,...,42.2,39.0,77.7,3.27,85.5,44.5,28.8,3.38,42.2,37.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2277,2277,168.0,3.90,153.0,69.6,51.5,25.20,46.4,27.5,172.0,...,47.6,28.0,189.0,3.64,168.0,73.8,56.6,27.40,56.5,28.4
2278,2278,201.0,3.24,190.0,66.5,76.1,11.00,62.6,47.2,204.0,...,62.6,53.2,239.0,3.31,222.0,71.0,122.0,24.10,62.6,62.1
2279,2279,132.0,3.29,129.0,70.8,41.8,7.51,66.9,34.8,132.0,...,68.7,34.7,129.0,3.27,134.0,74.4,44.1,6.04,69.0,33.6
2280,2280,136.0,3.60,126.0,57.3,50.3,16.90,46.2,29.0,134.0,...,46.4,29.4,145.0,3.63,130.0,55.2,48.9,15.50,46.4,32.0


### 7. Filling missing values of the test dataset

In [16]:
NaN_list = list(test_df[test_df.isnull().any(axis=1)].index)
NaN_list


[152,
 484,
 511,
 513,
 549,
 613,
 688,
 691,
 774,
 809,
 811,
 862,
 1051,
 1056,
 1059,
 1131,
 1195,
 1291,
 1369,
 1410,
 1430,
 1454,
 1472,
 1660,
 1678,
 1738,
 1763,
 1816,
 2033,
 2214]

In [17]:
test_df.loc[152]

id            152.0
CHSI2__0      134.0
NASI2__0        3.5
EADM7__0      126.0
SCLM7__0       56.5
              ...  
SCLM7__-60      NaN
CLKM7__-60      NaN
VALI2__-60      NaN
NAPM7__-60      NaN
DLDI4__-60      NaN
Name: 152, Length: 65, dtype: float64

Before filling missing values, the value was 'nan'.

##### Filling missing values

In [18]:
NaNresult_list = test_df.columns.tolist()
for NaNcol in NaNresult_list:
    test_df[NaNcol].fillna(value=test_df[NaNcol].mean(), inplace=True)

In [19]:
test_df.loc[152]

id            152.000000
CHSI2__0      134.000000
NASI2__0        3.500000
EADM7__0      126.000000
SCLM7__0       56.500000
                 ...    
SCLM7__-60     79.583259
CLKM7__-60     67.046936
VALI2__-60     12.474809
NAPM7__-60     61.595826
DLDI4__-60     46.763455
Name: 152, Length: 65, dtype: float64

After filling missing values, the value is not 'nan' anymore. The missing value is replaced by the mean of the values in the same column.

### 8. Predict the output values of the test dataset

In [20]:
#result_Y = estimator.predict(PCAresult_standard)
result_Y = ext_random.predict(test_df.iloc[:,1:])

In [21]:
traincol_list[-8:]

['CHSI2_48H__0',
 'NASI2_48H__0',
 'EADM7_48H__0',
 'SCLM7_48H__0',
 'CLKM7_48H__0',
 'VALI2_48H__0',
 'NAPM7_48H__0',
 'DLDI4_48H__0']

In [22]:
result_df = pd.DataFrame(result_Y, columns = traincol_list[-8:])

result_df.index.name = 'id'
result_df

Unnamed: 0_level_0,CHSI2_48H__0,NASI2_48H__0,EADM7_48H__0,SCLM7_48H__0,CLKM7_48H__0,VALI2_48H__0,NAPM7_48H__0,DLDI4_48H__0
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,115.008333,3.367900,116.291667,54.570667,45.608333,15.143333,44.403000,34.640333
1,172.533333,3.326333,163.441667,61.201833,83.732333,15.490000,51.732333,34.980000
2,117.978333,3.377817,118.000000,53.300500,47.027917,12.509750,43.614667,34.826333
3,108.063889,3.461270,104.159907,50.802032,38.137333,13.249875,40.615833,34.584593
4,78.474452,3.319521,85.502000,44.512521,33.273190,5.578272,42.203500,23.371704
...,...,...,...,...,...,...,...,...
2277,154.624167,3.783700,142.420333,71.646783,49.207667,19.120500,46.394500,33.218367
2278,151.960000,3.286517,140.163333,66.210333,76.030500,12.948667,62.560333,63.100500
2279,122.405000,3.295967,124.108333,69.209167,41.516667,6.308050,65.743000,33.728667
2280,127.205000,3.582898,123.881667,55.935643,40.594774,16.427357,46.182333,30.379340


In [23]:
result_df.to_csv('61_river.csv') 