In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
# Mount Google Drive at /content/datasets; the CSV paths read in later cells
# all start with this mount point. This cell only works inside Google Colab.
from google.colab import drive
drive.mount('/content/datasets')

Mounted at /content/datasets


In [None]:
# Load the GCM predictor table and the IMD rainfall table for grid point 1.
gcm_csv_path = '/content/datasets/MyDrive/Datasets/GCM Data (Daily)/CSV Files/Point_1.csv'
rain_csv_path = '/content/datasets/MyDrive/Datasets/IMD Rainfall 0.25x0.25/NC Files/csv files/rain_Point_1.csv'

# Features: every column except the first one of the GCM file.
X = pd.read_csv(gcm_csv_path).iloc[:, 1:]

# Target: last column of the rainfall file, limited to the first 23034 rows.
# NOTE(review): 23034 is presumably the number of rows in X; confirm.
y = pd.read_csv(rain_csv_path).iloc[:23034, -1]


In [None]:
# Replace missing rainfall values with the mean of the observed values.
rain_mean = y.mean()
y = y.fillna(rain_mean)

In [None]:
# Sanity check: True would mean the target still contains NaN after imputation.
y.isna().any()

False

In [None]:
# Z-score every feature column: subtract the column mean, divide by the column std.
col_mean = X.mean(axis=0)
col_std = X.std(axis=0)
X = X.sub(col_mean, axis=1).div(col_std, axis=1)

In [None]:
# Attach the target column to the feature frame (index-aligned left join).
df = X.join(y, how="left")


In [None]:
# Preview the first rows of the joined feature + target frame.
df.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
train_features, test_features, train_labels, test_labels = split_parts

In [None]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Wrap the train / test splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=train_features, label=train_labels)
dtest = xgb.DMatrix(data=test_features, label=test_labels)
# DMatrix over the complete (unsplit) data set.
data_dmatrix = xgb.DMatrix(data=X, label=y)

In [None]:
# Train and test DMatrix objects collected in one list.
# NOTE(review): no later cell visible in this notebook reads eval_set; confirm it is still needed.
eval_set = [dtrain, dtest]

In [None]:
# Mean of the training-split target values.
mean_train = train_labels.mean()

In [None]:
# Starting hyper-parameters for the booster; later cells update these in place.
params = dict(
    objective="reg:squarederror",
    min_child_weight=1,
    colsample_bytree=1,
    subsample=1,
    max_depth=6,
    alpha=10,
)

In [None]:
# Upper bound on boosting rounds; the training calls below stop early before reaching it.
num_boost_round = 999
# Evaluate with RMSE.
params.update(eval_metric="rmse")

In [None]:
# Baseline fit with the starting params. Training stops once the "Test" RMSE
# has not improved for 10 consecutive rounds.
watchlist = [(dtest, "Test")]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=10,
)

In [None]:
# best_iteration is 0-based, so add 1 to report a round count.
best_rounds = model.best_iteration + 1
print(f"Best RMSE: {model.best_score:.2f} with {best_rounds} rounds")

Best RMSE: 12.04 with 4 rounds


In [None]:
# 3-fold cross-validation of the starting params over the complete data set.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=num_boost_round,
    nfold=3,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,
    seed=123,
)

In [None]:
# Show the per-round CV history (train/test RMSE mean and std for each row).
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,12.010266,0.171391,12.182324,0.426773
1,11.796825,0.155553,12.111937,0.41504
2,11.583758,0.150859,12.093965,0.4167
3,11.440081,0.125882,12.091686,0.419668


In [None]:
# Lowest mean test RMSE reached across the CV rounds.
test_rmse_per_round = cv_results["test-rmse-mean"]
test_rmse_per_round.min()

12.091685666666669

TUNING MAX_DEPTH AND MIN_CHILD_WEIGHT

In [None]:
# Candidate (max_depth, min_child_weight) pairs: depths 3..6 crossed with weights 5..7.
gridsearch_params = []
for depth in range(3, 7):
    for weight in range(5, 8):
        gridsearch_params.append((depth, weight))

In [None]:
# Grid-search max_depth / min_child_weight with 5-fold CV on the training split
# and keep the pair with the lowest mean test RMSE.
min_rmse = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters (this mutates the shared `params` dict in place)
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best RMSE
    mean_rmse = cv_results['test-rmse-mean'].min()
    # argmin() is a 0-based row position and row i holds the score after i + 1
    # boosting rounds, so add 1 to report a round count (the same convention as
    # `model.best_iteration + 1` used when printing the baseline fit).
    boost_rounds = cv_results['test-rmse-mean'].argmin() + 1
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

In [None]:
# Fix the tree-shape parameters for the remaining tuning steps.
# NOTE(review): presumably the winners of the grid search above; confirm against its output.
params.update(max_depth=3, min_child_weight=7)

TUNING SUBSAMPLE AND COLSAMPLE

In [None]:
# Candidate (subsample, colsample) pairs: every combination of 0.4, 0.5, ..., 1.0.
sampling_fractions = [step / 10. for step in range(4, 11)]
gridsearch_params = []
for subsample_frac in sampling_fractions:
    for colsample_frac in sampling_fractions:
        gridsearch_params.append((subsample_frac, colsample_frac))

In [None]:
# Grid-search subsample / colsample_bytree with 5-fold CV on the training split
# and keep the pair with the lowest mean test RMSE.
min_rmse = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters (this mutates the shared `params` dict in place)
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'rmse'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    # argmin() is a 0-based row position and row i holds the score after i + 1
    # boosting rounds, so add 1 to report a round count.
    boost_rounds = cv_results['test-rmse-mean'].argmin() + 1
    print("\tRMSE {} for {} rounds".format(mean_rmse, boost_rounds))
    # Strict '<' means that on a tie the pair tried first (the larger values) is kept.
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = (subsample,colsample)
print("Best params: {}, {}, RMSE: {}".format(best_params[0], best_params[1], min_rmse))

In [None]:
# Fix the row / column sampling fractions for the remaining steps.
# NOTE(review): presumably the winners of the grid search above; confirm against its output.
params.update(subsample=.9, colsample_bytree=1)

TUNING ETA

In [None]:
min_rmse = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time cv_results = xgb.cv(params, dtrain, num_boost_round=num_boost_round, seed=42, nfold=5, metrics=['rmse'], early_stopping_rounds=10)
    # Update best score
    mean_rmse = cv_results['test-rmse-mean'].min()
    boost_rounds = cv_results['test-rmse-mean'].argmin()
    print("\tRMSE {} for {} rounds\n".format(mean_rmse, boost_rounds))
    if mean_rmse < min_rmse:
        min_rmse = mean_rmse
        best_params = eta
print("Best params: {}, RMSE: {}".format(best_params, min_rmse))

In [None]:
# Fix the learning rate for the final model.
# NOTE(review): presumably chosen from the eta search above; confirm against its output.
params.update(eta=.1)

In [None]:
# Empty container passed to xgb.train as `evals_result` in the next cell.
evals_result = dict()

In [None]:
# Fit with the tuned params. Training stops once the "Test" RMSE has not improved
# for 10 consecutive rounds; the evaluation history is passed back via `evals_result`.
watchlist = [(dtest, "Test")]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=10,
    evals_result=evals_result,
)

In [None]:
# Retrain for exactly the best number of rounds found by the early-stopped fit.
# This overwrites num_boost_round, so re-running earlier cells afterwards would
# use the reduced value instead of the original one.
num_boost_round = 1 + model.best_iteration
watchlist = [(dtest, "Test")]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
)

In [None]:
# Predict on the held-out test DMatrix; the bare name on the last line displays the result.
prediction = model.predict(dtest)
prediction

array([3.1647727, 2.6073577, 4.2016892, ..., 2.5990138, 3.3450005,
       3.2225397], dtype=float32)

FEATURE IMPORTANCE

In [None]:
# Plot the feature importances of the final booster.
xgb.plot_importance(model)

EXPLAINED VARIANCE SCORE


In [None]:
# Explained-variance score of the test-set predictions.
evs = explained_variance_score(y_true=test_labels, y_pred=prediction)

In [None]:
# Display the explained-variance score computed in the previous cell.
evs

0.00021138717412705343

MAX ERROR

In [None]:
# Largest absolute residual on the test set.
maxer = max_error(y_true=test_labels, y_pred=prediction)

In [None]:
# Display the max error computed in the previous cell.
maxer

272.0488246258545

MEAN ABSOLUTE ERROR

In [None]:
# Mean absolute error of the test-set predictions.
mas = mean_absolute_error(y_true=test_labels, y_pred=prediction)

In [None]:
# Display the mean absolute error computed in the previous cell.
mas

5.4592446567638975

**FUTURE PREDICTIONS**

In [None]:
# Load the future-scenario GCM predictors for grid point 1.
future_csv_path = '/content/datasets/MyDrive/Datasets/GCM Future /CSV Files/future_Point_1.csv'
X_future = pd.read_csv(future_csv_path)

In [None]:
# Display the historical feature frame (this renders the whole frame, not a preview).
# NOTE(review): an earlier cell z-scores X, yet the saved output below shows raw
# magnitudes; it was presumably captured from an out-of-order run. Confirm with
# Restart & Run All.
X

Unnamed: 0,CLT,HURS,HUSS,PSL,TAS,UAS,VAS,ZG500
0,13.604736,59.945312,0.011603,101535.250,298.00415,-4.616211,0.129883,5847.5273
1,18.096924,64.500000,0.012440,101489.125,297.94043,-4.403320,0.273438,5850.1680
2,8.856201,65.101560,0.012771,101461.125,298.15356,-4.736328,0.154297,5852.1290
3,63.604736,66.312500,0.012901,101307.875,298.04420,-4.218750,0.290039,5845.1406
4,77.667240,62.742188,0.012550,101178.560,298.38428,-3.522461,1.387695,5834.0117
...,...,...,...,...,...,...,...,...
23029,9.442139,74.187500,0.015402,101300.310,299.02880,-1.420898,0.916992,5872.6250
23030,0.067139,59.632812,0.011755,101529.690,298.45435,-3.545898,0.376953,5892.5350
23031,6.964111,65.023440,0.012337,101475.500,297.90480,-1.319336,-0.192383,5894.3240
23032,1.824951,65.562500,0.012844,101453.190,298.22266,-3.862305,0.035156,5888.2850


In [None]:
# y_future is only another name for the historical target y; it holds no future values.
y_future = y
# Drop the first column of the future frame, as was done for the historical features.
# Re-running this cell drops one more column each time.
X_future = X_future.iloc[:, slice(1, None)]

In [None]:
# Display the future feature frame after its first column was dropped.
X_future

Unnamed: 0,CLT,HURS,HUSS,PSL,TAS,UAS,VAS,ZG500
0,63.281250,72.312500,0.014381,101474.125,298.40967,-5.477539,-0.747070,5870.3790
1,30.206299,72.226560,0.014678,101578.560,298.78394,-6.118164,-0.960938,5871.2970
2,13.476562,72.593750,0.014564,101576.440,298.52760,-6.063477,-0.803711,5857.3477
3,14.520264,65.054690,0.012719,101510.375,298.24730,-5.908203,-0.448242,5841.1094
4,65.948490,64.984375,0.012638,101471.810,298.11792,-4.831055,-0.698242,5841.5350
...,...,...,...,...,...,...,...,...
12595,3.320312,63.328125,0.014457,102023.560,300.80690,-6.871094,-1.706055,5902.1290
12596,27.996826,68.351560,0.014900,101860.000,299.97998,-4.899414,-1.692383,5888.6797
12597,44.464110,71.945310,0.015841,101767.125,300.07837,-5.620117,-1.271484,5882.5350
12598,25.262451,64.437500,0.014330,101779.250,300.26587,-5.959961,-0.777344,5889.1406


In [None]:
# Display y_future, which an earlier cell assigned directly from the historical target y.
y_future

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
23029    0.0
23030    0.0
23031    0.0
23032    0.0
23033    0.0
Name: RAINFALL, Length: 23034, dtype: float64

In [None]:
# The model is fit on z-scored features: an earlier cell replaces X with
# (X - mean) / std before the train/test split. The future features must go
# through the same transform, using the HISTORICAL column statistics, otherwise
# the trees' split thresholds are compared against values on a different scale.
# X was overwritten by its standardised version, so its raw mean/std are no
# longer available; re-read the historical CSV to recover them.
hist_features_raw = pd.read_csv(
    '/content/datasets/MyDrive/Datasets/GCM Data (Daily)/CSV Files/Point_1.csv'
).iloc[:, 1:]
hist_mean = hist_features_raw.mean(axis=0)
hist_std = hist_features_raw.std(axis=0)

# Column-aligned arithmetic; X_future itself is left unmodified so the cell is re-runnable.
X_future_scaled = (X_future - hist_mean) / hist_std
dfut = xgb.DMatrix(X_future_scaled)

In [None]:
# Predictions of the final booster for the future-scenario features.
y_fut = model.predict(data=dfut)

In [None]:
# Display the predictions computed for the future features.
y_fut