In [None]:
pip install lightgbm



In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [9]:
dataset='/content/MeanderingInterploated.csv'

df=pd.read_csv(dataset, index_col=0)

df.head()

Unnamed: 0,name,c1_dist,c2_dist,c3_dist,c4_dist,c5_dist,c6_dist,c7_dist,c8_dist
0,1988-1,15.608011,12.747157,5.968249,11.954079,29.138634,33.62157,176.417148,191.867063
1,1988-2,14.828968,12.334799,5.968249,12.549398,29.114776,32.291611,175.064256,192.169605
2,1988-3,14.717676,12.218429,5.968249,12.79453,29.102749,32.046997,174.777029,192.212825
3,1988-4,15.608011,12.445481,5.968249,12.79453,29.1,33.334817,175.923876,191.867063
4,1989-1,15.57663,12.429729,5.967815,12.754464,28.709509,32.936109,175.297191,190.844985


In [None]:
df['year'] = df['name'].apply(lambda x: int(x.split('-')[0]))
df['quarter'] = df['name'].apply(lambda x: int(x.split('-')[1]))

print(df['quarter'].unique())

targets = ['c1_dist', 'c2_dist', 'c3_dist', 'c4_dist', 'c5_dist', 'c6_dist', 'c7_dist', 'c8_dist']
predictions = {}
features = ['year', 'quarter']

X = df[features]
y = df[targets]

# Set parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}


for target in targets:
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

    gbm = lgb.train(
    params={
        "objective": "binary",
        "metric": "auc",
    },
    train_set=lgb_train,
    valid_sets=lgb_test,
    num_boost_round=100,
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
    ]
)



In [None]:
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE for {target}: {rmse:.4f}")

RMSE for c8_dist: 181.9585


In [None]:
print(min(df['c8_dist']), max(df['c8_dist']))

156.5910917 200.6757584


In [12]:
df['year'] = df['name'].apply(lambda x: int(x.split('-')[0]))
df['quarter'] = df['name'].apply(lambda x: int(x.split('-')[1]))

df_encoded = pd.get_dummies(df, columns=['quarter'], drop_first=True)


# Features and targets
features = ['year', 'quarter_2','quarter_3','quarter_4']
X = df_encoded[features]
y = df_encoded[['c1_dist', 'c2_dist', 'c3_dist', 'c4_dist', 'c5_dist', 'c6_dist', 'c7_dist', 'c8_dist']]



Unnamed: 0,c1_dist,c2_dist,c3_dist,c4_dist,c5_dist,c6_dist,c7_dist,c8_dist
0,15.608011,12.747157,5.968249,11.954079,29.138634,33.62157,176.417148,191.867063
1,14.828968,12.334799,5.968249,12.549398,29.114776,32.291611,175.064256,192.169605
2,14.717676,12.218429,5.968249,12.79453,29.102749,32.046997,174.777029,192.212825
3,15.608011,12.445481,5.968249,12.79453,29.1,33.334817,175.923876,191.867063
4,15.57663,12.429729,5.967815,12.754464,28.709509,32.936109,175.297191,190.844985


# Grid Search Best Parameters


In [None]:
from sklearn.multioutput import MultiOutputRegressor


# Define LightGBM model
lgb_model = lgb.LGBMRegressor(objective='regression', num_leaves=20, learning_rate=0.05, n_estimators=100)

# Wrap with MultiOutputRegressor
multi_output_model = MultiOutputRegressor(lgb_model)

# Define parameter grid for LightGBM
param_grid = {
    'estimator__estimator__num_leaves': [31, 50, 100],
    'estimator__estimator__learning_rate': [0.05, 0.1, 0.2],
    'estimator__estimator__n_estimators': [50, 100, 200],
    'estimator__estimator__max_depth': [-1, 5, 10],
    'estimator__estimator__subsample': [0.7, 0.8, 1.0]
}

# Grid Search using GridSearchCV
grid_search = GridSearchCV(
    estimator=multi_output_model,
    param_grid=param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1
)

# Fit the grid search
grid_search.fit(X_train, y_train)



In [None]:
# Get best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best RMSE score: ", -grid_search.best_score_)


Best parameters found:  {'estimator__estimator__learning_rate': 0.05, 'estimator__estimator__max_depth': -1, 'estimator__estimator__n_estimators': 50, 'estimator__estimator__num_leaves': 31, 'estimator__estimator__subsample': 0.7}
Best RMSE score:  3.980270444252957


# Train multi-output regressor

In [25]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

lgb_model = lgb.LGBMRegressor(objective='regression', num_leaves=31, learning_rate=0.05, n_estimators=50)

multi_output_model = MultiOutputRegressor(lgb_model)



# Train the model
multi_output_model.fit(X_train, y_train)

In [31]:
# Predict on the test set
y_pred = multi_output_model.predict(X_test)

In [32]:
# Evaluate performance
range=[]
rmse = np.sqrt(mean_squared_error(y_test, y_pred, multioutput='raw_values'))  # RMSE for each target
print("RMSE for each control point:", rmse)


for i in df.columns[1:-2]:
    col_min = min(df[i])  # Use a custom variable name instead of `min`
    col_max = max(df[i])  # Use a custom variable name instead of `max`
    range.append([col_min, col_max])  # Append the range as a list


print(range)


RMSE for each control point: [ 0.77476678  0.54869963  1.46528855  1.1413273   3.98874929  6.42370828
 10.83504983  9.60168629]
[[2.0, 15.60801076], [4.0, 12.74715655], [0.0, 14.86001346], [0.4242640687, 12.79453008], [6.129437168, 29.13863415], [5.021951812, 36.76424894], [136.3063095, 176.9587805], [156.5910917, 200.6757584]]


In [33]:
for i in rmse:
  index_r=int(np.where(rmse==i)[0])
  err_percentage=i/(range[index_r][1]-range[index_r][0])*100
  print(index_r, '->', err_percentage)

0 -> 5.693460987576798
1 -> 6.2728914270897285
2 -> 9.860613886273235
3 -> 9.226376406255024
4 -> 17.33545629489136
5 -> 20.237061791813183
6 -> 26.652868972821576
7 -> 21.780104069957677


  index_r=int(np.where(rmse==i)[0])


In [34]:
# Calculate baseline predictions (mean of each target column in training data)
baseline_predictions = np.tile(y_train.mean(axis=0), (len(y_test), 1))

# Compute RMSE for each target column
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_predictions, multioutput='raw_values'))

# Display the RMSE for each control point
for i, rmse in enumerate(baseline_rmse):
    print(f"Baseline RMSE for Control Point {i+1}: {rmse:.4f}")


Baseline RMSE for Control Point 1: 2.4783
Baseline RMSE for Control Point 2: 1.2438
Baseline RMSE for Control Point 3: 2.4583
Baseline RMSE for Control Point 4: 3.1945
Baseline RMSE for Control Point 5: 6.6854
Baseline RMSE for Control Point 6: 9.0417
Baseline RMSE for Control Point 7: 14.0182
Baseline RMSE for Control Point 8: 13.3004


In [36]:
for i in baseline_rmse:
  index_r=int(np.where(baseline_rmse==i)[0])
  err_percentage=i/(range[index_r][1]-range[index_r][0])*100
  print(index_r, '->', err_percentage)

0 -> 18.211982635931996
1 -> 14.219321444896785
2 -> 16.54298659629663
3 -> 25.82372338024389
4 -> 29.05548496155555
5 -> 28.4848501768377
6 -> 34.48291204207571
7 -> 30.170226233405806


  index_r=int(np.where(baseline_rmse==i)[0])


In [None]:
y_train_pred = multi_output_model.predict(X_train)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred, multioutput='raw_values'))

print("Training RMSE for each control point:", rmse_train)

Training RMSE for each control point: [1.00948559 0.60233753 1.77910414 1.13546575 3.65374313 3.81265782
 7.5489073  7.16329868]


In [None]:
for i in rmse_train:
  index_r=int(np.where(rmse_train==i)[0])
  err_percentage=i/(range[index_r][1]-range[index_r][0])*100
  print(index_r, '->', err_percentage)

0 -> 7.418318584421618
1 -> 6.886095214812425
2 -> 11.972426154262203
3 -> 9.178992156635571
4 -> 15.879489991168239
5 -> 12.011285143030753
6 -> 18.569368878312194
7 -> 16.248957328234166


  index_r=int(np.where(rmse_train==i)[0])


In [None]:
# Predict for the next quarter
next_quarter = pd.DataFrame({'year': [2025], 'quarter': [2]})
next_prediction = multi_output_model.predict(next_quarter)
print("Predicted values for the next quarter:", next_prediction)

Predicted values for the next quarter: [[  6.41657413   7.32990802   5.82032426   2.85663828  26.73549158
   32.69467098 172.16053441 188.38549414]]
