In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import joblib
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, r2_score


from sklearn.decomposition import PCA

In [2]:
class TrainingModels():
  def __init__(self, X_train, X_val, y_train, y_val):
    self.X_train = X_train
    self.X_val = X_val
    self.y_train = y_train
    self.y_val = y_val

  def train_and_evaluate(self, algorithm, return_info=False):
    model = MultiOutputRegressor(algorithm)
    model.fit(self.X_train, self.y_train)
    train_score =  model.score(self.X_train, self.y_train)
    val_score =  model.score(self.X_val, self.y_val)

    y_pred = model.predict(self.X_val)
    mse = mean_squared_error(self.y_val, y_pred, multioutput='raw_values')

    print("Training Score:", train_score)
    print("Validation Score:", val_score)
    print('MSE : ', mse)

    if return_info:
      return model, train_score, val_score, mse


  def train_mutli_models(self, list_of_algorithms, return_df = False):
    if return_df:
      models = [str(algorithm) for algorithm in list_of_algorithms]
      train_scores = []
      val_scores = []
      mse_scores = []

      for algorithm in tqdm(list_of_algorithms):
        model, train_score, val_score, mse = self.train_and_evaluate(algorithm, return_info=True)
        train_scores.append(train_score)
        val_scores.append(val_score)
        # mse_scores.append(mse)

      return pd.DataFrame({'Models':models,
                           'Training Score':train_scores,
                           'Validation Score': val_scores,
                          #  'MSE': mse_scores
                           })

    else:
      for algorithm in tqdm(list_of_algorithms):
        self.train_and_evaluate(self, algorithm)


  def apply_grid_search(self, hyper_parameters, algorithm):
    scoring = {
        'MSE': make_scorer(mean_squared_error),
        'R2': make_scorer(r2_score)
    }

    grid_search = GridSearchCV(estimator=MultiOutputRegressor(algorithm), param_grid=hyper_parameters, cv=5, scoring=scoring, refit='MSE')
    grid_search.fit(self.X_train, self.y_train)

    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Evaluate on test data
    y_pred = best_estimator.predict(self.X_val)
    mse = mean_squared_error(self.y_val,y_pred)
    r2 = r2_score(self.y_val, y_pred)

    print("Best Parameters:", best_params)
    print("MSE on validation:", mse)
    print("R2 Score on validation:", r2)

    cv_results = grid_search.cv_results_

    mse_scores = cv_results['mean_test_MSE']
    r2_scores = cv_results['mean_test_R2']

    print("Cross-Validation Results in different compination:")
    for fold, (mse, r2) in enumerate(zip(mse_scores, r2_scores)):
        print(f"Fold {fold+1}: MSE={mse:.4f}, R2={r2:.4f}")

    return best_estimator

In [3]:
# Load the saved array
x_data = np.load('x_data.npy')
y_data = np.load('y_data.npy')
label_data = np.load('label_data.npy')

print(x_data.shape, y_data.shape, label_data.shape)

(1853, 468) (1853, 468) (1853, 3)


In [4]:
y_data[:5]

array([[317, 291, 295, ..., 227, 214, 210],
       [298, 277, 283, ..., 237, 229, 227],
       [309, 290, 293, ..., 228, 217, 214],
       [269, 242, 254, ..., 222, 228, 227],
       [311, 292, 293, ..., 242, 254, 253]])

In [5]:
X = np.concatenate((x_data, y_data), axis=1)
y = label_data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [6]:
print(X_train.shape, X_val.shape, X_test.shape)

(1185, 936) (297, 936) (371, 936)


### Base Model

In [14]:
train_m = TrainingModels( X_train, X_val, y_train, y_val)
train_m.train_and_evaluate(LinearRegression())

Training Score: 0.9077777232260033
Validation Score: -6.325686159867935
MSE :  [1.16168383 0.19752149 0.89850313]


In [15]:
%%capture
models = [KNeighborsRegressor(n_neighbors=i) for i in range(3,11)]
performance_df = train_m.train_mutli_models(models, return_df=True)

In [16]:
performance_df

Unnamed: 0,Models,Training Score,Validation Score
0,KNeighborsRegressor(n_neighbors=3),0.639212,0.704677
1,KNeighborsRegressor(n_neighbors=4),0.589295,0.738832
2,KNeighborsRegressor(),0.5734,0.752174
3,KNeighborsRegressor(n_neighbors=6),0.519238,0.748552
4,KNeighborsRegressor(n_neighbors=7),0.50815,0.751594
5,KNeighborsRegressor(n_neighbors=8),0.477584,0.756345
6,KNeighborsRegressor(n_neighbors=9),0.471718,0.76343
7,KNeighborsRegressor(n_neighbors=10),0.464852,0.758823


When `n_neighbors=3`, it appears to be the optimal choice based on other values. However, the results are still not satisfactory, with a training score of 0.63 and a validation score of 0.70.

In [17]:
# with grid search and cross validation
hyperpara = {'n_neighbors':range(3,11)}
train_m.apply_grid_search(hyperpara, KNeighborsRegressor())

Best Parameters: {'n_neighbors': 7}
MSE on validation: 0.027850003484495386
R2 Score on validation: 0.7515938798595102
Cross-Validation Results in different compination:
Fold 1: MSE=0.3098, R2=0.6143
Fold 2: MSE=0.3057, R2=0.6343
Fold 3: MSE=0.3270, R2=0.4269
Fold 4: MSE=0.3224, R2=0.4962
Fold 5: MSE=0.3301, R2=0.4395
Fold 6: MSE=0.3288, R2=0.4598
Fold 7: MSE=0.3258, R2=0.4879
Fold 8: MSE=0.3231, R2=0.5175


In [18]:
decision_tree = DecisionTreeRegressor()

param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

train_m.apply_grid_search(param_grid, decision_tree)

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
MSE on validation: 0.6353668269952155
R2 Score on validation: -4.761974990375845
Cross-Validation Results in different compination:
Fold 1: MSE=0.3387, R2=0.2027
Fold 2: MSE=0.5210, R2=-1.5852
Fold 3: MSE=0.3926, R2=-0.3103
Fold 4: MSE=0.4006, R2=-0.3137
Fold 5: MSE=0.4644, R2=-0.9329
Fold 6: MSE=0.3284, R2=0.3903
Fold 7: MSE=0.3898, R2=-0.1424
Fold 8: MSE=0.3826, R2=0.0412
Fold 9: MSE=0.4070, R2=-0.3032
Fold 10: MSE=0.3331, R2=0.2953
Fold 11: MSE=0.3283, R2=0.3436
Fold 12: MSE=0.3283, R2=0.3435
Fold 13: MSE=0.3978, R2=-0.3197
Fold 14: MSE=0.3345, R2=0.3413
Fold 15: MSE=0.4267, R2=-0.3635
Fold 16: MSE=0.3827, R2=-0.0654
Fold 17: MSE=0.4175, R2=-0.3654
Fold 18: MSE=0.3818, R2=-0.0360
Fold 19: MSE=0.6194, R2=-2.7178
Fold 20: MSE=0.4595, R2=-0.9675
Fold 21: MSE=0.3204, R2=0.3959
Fold 22: MSE=0.3328, R2=0.3291
Fold 23: MSE=0.3293, R2=0.3635
Fold 24: MSE=0.3269, R2=0.3976
Fold 25: MSE=0.3918, R2=-0.1643
Fold 2

In [19]:
%%capture
list_of_algorithms = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor(),
                      RandomForestRegressor(n_estimators=200), GradientBoostingRegressor(),
                      SVR(C=25),XGBRegressor()
]
performance_df = train_m.train_mutli_models(list_of_algorithms, True)

In [20]:
performance_df

Unnamed: 0,Models,Training Score,Validation Score
0,LinearRegression(),0.907778,-6.325686
1,KNeighborsRegressor(),0.5734,0.752174
2,DecisionTreeRegressor(),1.0,-5.519466
3,RandomForestRegressor(n_estimators=200),0.886155,-0.106437
4,GradientBoostingRegressor(),0.990365,-2.158079
5,SVR(C=25),0.828308,0.812387
6,"XGBRegressor(base_score=None, booster=None, ca...",0.999998,-4.87038


> Since SVR exhibited the best performance among the previous models, let's proceed with fine-tuning its parameters.

In [None]:
# SVR(C=25)
joblib.dump(svm_model, 'svm_model.pkl')

['svm_model.pkl']

So far, I've experimented with different models on the data without any preprocessing, and the best-performing one is SVM. The next steps will involve:
1. Fine-tuning the SVM model to optimize its performance.
2. Applying preprocessing techniques on the data, such as scaling, PCA, etc., to further enhance model performance.

### Fine-Tuning

In [47]:
train_m.y_train.shape

(1185, 3)

In [57]:
# svr_m = SVR()

# param_grid = {
#     'estimator__kernel': ['linear', 'rbf', 'poly'],
#     'estimator__C': [0.1, 1, 10, 20, 25, 30],
#     'estimator__gamma': ['scale', 'auto', 0.1, 1],
#     'estimator__degree': [2, 3, 4]
# }
# train_m.apply_grid_search(param_grid, svr_m)

### Apply Min-Max Scaler

In [7]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [8]:
X_train_scaled

array([[0.36754967, 0.35759494, 0.36963696, ..., 0.66666667, 0.59247649,
        0.57492355],
       [0.40397351, 0.46518987, 0.4290429 , ..., 0.67353952, 0.71473354,
        0.70642202],
       [0.38410596, 0.37025316, 0.37293729, ..., 0.69415808, 0.59874608,
        0.58715596],
       ...,
       [0.39072848, 0.42088608, 0.3960396 , ..., 0.68728522, 0.62068966,
        0.6116208 ],
       [0.3410596 , 0.39556962, 0.36963696, ..., 0.68728522, 0.68965517,
        0.67889908],
       [0.32781457, 0.40189873, 0.36963696, ..., 0.74226804, 0.76802508,
        0.76146789]])

## Apply PCA

In [9]:
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)

In [10]:
explained_variance_ratios = pca.explained_variance_ratio_

print('Variance for each component : ',explained_variance_ratios)
print("Total Variance : ", sum(explained_variance_ratios))

Variance for each component :  [0.28293078 0.2212019  0.19776256 0.14543347 0.09459137 0.02746136]
Total Variance :  0.9693814450244358


## Apply Models

In [12]:
train_m = TrainingModels(X_train_pca, X_val_pca, y_train, y_val)
train_m.train_and_evaluate(LinearRegression())

Training Score: 0.4126955309608391
Validation Score: 0.6060632118532532
MSE :  [0.04892548 0.02708865 0.0598857 ]


In [63]:
%%capture
models = [KNeighborsRegressor(n_neighbors=i) for i in range(3,11)]
performance_df = train_m.train_mutli_models(models, return_df=True)

In [64]:
performance_df

Unnamed: 0,Models,Training Score,Validation Score
0,KNeighborsRegressor(n_neighbors=3),0.63587,0.670284
1,KNeighborsRegressor(n_neighbors=4),0.586091,0.703971
2,KNeighborsRegressor(),0.56923,0.726559
3,KNeighborsRegressor(n_neighbors=6),0.513618,0.731101
4,KNeighborsRegressor(n_neighbors=7),0.500642,0.734557
5,KNeighborsRegressor(n_neighbors=8),0.476306,0.733547
6,KNeighborsRegressor(n_neighbors=9),0.465392,0.741858
7,KNeighborsRegressor(n_neighbors=10),0.455694,0.738803


When `n_neighbors=3`, it appears to be the optimal choice now with a little difference between train and validation

In [45]:
# %%capture
list_of_algorithms = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor(),
                      RandomForestRegressor(n_estimators=200), GradientBoostingRegressor(),
                      SVR(C=25),XGBRegressor()
]
performance_df = train_m.train_mutli_models(list_of_algorithms, True)

  0%|          | 0/7 [00:00<?, ?it/s]

Training Score: 0.4126955309608391
Validation Score: 0.6060632118532532
MSE :  [0.04892548 0.02708865 0.0598857 ]
Training Score: 0.5692304597305592
Validation Score: 0.7265588775329607
MSE :  [0.04321247 0.01904423 0.02911581]
Training Score: 1.0
Validation Score: -5.203729023188916
MSE :  [0.89099895 0.02025853 0.9429966 ]


 57%|█████▋    | 4/7 [00:03<00:02,  1.17it/s]

Training Score: 0.8877096923937322
Validation Score: -1.115640667941774
MSE :  [0.3809243  0.01407616 0.21581894]


 71%|███████▏  | 5/7 [00:04<00:01,  1.23it/s]

Training Score: 0.9776155863765608
Validation Score: -3.594890891707626
MSE :  [0.77215914 0.01940847 0.54681656]


 86%|████████▌ | 6/7 [00:04<00:00,  1.45it/s]

Training Score: 0.9695135255470045
Validation Score: 0.7889239593641895
MSE :  [0.03197609 0.01402698 0.02458598]


100%|██████████| 7/7 [00:04<00:00,  1.44it/s]

Training Score: 0.9997008803559266
Validation Score: 0.7010288431483943
MSE :  [0.04945    0.01539589 0.03104047]





In [46]:
performance_df

Unnamed: 0,Models,Training Score,Validation Score
0,LinearRegression(),0.412696,0.606063
1,KNeighborsRegressor(),0.56923,0.726559
2,DecisionTreeRegressor(),1.0,-5.203729
3,RandomForestRegressor(n_estimators=200),0.88771,-1.115641
4,GradientBoostingRegressor(),0.977616,-3.594891
5,SVR(C=25),0.969514,0.788924
6,"XGBRegressor(base_score=None, booster=None, ca...",0.999701,0.701029


> According to the table and Mean Squared Error (MSE) analysis, Linear Regression, KNN, SVR, and XGBoost (XGB) are potential options. Although all of them exhibit overfitting, they demonstrate relatively small MSE values.

### Hyperparameter Tuning

In [71]:
train_m.train_and_evaluate(SVR(C=25), True)

Training Score: 0.9695135255470045
Validation Score: 0.7889239593641895
MSE :  [0.03197609 0.01402698 0.02458598]


(MultiOutputRegressor(estimator=SVR(C=25)),
 0.9695135255470045,
 0.7889239593641895,
 array([0.03197609, 0.01402698, 0.02458598]))

In [73]:
train_m.train_and_evaluate(SVR(C=8), True)

Training Score: 0.849900578085018
Validation Score: 0.8015165057035644
MSE :  [0.03084156 0.01351071 0.02195973]


(MultiOutputRegressor(estimator=SVR(C=8)),
 0.849900578085018,
 0.8015165057035644,
 array([0.03084156, 0.01351071, 0.02195973]))

In [13]:
train_m.train_and_evaluate(SVR(C=8), True)

Training Score: 0.849900578085018
Validation Score: 0.8015165057035644
MSE :  [0.03084156 0.01351071 0.02195973]


(MultiOutputRegressor(estimator=SVR(C=8)),
 0.849900578085018,
 0.8015165057035644,
 array([0.03084156, 0.01351071, 0.02195973]))

In [19]:
train_m.train_and_evaluate(SVR(C=8), True)

Training Score: 0.849900578085018
Validation Score: 0.8015165057035644
MSE :  [0.03084156 0.01351071 0.02195973]


(MultiOutputRegressor(estimator=SVR(C=8)),
 0.849900578085018,
 0.8015165057035644,
 array([0.03084156, 0.01351071, 0.02195973]))

> Currently, SVR is the best-performing model and does not exhibit overfitting. Thus, we will proceed with SVR, applying a preprocessing pipeline consisting of Min-Max Scaler followed by PCA before fitting the SVR model.

In [20]:
joblib.dump(scaler, 'min_max_scaler.joblib')

joblib.dump(pca, 'pca.joblib')

['pca.joblib']

In [23]:
def process_input(X_data):
  x_data_transformed = scaler.transform(X_data)
  pca_result = x_data_transformed.transform(x_data_transformed)
  return pca_result

In [27]:
x_data = np.concatenate((X_train_pca, X_val_pca))
y_data = np.concatenate((y_train, y_val))

In [29]:
model = MultiOutputRegressor(SVR(C=8))
model.fit(x_data, y_data)

In [30]:
joblib.dump(model, 'model.joblib')

['model.joblib']

## Apply Feature Selection Techniques

In [None]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
feature_importances = rf.feature_importances_
sorted_indices = np.argsort(feature_importances)[::-1]

print("Feature Importance Ranking:")
for i, idx in enumerate(sorted_indices):
    print(f"Rank {i+1}: Feature {idx} - Importance: {feature_importances[idx]}")

    if i == 10:
      break


Feature Importance Ranking:
Rank 1: Feature 4 - Importance: 0.03586287838610454
Rank 2: Feature 275 - Importance: 0.027426127969296107
Rank 3: Feature 440 - Importance: 0.025849356642033094
Rank 4: Feature 132 - Importance: 0.02183239430151599
Rank 5: Feature 673 - Importance: 0.01963887135444359
Rank 6: Feature 601 - Importance: 0.019218640119271966
Rank 7: Feature 598 - Importance: 0.014505589190558103
Rank 8: Feature 602 - Importance: 0.014127289826553518
Rank 9: Feature 363 - Importance: 0.013935542280988556
Rank 10: Feature 281 - Importance: 0.013728643361747894
Rank 11: Feature 524 - Importance: 0.012933746884842511


In [None]:
sum(feature_importances[sorted_indices[:350]])

0.9534587478963393

> Our dataset comprises 936 features. Using a random forest feature selector, we identified that selecting only 350 features retains approximately 95% of the information in the data.

In [None]:
features = []
for i, idx in enumerate(sorted_indices):
  # print(f"Rank {i+1}: Feature {idx} - Importance: {feature_importances[idx]}")
  features.append(idx)
  if i == 349:
    break

In [None]:
X_train_filtered = X_train[:, features]
X_val_filtered = X_val[:, features]

## Apply Models

In [None]:
train_m = TrainingModels(X_train_filtered, X_val_filtered, y_train, y_val)
train_m.train_and_evaluate(LinearRegression())

Training Score: 0.6659327014151243
Validation Score: -0.32558265990134033
MSE :  [0.16779532 0.03415836 0.21985384]


In [None]:
# %%capture
models = [KNeighborsRegressor(n_neighbors=i) for i in range(3,11)]
performance_df = train_m.train_mutli_models(models, return_df=True)

 12%|█▎        | 1/8 [00:00<00:03,  2.11it/s]

Training Score: 0.6418075733871146
Validation Score: 0.7231867598461305
MSE :  [0.04104034 0.02115725 0.03234402]


 25%|██▌       | 2/8 [00:01<00:03,  1.77it/s]

Training Score: 0.6150549776595979
Validation Score: 0.753543989946805
MSE :  [0.03660293 0.02132919 0.02772564]


 38%|███▊      | 3/8 [00:01<00:02,  1.77it/s]

Training Score: 0.5803915185632839
Validation Score: 0.7619003373077972
MSE :  [0.03528379 0.02117442 0.0266648 ]


 50%|█████     | 4/8 [00:02<00:02,  1.80it/s]

Training Score: 0.5239931207275538
Validation Score: 0.77510235627439
MSE :  [0.03425815 0.02093257 0.02357414]


 62%|██████▎   | 5/8 [00:02<00:01,  1.65it/s]

Training Score: 0.51423368586953
Validation Score: 0.7710987490456106
MSE :  [0.03492302 0.02101337 0.02403588]


 75%|███████▌  | 6/8 [00:03<00:01,  1.64it/s]

Training Score: 0.487440847190071
Validation Score: 0.7686416514125741
MSE :  [0.03514226 0.02107253 0.02456764]


 88%|████████▊ | 7/8 [00:04<00:00,  1.61it/s]

Training Score: 0.4737890424482227
Validation Score: 0.7658232319788133
MSE :  [0.03583244 0.02083438 0.02471283]


100%|██████████| 8/8 [00:04<00:00,  1.69it/s]

Training Score: 0.4663237863501963
Validation Score: 0.7648962025022796
MSE :  [0.03606854 0.02133173 0.02452053]





In [None]:
performance_df

Unnamed: 0,Models,Training Score,Validation Score
0,KNeighborsRegressor(n_neighbors=3),0.641808,0.723187
1,KNeighborsRegressor(n_neighbors=4),0.615055,0.753544
2,KNeighborsRegressor(),0.580392,0.7619
3,KNeighborsRegressor(n_neighbors=6),0.523993,0.775102
4,KNeighborsRegressor(n_neighbors=7),0.514234,0.771099
5,KNeighborsRegressor(n_neighbors=8),0.487441,0.768642
6,KNeighborsRegressor(n_neighbors=9),0.473789,0.765823
7,KNeighborsRegressor(n_neighbors=10),0.466324,0.764896


When `n_neighbors=3`, it appears to be the optimal choice now with a little difference between train and validation

In [None]:
# %%capture
list_of_algorithms = [LinearRegression(), KNeighborsRegressor(), DecisionTreeRegressor(),
                      RandomForestRegressor(n_estimators=200), GradientBoostingRegressor(),
                      SVR(C=25),XGBRegressor()
]
performance_df = train_m.train_mutli_models(list_of_algorithms, True)

 43%|████▎     | 3/7 [00:00<00:00, 25.37it/s]

Training Score: 0.4126955309608391
Validation Score: 0.6060632118532532
MSE :  [0.04892548 0.02708865 0.0598857 ]
Training Score: 0.5692304597305592
Validation Score: 0.7265588775329607
MSE :  [0.04321247 0.01904423 0.02911581]
Training Score: 1.0
Validation Score: -5.2251055024881365
MSE :  [0.88761086 0.01862827 0.9555444 ]
Training Score: 0.9021008362650335
Validation Score: -1.237636004887661
MSE :  [0.40072187 0.01387209 0.23156132]
Training Score: 0.9776155863765608
Validation Score: -3.5837831367067934
MSE :  [0.77211338 0.02033321 0.54267809]


 86%|████████▌ | 6/7 [00:08<00:01,  1.61s/it]

Training Score: 0.9695135255470045
Validation Score: 0.7889239593641895
MSE :  [0.03197609 0.01402698 0.02458598]


100%|██████████| 7/7 [00:08<00:00,  1.26s/it]

Training Score: 0.9997008803559266
Validation Score: 0.7010288431483943
MSE :  [0.04945    0.01539589 0.03104047]





In [None]:
performance_df

Unnamed: 0,Models,Training Score,Validation Score
0,LinearRegression(),0.412696,0.606063
1,KNeighborsRegressor(),0.56923,0.726559
2,DecisionTreeRegressor(),1.0,-5.480225
3,RandomForestRegressor(n_estimators=200),0.889104,-1.076193
4,GradientBoostingRegressor(),0.977616,-3.599618
5,SVR(C=25),0.969514,0.788924
6,"XGBRegressor(base_score=None, booster=None, ca...",0.999701,0.701029
