### Imports


In [87]:
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
import os
import xgboost as xgb
from importlib import reload
from sklearn.metrics import accuracy_score

from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, mean_squared_error, mean_absolute_error, r2_score
import numpy as np
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

import ipywidgets as widgets
from IPython.display import clear_output, display

### Load Data

In [86]:
# Set the working directory
os.chdir(r'C:\Users\kamil\Documents\PredictModel\data-science-salaries-project')

processed_train= r'data-science-salaries-project\data\processed\processed_data_train.csv'
processed_test= r'data-science-salaries-project\data\processed\processed_data_test.csv'

In [3]:
train_data = pd.read_csv(processed_train)
test_data = pd.read_csv(processed_test)

# MAP QUALITY TO 0-6
mapping = {3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5, 9: 6}
train_data['quality'] = train_data['quality'].map(mapping)
test_data['quality'] = test_data['quality'].map(mapping)

X_train = train_data.drop('quality', axis=1)
y_train = train_data['quality']

X_test = test_data.drop('quality', axis=1)
y_test = test_data['quality']

### Selection of hyperparameters

In [83]:
import functions.LR_hyperparams_search
import functions.xgboost_hyperparams_search
import functions.MLPC_hyperparams_search

reload(functions.LR_hyperparams_search)  # To ensure the latest version is loaded
reload(functions.xgboost_hyperparams_search) 
reload(functions.MLPC_hyperparams_search)

from functions.xgboost_hyperparams_search import xgboost_hyperparams_search
from functions.LR_hyperparams_search import LR_hyperparams_search
from functions.MLPC_hyperparams_search import MLP_hyperparams_search


In [None]:
# widgets 

model_widget = widgets.Dropdown(
    options=['Logistic Regression', 'XGBoost', 'MLP Classifier'],
    value='Logistic Regression',
    description='Model:'
)

search_type_widget = widgets.Dropdown(
    options=['grid', 'random', 'optuna'],
    value='random',
    description='Search type:'
)

n_trials_widget = widgets.IntSlider(
    value=10,
    min=1,
    max=100,
    step=1,
    description='n_trials:',
    continuous_update=False
)

# Button widget
search_button = widgets.Button(description="Start Search")

# Definition of the function that will be called when the button is clicked
def hyperparameter_search(model_type, type_of_search, n_trials):
    if model_type == 'Logistic Regression':
        LR_best_model = LR_hyperparams_search(X_train, y_train, X_test, y_test, n_trials=n_trials, type_of_search=type_of_search)
        print(f"Best Logistic Regression Model: {LR_best_model}")
    elif model_type == 'XGBoost':
        XGBoost_best_model = xgboost_hyperparams_search(X_train, y_train, X_test, y_test, n_trials=n_trials, type_of_search=type_of_search)
        print(f"Best XGBoost Model: {XGBoost_best_model}")

    elif model_type == 'MLP Classifier':
        MLP_best_model = MLP_hyperparams_search(X_train, y_train, X_test, y_test, n_trials=n_trials, type_of_search=type_of_search)
        print(f"Best MLP Classifier Model: {MLP_best_model}")
    else:
        print("Unknown model type")


def on_search_button_click(b):
    clear_output(wait=True)  # Clear the output of the current output cell receiving output
    display(model_widget, search_type_widget, n_trials_widget, search_button) # show again the widgets
    model_type = model_widget.value
    search_type = search_type_widget.value
    n_trials = n_trials_widget.value
    print(f"Searching for the best hyperparameters for the {model_type} model using {search_type} search.\n Wait a moment...")
    
    # Start the hyperparameter search
    hyperparameter_search(model_type, search_type, n_trials)

# Assign the event handler to the button
search_button.on_click(on_search_button_click)

# Display the widgets
display(model_widget, search_type_widget, n_trials_widget, search_button)

Dropdown(description='Model:', index=2, options=('Logistic Regression', 'XGBoost', 'MLP Classifier'), value='M…

Dropdown(description='Search type:', index=1, options=('grid', 'random', 'optuna'), value='random')

IntSlider(value=6, continuous_update=False, description='n_trials:', min=1)

Button(description='Start Search', style=ButtonStyle())

Searching for the best hyperparameters for the MLP Classifier model using random search.
 Wait a moment...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Best Parameters: {'solver': 'lbfgs', 'momentum': np.float64(0.99), 'max_iter': 500, 'learning_rate_init': np.float64(0.00021544346900318845), 'hidden_layer_sizes': (100, 50), 'batch_size': 128, 'alpha': np.float64(0.03593813663804626), 'activation': 'relu'}
Accuracy: 0.5638
Best MLP Classifier Model: MLPClassifier(alpha=np.float64(0.03593813663804626), batch_size=128,
              hidden_layer_sizes=(100, 50),
              learning_rate_init=np.float64(0.00021544346900318845),
              max_iter=500, momentum=np.float64(0.99), random_state=42,
              solver='lbfgs')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


### Train model

In [77]:
# Definition of models to train
models = {
    "XGBoost": xgb.XGBClassifier(),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    "Logistic_Regression": LogisticRegression(max_iter=1000)
}

# Training and evaluation of models
for model_name, model in models.items():
    
    # Training the model
    model.fit(X_train, y_train)

    # Predicting on the test set
    y_pred = model.predict(X_test)

    # Calculating metrics
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    f1 = f1_score(y_test, y_pred, average='micro')
    accuracy = accuracy_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"R-squared (R2): {r2:.4f}\n")

    # # Cross-validation
    # cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    # print(f"Cross-validation Scores (Accuracy): {[score for score in cv_scores]}\n")
    # print(f"Mean Accuracy from Cross-validation: {cv_scores.mean():.4f}\n")

Model: XGBoost
Precision: 0.6662
Recall: 0.6662
F1 Score: 0.6662
Accuracy: 0.6662
Mean Squared Error (MSE): 0.4700
Mean Absolute Error (MAE): 0.3762
R-squared (R2): 0.3886

Model: MLP
Precision: 0.5677
Recall: 0.5677
F1 Score: 0.5677
Accuracy: 0.5677
Mean Squared Error (MSE): 0.5977
Mean Absolute Error (MAE): 0.4838
R-squared (R2): 0.2225

Model: Logistic_Regression
Precision: 0.5192
Recall: 0.5192
F1 Score: 0.5192
Accuracy: 0.5192
Mean Squared Error (MSE): 0.6908
Mean Absolute Error (MAE): 0.5462
R-squared (R2): 0.1014





In [10]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

# Train the XGBRegressor model
xgb_regressor = xgb.XGBRegressor()
xgb_regressor.fit(X_train, y_train)

# Predict on the test set
y_pred = xgb_regressor.predict(X_test)

# Calculate metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R2): {r2:.4f}")

Mean Squared Error (MSE): 0.4314
Mean Absolute Error (MAE): 0.4705
R-squared (R2): 0.4388
