# 1. ML Hyperparameter tuning

In [1]:
# Hyperparameter tuning - Logistic regression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression

def tune_logistic_regression(X_train, y_train, X_val, y_val):
    """
    Tune LogisticRegression hyperparameters and evaluate the best model.

    A RandomizedSearchCV (cv=5, scoring='accuracy') is run over C and the
    solver, the best estimator is evaluated on the validation set, and the
    metrics, ROC curve and confusion matrix are printed/plotted.

    The metric calls assume a binary classification problem:
    precision/recall use scikit-learn's default ``average='binary'`` and the
    ROC/AUC use the predicted probability of the second class in
    ``classes_``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.
    Returns:
        A tuple ``(best_logreg, results)`` where ``best_logreg`` is the best
        fitted LogisticRegression and ``results`` is a dict with the keys
        'accuracy', 'precision', 'recall', 'auc', 'fpr', 'tpr' and
        'thresholds'.
    """
    # Search space for the hyperparameters
    param_grid = {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'solver': ['liblinear', 'saga'],  # both solvers support the 'l2' penalty
    }

    # Raise the iteration cap above the library default
    logreg = LogisticRegression(max_iter=1000)

    # Randomized search for the best hyperparameters
    grid_search = RandomizedSearchCV(logreg, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    best_logreg = grid_search.best_estimator_

    # Print the best hyperparameters and score
    print("Best hyperparameters (Logistic regression):", grid_search.best_params_)
    print("Best score (Logistic regression):", grid_search.best_score_)

    # Evaluate on the validation set. Hard labels feed the threshold-based
    # metrics; probabilities of the positive class feed ROC/AUC, because a
    # ROC curve built from 0/1 predictions collapses to a single threshold.
    predictions = best_logreg.predict(X_val)
    positive_scores = best_logreg.predict_proba(X_val)[:, 1]

    # scikit-learn metrics take (y_true, y_pred); swapping the arguments
    # silently exchanges precision and recall.
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions)
    recall = recall_score(y_val, predictions)
    auc = roc_auc_score(y_val, positive_scores)

    print('Best Logistic regression Accuracy:', accuracy)
    print('Best Logistic regression Precision:', precision)
    print('Best Logistic regression Recall:', recall)
    print('Best Logistic regression AUC:', auc)
    print()
    print('Classification Report: ', 'LogisticRegression')
    print(classification_report(y_val, predictions))

    false_position_rate, true_position_rate, thresholds = roc_curve(y_val, positive_scores)
    print('ROC_AUC_SCORE is ', auc)

    plt.plot(false_position_rate, true_position_rate)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.show()

    sns.heatmap(confusion_matrix(y_val, predictions), fmt='', annot=True)
    plt.show()

    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'auc': auc,
        'fpr': false_position_rate,
        'tpr': true_position_rate,
        'thresholds': thresholds,
    }

    return best_logreg, results

In [3]:
# Hyperparameter tuning - RandomForest

from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def tune_random_forest(X_train, y_train, X_val, y_val):
    """
    Tune RandomForestClassifier hyperparameters and evaluate the best model.

    A RandomizedSearchCV (cv=5, scoring='accuracy') samples the forest's
    size/depth/split parameters, the best estimator is evaluated on the
    validation set, and the metrics, ROC curve and confusion matrix are
    printed/plotted.

    The metric calls assume a binary classification problem:
    precision/recall use scikit-learn's default ``average='binary'`` and the
    ROC/AUC use the predicted probability of the second class in
    ``classes_``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.
    Returns:
        A tuple ``(best_rf_model, results)`` where ``best_rf_model`` is the
        best fitted RandomForestClassifier and ``results`` is a dict with the
        keys 'accuracy', 'precision', 'recall', 'auc', 'fpr', 'tpr' and
        'thresholds'.
    """

    # Sampling distributions for RandomForestClassifier.
    # 'auto' is intentionally not listed: for classifiers it was an alias of
    # 'sqrt' and it was removed in scikit-learn 1.3, where sampling it makes
    # the candidate fail to fit.
    param_grid_rf = {
        'n_estimators': randint(50, 300),
        'max_depth': randint(5, 50),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf': randint(1, 10),
        'max_features': ['sqrt', 'log2'],
    }

    # Create a RandomForestClassifier model
    rf_model = RandomForestClassifier()

    # Perform RandomizedSearchCV to find the best hyperparameters
    grid_search_rf = RandomizedSearchCV(rf_model, param_grid_rf, cv=5, scoring='accuracy')
    grid_search_rf.fit(X_train, y_train)

    best_rf_model = grid_search_rf.best_estimator_

    # Print the best hyperparameters and score
    print("Best hyperparameters (RandomForestClassifier):", grid_search_rf.best_params_)
    print("Best score (RandomForestClassifier):", grid_search_rf.best_score_)

    # Evaluate on the validation set. Hard labels feed the threshold-based
    # metrics; probabilities of the positive class feed ROC/AUC, because a
    # ROC curve built from 0/1 predictions collapses to a single threshold.
    predictions = best_rf_model.predict(X_val)
    positive_scores = best_rf_model.predict_proba(X_val)[:, 1]

    # scikit-learn metrics take (y_true, y_pred); swapping the arguments
    # silently exchanges precision and recall.
    auc = roc_auc_score(y_val, positive_scores)
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions)
    recall = recall_score(y_val, predictions)

    print('Best Random Forest Accuracy:', accuracy)
    print('Best Random Forest Precision:', precision)
    print('Best Random Forest Recall:', recall)
    print('Best Random Forest AUC:', auc)

    # Print classification report
    print('Classification Report: ', 'RandomForestClassifier')
    print(classification_report(y_val, predictions))

    # Print ROC AUC score
    print('ROC_AUC_SCORE is ', auc)

    # Plot ROC curve
    false_position_rate, true_position_rate, thresholds = roc_curve(y_val, positive_scores)
    plt.plot(false_position_rate, true_position_rate)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.show()

    # Plot confusion matrix
    sns.heatmap(confusion_matrix(y_val, predictions), fmt='', annot=True)
    plt.show()

    # Collect the already-computed metrics for the caller
    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'auc': auc,
        'fpr': false_position_rate,
        'tpr': true_position_rate,
        'thresholds': thresholds,
    }

    return best_rf_model, results

In [4]:
from sklearn.naive_bayes import MultinomialNB
from scipy.stats import uniform
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def tune_multinomial_nb(X_train, y_train, X_val, y_val):
    """
    Tune Multinomial Naive Bayes hyperparameters and evaluate the best model.

    A RandomizedSearchCV (cv=5, scoring='accuracy') samples the smoothing
    parameter ``alpha``, the best estimator is evaluated on the validation
    set, and the metrics, ROC curve and confusion matrix are printed/plotted.

    The metric calls assume a binary classification problem:
    precision/recall use scikit-learn's default ``average='binary'`` and the
    ROC/AUC use the predicted probability of the second class in
    ``classes_``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.
    Returns:
        A tuple ``(best_nb_model, results)`` where ``best_nb_model`` is the
        best fitted MultinomialNB and ``results`` is a dict with the keys
        'accuracy', 'precision', 'recall', 'auc', 'fpr', 'tpr' and
        'thresholds'.
    """

    # Sampling distribution for Multinomial Naive Bayes.
    # scipy's uniform(loc, scale) draws from [loc, loc + scale],
    # i.e. alpha is sampled from [0.01, 10.01].
    param_grid_nb = {
        'alpha': uniform(0.01, 10),  # Smoothing parameter
    }

    # Create a Multinomial Naive Bayes model
    nb_model = MultinomialNB()

    # Perform RandomizedSearchCV to find the best hyperparameters
    grid_search_nb = RandomizedSearchCV(nb_model, param_grid_nb, cv=5, scoring='accuracy')
    grid_search_nb.fit(X_train, y_train)

    # Print the best hyperparameters and score
    print("Best hyperparameters (MultinomialNB):", grid_search_nb.best_params_)
    print("Best score (MultinomialNB):", grid_search_nb.best_score_)

    # Evaluate on the validation set. Hard labels feed the threshold-based
    # metrics; probabilities of the positive class feed ROC/AUC, because a
    # ROC curve built from 0/1 predictions collapses to a single threshold.
    best_nb_model = grid_search_nb.best_estimator_
    predictions = best_nb_model.predict(X_val)
    positive_scores = best_nb_model.predict_proba(X_val)[:, 1]

    # scikit-learn metrics take (y_true, y_pred); swapping the arguments
    # silently exchanges precision and recall.
    auc = roc_auc_score(y_val, positive_scores)
    accuracy = accuracy_score(y_val, predictions)
    precision = precision_score(y_val, predictions)
    recall = recall_score(y_val, predictions)
    print('Best Navie Bayes Accuracy:', accuracy)
    print('Best Naive Bayes Precision:', precision)
    print('Best Navie Bayes Recall:', recall)
    print('Best Naive Bayes AUC:', auc)

    # Print classification report
    print('Classification Report: ', 'MultinomialNB')
    print(classification_report(y_val, predictions))

    # Print ROC AUC score
    print('ROC_AUC_SCORE is ', auc)

    # Plot ROC curve
    false_position_rate, true_position_rate, thresholds = roc_curve(y_val, positive_scores)
    plt.plot(false_position_rate, true_position_rate)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title('ROC curve')
    plt.show()

    # Plot confusion matrix
    sns.heatmap(confusion_matrix(y_val, predictions), fmt='', annot=True)
    plt.show()

    # Collect the already-computed metrics for the caller
    results = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'auc': auc,
        'fpr': false_position_rate,
        'tpr': true_position_rate,
        'thresholds': thresholds,
    }
    return best_nb_model, results