In [None]:
from models.preferences.preference_utils import (
    get_child_data,
    initialize_child_preference_data,
    print_preference_difference_and_accuracy,
    calculate_percent_of_known_ingredients_to_unknown,
    plot_individual_child_known_percent,
    plot_preference_and_sentiment_accuracies,
    plot_utilities_and_mape,
    plot_utilities_from_json,
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Configure Matplotlib to use LaTeX for rendering
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",  # Use serif font in conjunction with LaTeX
    "text.latex.preamble": r"\usepackage{times}",
})

def plot_ingredient_data_histograms(ingredient_df, rotation=15, fontsize=12):
    # Convert child_data to a DataFrame
    df = pd.DataFrame(ingredient_df)

    # Plotting histograms for each feature
    features = ['Colour', 'Taste', 'Texture', 'Healthy']

    fig, axes = plt.subplots(2, 2, figsize=(12, 4))
    axes = axes.flatten()

    for idx, feature in enumerate(features):
        if df[feature].dtype == 'object':
            # For categorical data
            df[feature].value_counts().plot(kind='bar', ax=axes[idx], color='skyblue')
            axes[idx].set_ylabel('Count', fontsize=fontsize)
            # Format the x-tick labels
            formatted_labels = [label.replace('_', ' ').title() if isinstance(label, str) else label for label in df[feature].value_counts().index]
            axes[idx].set_xticklabels(formatted_labels, rotation=rotation, fontsize=fontsize)
        else:
            # For numerical data
            df[feature].plot(kind='hist', ax=axes[idx], bins=range(8, 13), color='skyblue')
            axes[idx].set_ylabel('Frequency', fontsize=fontsize)
            axes[idx].set_xticks(range(8, 13))
            axes[idx].tick_params(axis='x', labelsize=fontsize)

        # Capitalize and remove underscores in the feature name for xlabel
        formatted_feature = feature.replace('_', ' ').title()
        axes[idx].set_xlabel(formatted_feature, fontsize=fontsize)

    plt.tight_layout()
    plt.show()

    # Count the number of 1s in specified groups
    groups = ['Group A veg', 'Group A fruit', 'Group BC', 'Group D', 'Group E', 'Bread', 'Confectionary']
    
    group_rename = {
        'Group A veg': 'Vegetables',
        'Group A fruit': 'Fruits',
        'Group BC': 'Protein',
        'Group D': 'Carbs',
        'Group E': 'Dairy',
        'Bread': 'Bread',
        'Confectionary': 'Confectionary'
    }
    
    counts = {group: df[group].sum() for group in groups}

    # Rename the groups for the plot
    renamed_counts = {group_rename[group]: count for group, count in counts.items()}

    # Plot the counts on a bar chart
    fig, ax = plt.subplots(figsize=(6, 3))
    ax.bar(renamed_counts.keys(), renamed_counts.values(), color='skyblue')
    ax.set_xlabel('Groups', fontsize=fontsize)
    ax.set_ylabel('Number of Ingredients in Group', fontsize=fontsize)
    plt.xticks(rotation=15, fontsize=fontsize)
    plt.tight_layout()
    plt.show()
# Configure Matplotlib to use LaTeX for rendering
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",  # Use serif font in conjunction with LaTeX
    "text.latex.preamble": r"\usepackage{times}",
})

    
from utils.process_data import get_data
ingredient_df = get_data("data.csv")


plot_ingredient_data_histograms(ingredient_df)


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Configure Matplotlib to use LaTeX for rendering
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",  # Use serif font in conjunction with LaTeX
    "text.latex.preamble": r"\usepackage{times}",
})

def plot_child_data_histograms(child_data, rotation=0, fontsize=12):
    # Convert child_data to a DataFrame
    df = pd.DataFrame(child_data).T

    # Plotting histograms for each feature
    features = ['age', 'gender', 'health_consideration', 'favorite_cuisine']

    fig, axes = plt.subplots(2, 2, figsize=(12, 4))
    axes = axes.flatten()

    for idx, feature in enumerate(features):
        if df[feature].dtype == 'object':
            # For categorical data
            df[feature].value_counts().plot(kind='bar', ax=axes[idx], color='skyblue')
            axes[idx].set_ylabel('Count', fontsize=fontsize)
            # Format the x-tick labels
            formatted_labels = [label.replace('_', ' ').title() if isinstance(label, str) else label for label in df[feature].value_counts().index]
            axes[idx].set_xticklabels(formatted_labels, rotation=rotation, fontsize=fontsize)
        else:
            # For numerical data
            df[feature].plot(kind='hist', ax=axes[idx], bins=range(8, 13), color='skyblue')
            axes[idx].set_ylabel('Frequency', fontsize=fontsize)
            axes[idx].set_xticks(range(8, 13))
            axes[idx].tick_params(axis='x', labelsize=fontsize)

        # Capitalize and remove underscores in the feature name for xlabel
        formatted_feature = feature.replace('_', ' ').title()
        # axes[idx].set_title(f'Histogram of {formatted_feature}', fontsize=fontsize)
        axes[idx].set_xlabel(formatted_feature, fontsize=fontsize)

    plt.tight_layout()
    plt.show()
# Configure Matplotlib to use LaTeX for rendering
plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",  # Use serif font in conjunction with LaTeX
    "text.latex.preamble": r"\usepackage{times}",
})

    
child_data = get_child_data()
plot_child_data_histograms(child_data)


In [None]:
ingredient_df = get_data("data.csv")
child_data = get_child_data()
data = initialize_child_preference_data(child_data=child_data, ingredient_df=ingredient_df, plot_graphs=True, child_key_plot='c2hild23')

In [None]:
from models.preferences.sentiment_analysis import SentimentAnalyzer
from models.preferences.menu_generators import RandomMenuGenerator
from models.preferences.prediction import PreferenceModel
from models.preferences.voting import IngredientNegotiator

child_data = get_child_data()
ingredient_df = get_data("data.csv")
true_child_preference_data = initialize_child_preference_data(child_data, ingredient_df, seed=None, plot_graphs=False, split = 0.5)

# Set to zero for complete randomness
probability_best = 0
# Random is all equal and score is based on the score of the ingredient in terms of the negotiated list
weight_type = "random"
# weight_type = "score"
menu_plan_length = 10
seed = None

# Complex weight function arguments
complex_weight_func_args = {
    'use_normalize_total_voting_weight': False,
    'use_normalize_vote_categories': True,
    'use_compensatory': True,
    'use_feedback': True,
    'use_fairness': True,
    'target_gini': 0.15,
}
weight_function = 'simple'



In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt

# Initialize the subplot grid
fig, axes = plt.subplots(3, 2, figsize=(15, 12))
axes = axes.flatten()  # Flatten to easily iterate over the axes

# Iterate over each model in the dictionary, except the 'perfect' model
model_name_dict = {
    'roberta': "cardiffnlp/twitter-roberta-base-sentiment",
    'bertweet': "finiteautomata/bertweet-base-sentiment-analysis",
    'distilroberta': "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    '5_star': "nlptown/bert-base-multilingual-uncased-sentiment",
    'perfect': "perfect",  # Skip this one
    'TextBlob': "TextBlob",
    'Vader': "Vader"
}

iterations = 50  # Number of iterations for each model
index = 0  # Index for subplots

for model_name in model_name_dict.keys():
    if model_name == "perfect":
        continue  # Skip the 'perfect' model

    all_true_labels = []
    all_pred_labels = []

    for _ in range(iterations):
        # Initialize and run the preference model
        predictor = PreferenceModel(
            ingredient_df, child_data, true_child_preference_data, visualize_data=False, apply_SMOTE=True, file_path=None, seed=seed
        )
        updated_known_and_predicted_preferences = predictor.run_pipeline()

        previous_feedback = {}
        previous_utility = {}

        # Initial negotiation of ingredients
        negotiator = IngredientNegotiator(
            seed, ingredient_df, updated_known_and_predicted_preferences, previous_feedback, previous_utility, complex_weight_func_args
        )

        negotiated_ingredients, unavailable_ingredients = negotiator.negotiate_ingredients(weight_function=weight_function)

        # Calculate week and day
        week = 1
        day = 1

        # Save negotiation results
        negotiator.close("log_file.json", week=week, day=day)

        menu_generator = RandomMenuGenerator(menu_plan_length=10, weight_type='random', probability_best=probability_best, seed=None)

        # Generate menu based on negotiated list
        menu_plan = menu_generator.generate_menu(negotiated_ingredients, unavailable_ingredients, save_paths={'data': '', 'graphs': ''}, week=week, day=day)

        # Sentiment analysis initiation, initially with true preference data and will adapt it to updated preferences from feedback
        sentiment_analyzer = SentimentAnalyzer(
            true_child_preference_data, menu_plan, model_name=model_name, seed=None
        )

        # Get updated preferences from feedback, the sentiment accuracy and feedback given
        updated_known_unknown_preferences_with_feedback, sentiment_accuracy, feedback_given, true_labels, pred_labels = sentiment_analyzer.get_sentiment_and_update_data(plot_confusion_matrix=False)

        # Remove None values from true_labels and corresponding pred_labels
        filtered_true_labels = []
        filtered_pred_labels = []

        for true_label, pred_label in zip(true_labels, pred_labels):
            if true_label is not None:  # Filter out None values
                filtered_true_labels.append(true_label)
                filtered_pred_labels.append(pred_label)

        # Collect filtered labels from this iteration
        all_true_labels.extend(filtered_true_labels)
        all_pred_labels.extend(filtered_pred_labels)

        # Assign the feedback given to the previous feedback for complex weight calculation
        previous_feedback = feedback_given

    # Calculate overall accuracy
    accuracy = accuracy_score(all_true_labels, all_pred_labels) * 100

    # Generate and plot the confusion matrix
    cm = confusion_matrix(all_true_labels, all_pred_labels, labels=['likes', 'neutral', 'dislikes'])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['likes', 'neutral', 'dislikes'])

    # Plot on the current subplot
    disp.plot(ax=axes[index], cmap=plt.cm.Blues, xticks_rotation=0)
    axes[index].set_title(f'{model_name}, Accuracy: {accuracy:.1f}\\%', fontsize=14)
    axes[index].set_xlabel('Predicted', fontsize=14)
    axes[index].set_ylabel('True', fontsize=14)
    axes[index].tick_params(axis='both', labelsize=12)

    index += 1  # Move to the next subplot

# Adjust layout spacing between subplots
plt.subplots_adjust(hspace=0, wspace=-0.75)

# Adjust layout
plt.tight_layout()
plt.show()


# Prediction

In [1]:
from utils.process_data import get_data
from models.preferences.preference_utils import get_child_data, initialize_child_preference_data, print_preference_difference_and_accuracy
from models.preferences.prediction import PreferenceModel
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer


name ='Random Forest'


child_feature_data = get_child_data()
ingredient_df = get_data("data.csv")
initial_preference = initialize_child_preference_data(child_feature_data, ingredient_df, split = 0.5, seed=None, plot_graphs=False)

predictor = PreferenceModel(
    ingredient_df, child_feature_data, initial_preference, model_name=name, visualize_data=True, apply_SMOTE=True, file_path='preferences_visualization.png', seed=None
)
updated_preferences, true_labels, predicted_labels = predictor.run_pipeline()

accuracy = accuracy_score(true_labels, predicted_labels)

accuracy_total, std = print_preference_difference_and_accuracy(initial_preference, updated_preferences, summary_only=True)

print(accuracy)

print(accuracy_total)

Successfully read 136 lines from the file. Loaded 135 ingredients.

Overall Accuracy of Preferences: 0.961728
Standard Deviation of Accuracies: 0.022798
0.9246108949416343
0.9617283950617282


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from utils.process_data import get_data
from models.preferences.preference_utils import get_child_data, initialize_child_preference_data, print_preference_difference_and_accuracy
from models.preferences.prediction import PreferenceModel
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer

import os
from utils.process_data import get_data
from models.preferences.preference_utils import print_preference_difference_and_accuracy
from models.preferences.preference_utils import get_child_data, initialize_child_preference_data

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

models = {
    "Logistic Regression": LogisticRegression(solver='liblinear', C=10, max_iter=10000, class_weight='balanced'),
    "Support Vector Machine": SVC(C=1.0, kernel='rbf', probability=True, class_weight='balanced'),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, eval_metric='logloss', scale_pos_weight=1),  # XGBoost uses scale_pos_weight for class balancing
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=None, criterion='gini', class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3),  # No direct class_weight support
    "AdaBoost": AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R'),  # No direct class_weight support
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto'),  # No direct class_weight support
    "Decision Tree": DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, class_weight='balanced'),
    "Gaussian Naive Bayes": GaussianNB(),  # No direct class_weight support
    "Stochastic Gradient Descent": SGDClassifier(loss='hinge', alpha=0.0001, max_iter=2000, tol=1e-3, class_weight='balanced'),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=2000)  # No direct class_weight support
}

# Define scorers with `average='macro'` set only for the appropriate metrics
scorers = {
'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
'accuracy': make_scorer(accuracy_score)  # No `average` parameter here
}

child_feature_data = get_child_data()
ingredient_df = get_data("data.csv")

# Initial prediction of preferences
file_path = os.path.join('', "preferences_visualization.png")



# Assuming all imports and other necessary function definitions have been done as in the provided code.

def evaluate_models():
    # Evaluate models using cross-validation
    results = []
    
    for iter in range(10):
        initial_preference = initialize_child_preference_data(child_feature_data, ingredient_df, split = 0.7, seed=None, plot_graphs=False)
        for apply_SMOTE in [True, False]:
            for name, model in models.items():

                predictor = PreferenceModel(
                    ingredient_df, child_feature_data, initial_preference, model_name=name, visualize_data=False, apply_SMOTE=apply_SMOTE, file_path=file_path, seed=None
                )
                updated_preferences, true_labels, predicted_labels = predictor.run_pipeline()
                accuracy_total, std = print_preference_difference_and_accuracy(initial_preference, updated_preferences, summary_only=True)
                del updated_preferences
                # Calculate accuracy
                accuracy = accuracy_score(true_labels, predicted_labels)

                # Print the model, accuracy, and iteration
                print(f"Iteration: {iter + 1}, Model: {name}, SMOTE: {apply_SMOTE}, Accuracy: {accuracy:.4f}, Total Accuracy: {accuracy_total:.4f}")

                # Loop over each scorer
                for scorer_name, scorer in scorers.items():
                    # Calculate the score using the scorer
                    if scorer_name in ['precision_macro', 'recall_macro', 'f1_macro']:
                        score = scorer._score_func(true_labels, predicted_labels, average='macro')
                    else:
                        score = scorer._score_func(true_labels, predicted_labels)
                    
                    results.append({
                        "Iteration": iter,
                        "SMOTE": apply_SMOTE,
                        "Model": name,
                        "Metric": scorer_name,
                        "Score": score
                    })

    # Convert results to DataFrame for easy comparison
    results_df = pd.DataFrame(results)
    
    return results_df


def get_ranked_models(results_df):
    # Rank models for each metric
    ranked_results = results_df.copy()
    ranked_results['Rank'] = ranked_results.groupby('Metric')['Score'].rank(ascending=False, method='min')

    # Convert results to DataFrame for easy comparison
    results_df_sorted = ranked_results.sort_values(by=['Metric', 'Rank'])

    print(results_df_sorted)
    return results_df_sorted


# Example usage:

results_df = evaluate_models()
sorted_results_df = get_ranked_models(results_df)

    

Successfully read 136 lines from the file. Loaded 135 ingredients.

Overall Accuracy of Preferences: 0.886667
Standard Deviation of Accuracies: 0.016469
Iteration: 1, Model: Logistic Regression, SMOTE: True, Accuracy: 0.6372, Total Accuracy: 0.8867

Overall Accuracy of Preferences: 0.976049
Standard Deviation of Accuracies: 0.015142
Iteration: 1, Model: Support Vector Machine, SMOTE: True, Accuracy: 0.9233, Total Accuracy: 0.9760


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from models.preferences.prediction import PreferenceModel
import os
from utils.process_data import get_data
from models.preferences.preference_utils import print_preference_difference_and_accuracy
from models.preferences.preference_utils import get_child_data, initialize_child_preference_data

models = {
    "Logistic Regression": LogisticRegression(solver='liblinear', C=1.0, max_iter=10000, class_weight='balanced'),
    "Support Vector Machine": SVC(C=1.0, kernel='rbf', probability=True, class_weight='balanced'),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, eval_metric='logloss', scale_pos_weight=1),  # XGBoost uses scale_pos_weight for class balancing
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=None, criterion='gini', class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3),  # No direct class_weight support
    "AdaBoost": AdaBoostClassifier(n_estimators=50, learning_rate=1.0, algorithm='SAMME.R'),  # No direct class_weight support
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto'),  # No direct class_weight support
    "Decision Tree": DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, class_weight='balanced'),
    "Gaussian Naive Bayes": GaussianNB(),  # No direct class_weight support
    "Stochastic Gradient Descent": SGDClassifier(loss='hinge', alpha=0.0001, max_iter=2000, tol=1e-3, class_weight='balanced'),
    "MLP Classifier": MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', max_iter=2000)  # No direct class_weight support
}


# Define scorers with `average='macro'` set only for the appropriate metrics
scorers = {
    'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
    'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
    'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
    'accuracy': make_scorer(accuracy_score)  # No `average` parameter here
}

child_feature_data = get_child_data()
ingredient_df = get_data("data.csv")

# Initial prediction of preferences
file_path = os.path.join('', "preferences_visualization.png")

import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, make_scorer

# Assuming all imports and other necessary function definitions have been done as in the provided code.

def evaluate_models():
    # Evaluate models using cross-validation
    results = []
    
    for iter in range(5):
        initial_preference = initialize_child_preference_data(child_feature_data, ingredient_df, split = 0.7, seed=None, plot_graphs=False)
        for apply_SMOTE in [True, False]:
            for name, model in models.items():
                predictor = PreferenceModel(
                    ingredient_df, child_feature_data, initial_preference, model_name=name, visualize_data=False, apply_SMOTE=apply_SMOTE, file_path=file_path, seed=None
                )
                updated_preferences, true_labels, predicted_labels = predictor.run_pipeline()
                print(print_preference_difference_and_accuracy(initial_preference, updated_preferences, summary_only=True))
                accuracy_total, std = print_preference_difference_and_accuracy(initial_preference, updated_preferences, summary_only=True)
                # Calculate accuracy
                accuracy = accuracy_score(true_labels, predicted_labels)

                # Print the model, accuracy, and iteration
                print(f"Iteration: {iter + 1}, Model: {name}, SMOTE: {apply_SMOTE}, Accuracy: {accuracy:.4f}, Total Accuracy: {accuracy_total:.4f}")

                # Loop over each scorer
                for scorer_name, scorer in scorers.items():
                    # Calculate the score using the scorer
                    if scorer_name in ['precision_macro', 'recall_macro', 'f1_macro']:
                        score = scorer._score_func(true_labels, predicted_labels, average='macro')
                    else:
                        score = scorer._score_func(true_labels, predicted_labels)
                    
                    results.append({
                        "Iteration": iter,
                        "SMOTE": apply_SMOTE,
                        "Model": name,
                        "Metric": scorer_name,
                        "Score": score
                    })

    # Convert results to DataFrame for easy comparison
    results_df = pd.DataFrame(results)
    
    return results_df


def get_ranked_models(results_df):
    # Rank models for each metric
    ranked_results = results_df.copy()
    ranked_results['Rank'] = ranked_results.groupby('Metric')['Score'].rank(ascending=False, method='min')

    # Convert results to DataFrame for easy comparison
    results_df_sorted = ranked_results.sort_values(by=['Metric', 'Rank'])

    print(results_df_sorted)
    return results_df_sorted


# Example usage:

results_df = evaluate_models()
sorted_results_df = get_ranked_models(results_df)

    

Successfully read 136 lines from the file. Loaded 135 ingredients.

Overall Accuracy of Preferences: 0.886667
Standard Deviation of Accuracies: 0.026718
(0.8866666666666667, 0.026718057476846562)

Overall Accuracy of Preferences: 0.886667
Standard Deviation of Accuracies: 0.026718
Iteration: 1, Model: Logistic Regression, SMOTE: True, Accuracy: 0.3715, Total Accuracy: 0.8867

Overall Accuracy of Preferences: 0.977778
Standard Deviation of Accuracies: 0.015301
(0.9777777777777779, 0.015300674947979917)

Overall Accuracy of Preferences: 0.977778
Standard Deviation of Accuracies: 0.015301
Iteration: 1, Model: Support Vector Machine, SMOTE: True, Accuracy: 0.2972, Total Accuracy: 0.9778

Overall Accuracy of Preferences: 0.951111
Standard Deviation of Accuracies: 0.024834
(0.9511111111111112, 0.02483415498405956)

Overall Accuracy of Preferences: 0.951111
Standard Deviation of Accuracies: 0.024834
Iteration: 1, Model: XGBoost, SMOTE: True, Accuracy: 0.3526, Total Accuracy: 0.9511

Overall A




Overall Accuracy of Preferences: 0.880741
Standard Deviation of Accuracies: 0.021499
(0.8807407407407408, 0.021498503260286645)

Overall Accuracy of Preferences: 0.880741
Standard Deviation of Accuracies: 0.021499
Iteration: 1, Model: AdaBoost, SMOTE: True, Accuracy: 0.3715, Total Accuracy: 0.8807

Overall Accuracy of Preferences: 0.958765
Standard Deviation of Accuracies: 0.021270
(0.9587654320987654, 0.021270426226451117)

Overall Accuracy of Preferences: 0.958765
Standard Deviation of Accuracies: 0.021270
Iteration: 1, Model: K-Nearest Neighbors, SMOTE: True, Accuracy: 0.3296, Total Accuracy: 0.9588

Overall Accuracy of Preferences: 0.980741
Standard Deviation of Accuracies: 0.015253
(0.9807407407407409, 0.01525278539405481)

Overall Accuracy of Preferences: 0.980741
Standard Deviation of Accuracies: 0.015253
Iteration: 1, Model: Decision Tree, SMOTE: True, Accuracy: 0.3004, Total Accuracy: 0.9807

Overall Accuracy of Preferences: 0.850617
Standard Deviation of Accuracies: 0.041291




Overall Accuracy of Preferences: 0.984198
Standard Deviation of Accuracies: 0.012356
(0.9841975308641976, 0.012355551608095618)

Overall Accuracy of Preferences: 0.984198
Standard Deviation of Accuracies: 0.012356
Iteration: 1, Model: MLP Classifier, SMOTE: True, Accuracy: 0.2972, Total Accuracy: 0.9842

Overall Accuracy of Preferences: 0.889136
Standard Deviation of Accuracies: 0.021167
(0.8891358024691358, 0.021166989797676818)

Overall Accuracy of Preferences: 0.889136
Standard Deviation of Accuracies: 0.021167
Iteration: 1, Model: Logistic Regression, SMOTE: False, Accuracy: 0.3644, Total Accuracy: 0.8891

Overall Accuracy of Preferences: 0.979012
Standard Deviation of Accuracies: 0.012984
(0.9790123456790123, 0.012983522790256698)

Overall Accuracy of Preferences: 0.979012
Standard Deviation of Accuracies: 0.012984
Iteration: 1, Model: Support Vector Machine, SMOTE: False, Accuracy: 0.2972, Total Accuracy: 0.9790

Overall Accuracy of Preferences: 0.958272
Standard Deviation of Ac




Overall Accuracy of Preferences: 0.898025
Standard Deviation of Accuracies: 0.024469
(0.8980246913580248, 0.024469372499509204)

Overall Accuracy of Preferences: 0.898025
Standard Deviation of Accuracies: 0.024469
Iteration: 1, Model: AdaBoost, SMOTE: False, Accuracy: 0.3700, Total Accuracy: 0.8980

Overall Accuracy of Preferences: 0.960000
Standard Deviation of Accuracies: 0.021520
(0.9600000000000002, 0.021519761550124358)

Overall Accuracy of Preferences: 0.960000
Standard Deviation of Accuracies: 0.021520
Iteration: 1, Model: K-Nearest Neighbors, SMOTE: False, Accuracy: 0.3249, Total Accuracy: 0.9600

Overall Accuracy of Preferences: 0.984198
Standard Deviation of Accuracies: 0.011272
(0.9841975308641975, 0.011271814528902059)

Overall Accuracy of Preferences: 0.984198
Standard Deviation of Accuracies: 0.011272
Iteration: 1, Model: Decision Tree, SMOTE: False, Accuracy: 0.2996, Total Accuracy: 0.9842

Overall Accuracy of Preferences: 0.852840
Standard Deviation of Accuracies: 0.04




Overall Accuracy of Preferences: 0.984444
Standard Deviation of Accuracies: 0.012149
(0.9844444444444443, 0.012149051456930906)

Overall Accuracy of Preferences: 0.984444
Standard Deviation of Accuracies: 0.012149
Iteration: 1, Model: MLP Classifier, SMOTE: False, Accuracy: 0.2964, Total Accuracy: 0.9844

Overall Accuracy of Preferences: 0.893827
Standard Deviation of Accuracies: 0.024888
(0.8938271604938273, 0.024888105023909434)

Overall Accuracy of Preferences: 0.893827
Standard Deviation of Accuracies: 0.024888
Iteration: 2, Model: Logistic Regression, SMOTE: True, Accuracy: 0.3700, Total Accuracy: 0.8938

Overall Accuracy of Preferences: 0.980247
Standard Deviation of Accuracies: 0.011042
(0.980246913580247, 0.01104231099999896)

Overall Accuracy of Preferences: 0.980247
Standard Deviation of Accuracies: 0.011042
Iteration: 2, Model: Support Vector Machine, SMOTE: True, Accuracy: 0.2957, Total Accuracy: 0.9802

Overall Accuracy of Preferences: 0.948889
Standard Deviation of Accur




Overall Accuracy of Preferences: 0.884691
Standard Deviation of Accuracies: 0.020836
(0.8846913580246915, 0.02083605377757014)

Overall Accuracy of Preferences: 0.884691
Standard Deviation of Accuracies: 0.020836
Iteration: 2, Model: AdaBoost, SMOTE: True, Accuracy: 0.3597, Total Accuracy: 0.8847

Overall Accuracy of Preferences: 0.957037
Standard Deviation of Accuracies: 0.022500
(0.9570370370370369, 0.022500317530655824)

Overall Accuracy of Preferences: 0.957037
Standard Deviation of Accuracies: 0.022500
Iteration: 2, Model: K-Nearest Neighbors, SMOTE: True, Accuracy: 0.3217, Total Accuracy: 0.9570

Overall Accuracy of Preferences: 0.980247
Standard Deviation of Accuracies: 0.012590
(0.980246913580247, 0.012590171638500697)

Overall Accuracy of Preferences: 0.980247
Standard Deviation of Accuracies: 0.012590
Iteration: 2, Model: Decision Tree, SMOTE: True, Accuracy: 0.2941, Total Accuracy: 0.9802

Overall Accuracy of Preferences: 0.830370
Standard Deviation of Accuracies: 0.027407





Overall Accuracy of Preferences: 0.983457
Standard Deviation of Accuracies: 0.008485
(0.9834567901234568, 0.008485353223426965)

Overall Accuracy of Preferences: 0.983457
Standard Deviation of Accuracies: 0.008485
Iteration: 2, Model: MLP Classifier, SMOTE: True, Accuracy: 0.2909, Total Accuracy: 0.9835

Overall Accuracy of Preferences: 0.889630
Standard Deviation of Accuracies: 0.024816
(0.8896296296296299, 0.024815736117041437)

Overall Accuracy of Preferences: 0.889630
Standard Deviation of Accuracies: 0.024816
Iteration: 2, Model: Logistic Regression, SMOTE: False, Accuracy: 0.3621, Total Accuracy: 0.8896

Overall Accuracy of Preferences: 0.975802
Standard Deviation of Accuracies: 0.017730
(0.9758024691358024, 0.017729701798534555)

Overall Accuracy of Preferences: 0.975802
Standard Deviation of Accuracies: 0.017730
Iteration: 2, Model: Support Vector Machine, SMOTE: False, Accuracy: 0.3059, Total Accuracy: 0.9758

Overall Accuracy of Preferences: 0.956543
Standard Deviation of Ac




Overall Accuracy of Preferences: 0.885679
Standard Deviation of Accuracies: 0.025133
(0.885679012345679, 0.025133085787847974)

Overall Accuracy of Preferences: 0.885679
Standard Deviation of Accuracies: 0.025133
Iteration: 2, Model: AdaBoost, SMOTE: False, Accuracy: 0.3542, Total Accuracy: 0.8857

Overall Accuracy of Preferences: 0.953333
Standard Deviation of Accuracies: 0.024429
(0.9533333333333334, 0.024429475401739727)

Overall Accuracy of Preferences: 0.953333
Standard Deviation of Accuracies: 0.024429
Iteration: 2, Model: K-Nearest Neighbors, SMOTE: False, Accuracy: 0.3178, Total Accuracy: 0.9533

Overall Accuracy of Preferences: 0.982469
Standard Deviation of Accuracies: 0.010207
(0.9824691358024692, 0.010207420547454034)

Overall Accuracy of Preferences: 0.982469
Standard Deviation of Accuracies: 0.010207
Iteration: 2, Model: Decision Tree, SMOTE: False, Accuracy: 0.2972, Total Accuracy: 0.9825

Overall Accuracy of Preferences: 0.834815
Standard Deviation of Accuracies: 0.029




Overall Accuracy of Preferences: 0.983951
Standard Deviation of Accuracies: 0.007670
(0.9839506172839506, 0.007670234354078563)

Overall Accuracy of Preferences: 0.983951
Standard Deviation of Accuracies: 0.007670
Iteration: 2, Model: MLP Classifier, SMOTE: False, Accuracy: 0.2988, Total Accuracy: 0.9840

Overall Accuracy of Preferences: 0.881975
Standard Deviation of Accuracies: 0.023883
(0.8819753086419754, 0.023883066250555536)

Overall Accuracy of Preferences: 0.881975
Standard Deviation of Accuracies: 0.023883
Iteration: 3, Model: Logistic Regression, SMOTE: True, Accuracy: 0.3628, Total Accuracy: 0.8820

Overall Accuracy of Preferences: 0.973827
Standard Deviation of Accuracies: 0.014406
(0.9738271604938272, 0.014405878662533154)

Overall Accuracy of Preferences: 0.973827
Standard Deviation of Accuracies: 0.014406
Iteration: 3, Model: Support Vector Machine, SMOTE: True, Accuracy: 0.3146, Total Accuracy: 0.9738

Overall Accuracy of Preferences: 0.940000
Standard Deviation of Acc




Overall Accuracy of Preferences: 0.873333
Standard Deviation of Accuracies: 0.023295
(0.8733333333333333, 0.023295089175579826)

Overall Accuracy of Preferences: 0.873333
Standard Deviation of Accuracies: 0.023295
Iteration: 3, Model: AdaBoost, SMOTE: True, Accuracy: 0.3621, Total Accuracy: 0.8733

Overall Accuracy of Preferences: 0.955062
Standard Deviation of Accuracies: 0.021968
(0.9550617283950618, 0.021968371772380462)

Overall Accuracy of Preferences: 0.955062
Standard Deviation of Accuracies: 0.021968
Iteration: 3, Model: K-Nearest Neighbors, SMOTE: True, Accuracy: 0.3241, Total Accuracy: 0.9551

Overall Accuracy of Preferences: 0.978765
Standard Deviation of Accuracies: 0.012207
(0.9787654320987654, 0.012206624290674834)

Overall Accuracy of Preferences: 0.978765
Standard Deviation of Accuracies: 0.012207
Iteration: 3, Model: Decision Tree, SMOTE: True, Accuracy: 0.3075, Total Accuracy: 0.9788

Overall Accuracy of Preferences: 0.846420
Standard Deviation of Accuracies: 0.03510




Overall Accuracy of Preferences: 0.982222
Standard Deviation of Accuracies: 0.011728
(0.9822222222222223, 0.011727745271148567)

Overall Accuracy of Preferences: 0.982222
Standard Deviation of Accuracies: 0.011728
Iteration: 3, Model: MLP Classifier, SMOTE: True, Accuracy: 0.3028, Total Accuracy: 0.9822

Overall Accuracy of Preferences: 0.890617
Standard Deviation of Accuracies: 0.021184
(0.8906172839506173, 0.02118426427535964)

Overall Accuracy of Preferences: 0.890617
Standard Deviation of Accuracies: 0.021184
Iteration: 3, Model: Logistic Regression, SMOTE: False, Accuracy: 0.3486, Total Accuracy: 0.8906

Overall Accuracy of Preferences: 0.974074
Standard Deviation of Accuracies: 0.013216
(0.9740740740740741, 0.01321622092945445)

Overall Accuracy of Preferences: 0.974074
Standard Deviation of Accuracies: 0.013216
Iteration: 3, Model: Support Vector Machine, SMOTE: False, Accuracy: 0.3012, Total Accuracy: 0.9741

Overall Accuracy of Preferences: 0.948148
Standard Deviation of Accu




Overall Accuracy of Preferences: 0.887654
Standard Deviation of Accuracies: 0.026782
(0.8876543209876544, 0.026781872769435968)

Overall Accuracy of Preferences: 0.887654
Standard Deviation of Accuracies: 0.026782
Iteration: 3, Model: AdaBoost, SMOTE: False, Accuracy: 0.3787, Total Accuracy: 0.8877

Overall Accuracy of Preferences: 0.951358
Standard Deviation of Accuracies: 0.024017
(0.951358024691358, 0.024016709115114238)

Overall Accuracy of Preferences: 0.951358
Standard Deviation of Accuracies: 0.024017
Iteration: 3, Model: K-Nearest Neighbors, SMOTE: False, Accuracy: 0.3194, Total Accuracy: 0.9514

Overall Accuracy of Preferences: 0.980741
Standard Deviation of Accuracies: 0.010751
(0.9807407407407408, 0.010751378299791236)

Overall Accuracy of Preferences: 0.980741
Standard Deviation of Accuracies: 0.010751
Iteration: 3, Model: Decision Tree, SMOTE: False, Accuracy: 0.3075, Total Accuracy: 0.9807

Overall Accuracy of Preferences: 0.847407
Standard Deviation of Accuracies: 0.036




Overall Accuracy of Preferences: 0.982716
Standard Deviation of Accuracies: 0.010875
(0.9827160493827163, 0.010875415084219879)

Overall Accuracy of Preferences: 0.982716
Standard Deviation of Accuracies: 0.010875
Iteration: 3, Model: MLP Classifier, SMOTE: False, Accuracy: 0.3051, Total Accuracy: 0.9827

Overall Accuracy of Preferences: 0.887160
Standard Deviation of Accuracies: 0.030463
(0.8871604938271604, 0.030462572076494495)

Overall Accuracy of Preferences: 0.887160
Standard Deviation of Accuracies: 0.030463
Iteration: 4, Model: Logistic Regression, SMOTE: True, Accuracy: 0.3652, Total Accuracy: 0.8872

Overall Accuracy of Preferences: 0.975802
Standard Deviation of Accuracies: 0.014046
(0.9758024691358026, 0.014045889040323847)

Overall Accuracy of Preferences: 0.975802
Standard Deviation of Accuracies: 0.014046
Iteration: 4, Model: Support Vector Machine, SMOTE: True, Accuracy: 0.3225, Total Accuracy: 0.9758

Overall Accuracy of Preferences: 0.951852
Standard Deviation of Acc




Overall Accuracy of Preferences: 0.880000
Standard Deviation of Accuracies: 0.022090
(0.8800000000000001, 0.022090142465525955)

Overall Accuracy of Preferences: 0.880000
Standard Deviation of Accuracies: 0.022090
Iteration: 4, Model: AdaBoost, SMOTE: True, Accuracy: 0.3874, Total Accuracy: 0.8800

Overall Accuracy of Preferences: 0.956790
Standard Deviation of Accuracies: 0.021132
(0.9567901234567905, 0.02113239847978232)

Overall Accuracy of Preferences: 0.956790
Standard Deviation of Accuracies: 0.021132
Iteration: 4, Model: K-Nearest Neighbors, SMOTE: True, Accuracy: 0.3241, Total Accuracy: 0.9568

Overall Accuracy of Preferences: 0.980988
Standard Deviation of Accuracies: 0.014400
(0.9809876543209876, 0.01439952919686255)

Overall Accuracy of Preferences: 0.980988
Standard Deviation of Accuracies: 0.014400
Iteration: 4, Model: Decision Tree, SMOTE: True, Accuracy: 0.3099, Total Accuracy: 0.9810

Overall Accuracy of Preferences: 0.858765
Standard Deviation of Accuracies: 0.036285





Overall Accuracy of Preferences: 0.982963
Standard Deviation of Accuracies: 0.011657
(0.9829629629629629, 0.011657354700358396)

Overall Accuracy of Preferences: 0.982963
Standard Deviation of Accuracies: 0.011657
Iteration: 4, Model: MLP Classifier, SMOTE: True, Accuracy: 0.3091, Total Accuracy: 0.9830

Overall Accuracy of Preferences: 0.890370
Standard Deviation of Accuracies: 0.026391
(0.8903703703703705, 0.026390892278296073)

Overall Accuracy of Preferences: 0.890370
Standard Deviation of Accuracies: 0.026391
Iteration: 4, Model: Logistic Regression, SMOTE: False, Accuracy: 0.3755, Total Accuracy: 0.8904

Overall Accuracy of Preferences: 0.976790
Standard Deviation of Accuracies: 0.011108
(0.9767901234567901, 0.011108367288100732)

Overall Accuracy of Preferences: 0.976790
Standard Deviation of Accuracies: 0.011108
Iteration: 4, Model: Support Vector Machine, SMOTE: False, Accuracy: 0.3012, Total Accuracy: 0.9768

Overall Accuracy of Preferences: 0.955802
Standard Deviation of Ac




Overall Accuracy of Preferences: 0.897284
Standard Deviation of Accuracies: 0.022930
(0.897283950617284, 0.022929751637024976)

Overall Accuracy of Preferences: 0.897284
Standard Deviation of Accuracies: 0.022930
Iteration: 4, Model: AdaBoost, SMOTE: False, Accuracy: 0.3597, Total Accuracy: 0.8973

Overall Accuracy of Preferences: 0.957531
Standard Deviation of Accuracies: 0.016992
(0.9575308641975309, 0.016992247441071837)

Overall Accuracy of Preferences: 0.957531
Standard Deviation of Accuracies: 0.016992
Iteration: 4, Model: K-Nearest Neighbors, SMOTE: False, Accuracy: 0.3304, Total Accuracy: 0.9575

Overall Accuracy of Preferences: 0.982222
Standard Deviation of Accuracies: 0.013333
(0.982222222222222, 0.013333333333333343)

Overall Accuracy of Preferences: 0.982222
Standard Deviation of Accuracies: 0.013333
Iteration: 4, Model: Decision Tree, SMOTE: False, Accuracy: 0.3138, Total Accuracy: 0.9822

Overall Accuracy of Preferences: 0.857778
Standard Deviation of Accuracies: 0.0381




Overall Accuracy of Preferences: 0.981235
Standard Deviation of Accuracies: 0.012648
(0.9812345679012348, 0.012648146641842665)

Overall Accuracy of Preferences: 0.981235
Standard Deviation of Accuracies: 0.012648
Iteration: 4, Model: MLP Classifier, SMOTE: False, Accuracy: 0.3130, Total Accuracy: 0.9812

Overall Accuracy of Preferences: 0.886667
Standard Deviation of Accuracies: 0.028313
(0.8866666666666666, 0.028313355338690952)

Overall Accuracy of Preferences: 0.886667
Standard Deviation of Accuracies: 0.028313
Iteration: 5, Model: Logistic Regression, SMOTE: True, Accuracy: 0.3628, Total Accuracy: 0.8867

Overall Accuracy of Preferences: 0.980000
Standard Deviation of Accuracies: 0.012269
(0.9800000000000002, 0.012268897035856645)

Overall Accuracy of Preferences: 0.980000
Standard Deviation of Accuracies: 0.012269
Iteration: 5, Model: Support Vector Machine, SMOTE: True, Accuracy: 0.3154, Total Accuracy: 0.9800

Overall Accuracy of Preferences: 0.945185
Standard Deviation of Acc




Overall Accuracy of Preferences: 0.877778
Standard Deviation of Accuracies: 0.026895
(0.8777777777777778, 0.026895452043966303)

Overall Accuracy of Preferences: 0.877778
Standard Deviation of Accuracies: 0.026895
Iteration: 5, Model: AdaBoost, SMOTE: True, Accuracy: 0.3621, Total Accuracy: 0.8778

Overall Accuracy of Preferences: 0.959506
Standard Deviation of Accuracies: 0.025064
(0.9595061728395062, 0.025063856870023736)

Overall Accuracy of Preferences: 0.959506
Standard Deviation of Accuracies: 0.025064
Iteration: 5, Model: K-Nearest Neighbors, SMOTE: True, Accuracy: 0.3439, Total Accuracy: 0.9595

Overall Accuracy of Preferences: 0.980000
Standard Deviation of Accuracies: 0.013944
(0.9799999999999999, 0.013943513978760985)

Overall Accuracy of Preferences: 0.980000
Standard Deviation of Accuracies: 0.013944
Iteration: 5, Model: Decision Tree, SMOTE: True, Accuracy: 0.3170, Total Accuracy: 0.9800

Overall Accuracy of Preferences: 0.847654
Standard Deviation of Accuracies: 0.03105




Overall Accuracy of Preferences: 0.984938
Standard Deviation of Accuracies: 0.009655
(0.9849382716049383, 0.009654920886922052)

Overall Accuracy of Preferences: 0.984938
Standard Deviation of Accuracies: 0.009655
Iteration: 5, Model: MLP Classifier, SMOTE: True, Accuracy: 0.3154, Total Accuracy: 0.9849

Overall Accuracy of Preferences: 0.883210
Standard Deviation of Accuracies: 0.024395
(0.88320987654321, 0.024394511914631874)

Overall Accuracy of Preferences: 0.883210
Standard Deviation of Accuracies: 0.024395
Iteration: 5, Model: Logistic Regression, SMOTE: False, Accuracy: 0.3628, Total Accuracy: 0.8832

Overall Accuracy of Preferences: 0.976543
Standard Deviation of Accuracies: 0.017745
(0.9765432098765433, 0.01774516899637625)

Overall Accuracy of Preferences: 0.976543
Standard Deviation of Accuracies: 0.017745
Iteration: 5, Model: Support Vector Machine, SMOTE: False, Accuracy: 0.3162, Total Accuracy: 0.9765

Overall Accuracy of Preferences: 0.954321
Standard Deviation of Accur




Overall Accuracy of Preferences: 0.881481
Standard Deviation of Accuracies: 0.028943
(0.8814814814814816, 0.028942653357219957)

Overall Accuracy of Preferences: 0.881481
Standard Deviation of Accuracies: 0.028943
Iteration: 5, Model: AdaBoost, SMOTE: False, Accuracy: 0.3399, Total Accuracy: 0.8815

Overall Accuracy of Preferences: 0.951358
Standard Deviation of Accuracies: 0.022114
(0.951358024691358, 0.022113589069192478)

Overall Accuracy of Preferences: 0.951358
Standard Deviation of Accuracies: 0.022114
Iteration: 5, Model: K-Nearest Neighbors, SMOTE: False, Accuracy: 0.3281, Total Accuracy: 0.9514

Overall Accuracy of Preferences: 0.981975
Standard Deviation of Accuracies: 0.010766
(0.9819753086419751, 0.010765545361279829)

Overall Accuracy of Preferences: 0.981975
Standard Deviation of Accuracies: 0.010766
Iteration: 5, Model: Decision Tree, SMOTE: False, Accuracy: 0.3202, Total Accuracy: 0.9820

Overall Accuracy of Preferences: 0.846420
Standard Deviation of Accuracies: 0.029




Overall Accuracy of Preferences: 0.985185
Standard Deviation of Accuracies: 0.009370
(0.9851851851851852, 0.009369711585684089)

Overall Accuracy of Preferences: 0.985185
Standard Deviation of Accuracies: 0.009370
Iteration: 5, Model: MLP Classifier, SMOTE: False, Accuracy: 0.3170, Total Accuracy: 0.9852
     Iteration  SMOTE                   Model        Metric     Score   Rank
35           0   True    Gaussian Naive Bayes      accuracy  0.433202    1.0
387          4   True    Gaussian Naive Bayes      accuracy  0.432411    2.0
79           0  False    Gaussian Naive Bayes      accuracy  0.426877    3.0
299          3   True    Gaussian Naive Bayes      accuracy  0.421344    4.0
211          2   True    Gaussian Naive Bayes      accuracy  0.418182    5.0
..         ...    ...                     ...           ...       ...    ...
5            0   True  Support Vector Machine  recall_macro  0.324905  106.0
93           1   True  Support Vector Machine  recall_macro  0.322290  107.0


In [None]:
# Assuming `sorted_results_df` is your sorted DataFrame from the previous code
output_file_path = "sorted_results.csv"

# Save the DataFrame to a CSV file
sorted_results_df.to_csv(output_file_path, index=False)

print(f"Sorted results have been saved to {output_file_path}")


In [None]:
import pandas as pd
import optuna
import os
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from models.preferences.prediction import PreferenceModel
from models.preferences.preference_utils import print_preference_difference_and_accuracy
from models.preferences.preference_utils import get_child_data, initialize_child_preference_data
from utils.process_data import get_data

# Define models with initial hyperparameters
models = {
    "Logistic Regression": LogisticRegression(solver='liblinear', C=1.0, max_iter=10000, class_weight='balanced'),
    "Support Vector Machine": SVC(C=1.0, kernel='rbf', probability=True, class_weight='balanced'),
    "XGBoost": XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, eval_metric='logloss', scale_pos_weight=1),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=None, criterion='gini', class_weight='balanced'),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
}

# Define scorers
scorers = {
    'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
    'recall_macro': make_scorer(recall_score, average='macro', zero_division=0),
    'f1_macro': make_scorer(f1_score, average='macro', zero_division=0),
    'accuracy': make_scorer(accuracy_score)
}

# Load and preprocess data
child_feature_data = get_child_data()
ingredient_df = get_data("data.csv")
file_path = os.path.join('', "preferences_visualization.png")

# Function to optimize hyperparameters using Optuna
def optimize_hyperparameters(model_name, ingredient_df, child_feature_data, n_trials=50):
    def objective(trial):
        # Re-initialize preferences for each trial
        initial_preference = initialize_child_preference_data(child_feature_data, ingredient_df, split=0.5, seed=None, plot_graphs=False)
        
        # Define the hyperparameter search space
        if model_name == "XGBoost":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
                'scale_pos_weight': trial.suggest_uniform('scale_pos_weight', 1.0, 3.0),
                'eval_metric': 'mlogloss',
                'use_label_encoder': False
            }
            model = XGBClassifier(**params)
        elif model_name == "Random Forest":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'max_depth': trial.suggest_int('max_depth', 5, 30),
                'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
                'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
                'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2']),
                'class_weight': 'balanced'
            }
            model = RandomForestClassifier(**params)
        elif model_name == "Support Vector Machine":
            params = {
                'C': trial.suggest_loguniform('C', 1e-3, 1e2),
                'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
                'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e-1),
                'probability': True,
                'class_weight': 'balanced'
            }
            model = SVC(**params)
        elif model_name == "Logistic Regression":
            params = {
                'C': trial.suggest_loguniform('C', 1e-3, 1e2),
                'solver': trial.suggest_categorical('solver', ['liblinear', 'lbfgs', 'sag']),
                'penalty': 'l2',
                'class_weight': 'balanced',
                'max_iter': 10000
            }
            model = LogisticRegression(**params)
        elif model_name == "Gradient Boosting":
            params = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 300),
                'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e-1),
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
                'max_features': trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])
            }
            model = GradientBoostingClassifier(**params)
        else:
            return 0

        file_path = ''
        # Re-run the preference model with the new hyperparameters
        predictor = PreferenceModel(
            ingredient_df, child_feature_data, initial_preference, model_name = model_name, visualize_data=False, apply_SMOTE=True, file_path=file_path, seed=None
        )
        updated_known_and_predicted_preferences = predictor.run_pipeline()
        
        accuracy, _, _, _ = print_preference_difference_and_accuracy(
            initial_preference, updated_known_and_predicted_preferences, summary_only=True
        )
        
        return accuracy
    
    study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())
    study.optimize(objective, n_trials=50)
    
    print(f"\nBest hyperparameters for {model_name}:")
    print(study.best_params)
    print(f"Best cross-validation accuracy: {study.best_value:.4f}")
    
    return study.best_params, study.best_value

# Main execution
top_3_models = sorted_results_df['Model'].value_counts().index[:3].tolist()

# Initialize preferences for Optuna optimization
initial_preference = initialize_child_preference_data(child_feature_data, ingredient_df, split=0.5, seed=None, plot_graphs=False)

# Optimize hyperparameters for each of the top 3 models
optimized_models = {}
for model_name in top_3_models:
    print(f"\nOptimizing hyperparameters for {model_name}...")
    best_params, best_score = optimize_hyperparameters(model_name, ingredient_df, child_feature_data, initial_preference)
    optimized_models[model_name] = {
        'best_params': best_params,
        'best_score': best_score
    }

# Display optimized hyperparameters and scores
print("\nOptimized Hyperparameters and Scores for Top 3 Models:")
for model_name in optimized_models:
    print(f"\nModel: {model_name}")
    print(f"Best Hyperparameters: {optimized_models[model_name]['best_params']}")
    print(f"Best Cross-Validation Accuracy: {optimized_models[model_name]['best_score']:.4f}")
