Step 1: Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
import re

Step 2: Load the dataset

In [None]:
# Location of the preprocessed food dataset; update the path as needed.
file_path = 'preprocessed_food_data499.csv'

# Read the whole CSV into the notebook-wide DataFrame used by every later step.
df = pd.read_csv(filepath_or_buffer=file_path)

Display the first few rows of the dataset

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Check for missing values and data types

In [None]:
# Column dtypes and non-null counts (printed by pandas as a side effect).
df.info()

# Missing values per column; as the last expression this is the cell's output.
df.isna().sum()

Step 3: Preprocessing the data<br>
Encode categorical columns

In [None]:
# Fit one encoder per categorical column. Refitting a single LabelEncoder
# overwrites its learned `classes_`, so the first column's integer codes
# could no longer be mapped back to names with `inverse_transform`.
label_encoders = {}
for categorical_column in ['Category', 'Meal_Type']:
    column_encoder = LabelEncoder()
    df[categorical_column] = column_encoder.fit_transform(df[categorical_column])
    label_encoders[categorical_column] = column_encoder

# Keep the original name bound to the last-fitted (Meal_Type) encoder so any
# later cell that still refers to `label_encoder` behaves as before.
label_encoder = label_encoders['Meal_Type']

Standardize the numeric columns (zero mean, unit variance)

In [None]:
# Nutrient columns that are scaled here and used as features downstream.
numeric_columns = [
    'Calories (kcal)',
    'Protein (g)',
    'Carbohydrates (g)',
    'Fat (g)',
    'Fiber (g)',
    'Sugars (g)',
    'Sodium (mg)',
    'Cholesterol (mg)',
]

scaler = StandardScaler()

# NOTE: this overwrites the raw nutrient values in `df` with standardized
# scores, so every later step (including query filtering) sees z-scores.
scaled_values = scaler.fit_transform(df[numeric_columns])
df[numeric_columns] = scaled_values

Step 4: Train KMeans Model (Clustering)

In [None]:
# Feature matrix: the standardized nutrient columns.
X = df[numeric_columns]

# Pin n_init explicitly: scikit-learn's default changed from 10 to 'auto'
# across releases, which alters the clustering found (and emits a
# FutureWarning on some versions). Ten restarts keeps the multi-start
# behaviour reproducible regardless of the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Store each food's cluster id; it is used later as the classification target.
df['Cluster'] = kmeans.fit_predict(X)

Visualize KMeans Clusters using PCA

In [None]:
# Project the nutrient features onto two principal components for plotting.
pca_2d = PCA(n_components=2)
X_pca = pca_2d.fit_transform(df[numeric_columns])

In [None]:
# Scatter of the 2-D PCA projection, coloured by KMeans cluster id.
fig, ax = plt.subplots(figsize=(8, 6))
cluster_points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=df['Cluster'], cmap='viridis')
ax.set_title('KMeans Clustering of Food Items')
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
fig.colorbar(cluster_points, ax=ax, label='Cluster')
plt.show()

Step 5: Decision Tree Classifier<br>
Define the target (cluster labels) and create the classifier

In [None]:
# Classifier that will learn to reproduce the KMeans cluster assignment.
tree_model = DecisionTreeClassifier(random_state=42)

# Target: the cluster id assigned to each food item.
y = df['Cluster']

Split the data into training and testing sets

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Train the Decision Tree model

In [None]:
# Fit the decision tree on the training split only.
tree_model.fit(X=X_train, y=y_train)

Predict on test set and calculate accuracy

In [None]:
# Predicted cluster ids for the held-out rows.
y_pred = tree_model.predict(X_test)

# Fraction of held-out rows whose predicted cluster matches the KMeans label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy of Decision Tree Classifier: {accuracy * 100:.2f}%")

Step 6: Calculate Precision, Recall, F1 Score for Decision Tree Model<br>
Get the classification report (Precision, Recall, F1 Score)

In [None]:
# Dict form of the report so individual metrics can be read programmatically.
report = classification_report(y_test, y_pred, output_dict=True)

# Use the support-weighted averages across all clusters.
weighted_avg = report['weighted avg']
precision, recall, f1 = (
    weighted_avg[metric_key] for metric_key in ('precision', 'recall', 'f1-score')
)

Print the classification report

In [None]:
# Human-readable per-class report, printed under a heading line.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Step 7: Visualize Precision, Recall, F1 Score in a table

In [None]:
# Single-row table holding the three weighted-average scores.
metrics_df = pd.DataFrame(
    [{'Precision': precision, 'Recall': recall, 'F1 Score': f1}]
)

Plotting the table

In [None]:
# Render the one-row metrics table as an annotated heatmap with the axes hidden.
fig, ax = plt.subplots(figsize=(6, 3))
ax.axis('off')
sns.heatmap(
    metrics_df,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    cbar=False,
    annot_kws={"size": 15},
    ax=ax,
)
ax.set_title('Precision, Recall, and F1 Score for Decision Tree Classifier')
plt.show()

Step 8: Plotting Precision, Recall, and F1 Score in a Bar Graph

In [None]:
# One bar per metric; the y-axis is fixed to [0, 1] so the scores are comparable.
ax = metrics_df.plot.bar(
    figsize=(8, 6),
    color=['skyblue', 'lightgreen', 'salmon'],
    rot=0,
)
ax.set_title('Precision, Recall, and F1 Score Comparison')
ax.set_ylabel('Score')
ax.set_ylim(0, 1)
plt.show()

Step 9: Confusion Matrix for Additional Insights

In [None]:
# Rows are the actual clusters, columns the predicted clusters.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

Plot confusion matrix

In [None]:
# Derive the tick labels from the classes actually present instead of
# hard-coding range(5): without an explicit `labels=` argument,
# confusion_matrix orders its rows/columns by the sorted labels that occur
# in y_test or y_pred, so a cluster missing from the test split would make
# a fixed five-label axis disagree with the matrix shape.
class_labels = [str(label) for label in sorted(set(y_test) | set(y_pred))]

fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Decision Tree Classifier')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

Step 10: K-Nearest Neighbors (KNN) for Similar Food Retrieval

In [None]:
# Index every food's standardized nutrient vector for Euclidean
# nearest-neighbour lookup (five neighbours per query).
knn = NearestNeighbors(metric='euclidean', n_neighbors=5)
knn.fit(X)

Query similar foods (for example, the first food item)

In [None]:
# Example query: the first food item, kept as a one-row DataFrame so the
# feature layout matches what the index was fitted on.
query_food = X.head(1)

# Distances to, and row positions of, the nearest stored items.
distances, indices = knn.kneighbors(query_food, return_distance=True)

Visualize KNN similar food retrieval

In [None]:
# Background layer: every food item in the 2-D PCA projection, drawn in grey.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c='gray', alpha=0.5, label='Other Foods')

Highlight the retrieved similar foods (in red)

In [None]:
# Build the complete figure inside this cell. With Jupyter's inline backend a
# figure is rendered and closed when its cell finishes, so highlights added
# from a separate cell would be drawn on a fresh figure without the grey
# background points.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c='gray', alpha=0.5, label='Other Foods')

# `indices` has one row per query; row 0 holds the neighbours of the single query.
neighbor_rows = indices[0]
ax.scatter(
    X_pca[neighbor_rows, 0],
    X_pca[neighbor_rows, 1],
    c='red',
    marker='x',
    label='Similar Foods',
)
ax.set_title("KNN - Nearest Foods to Query")
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend()
plt.show()

Step 11: Function to map user input (low, medium, high, or an exact number) to nutritional ranges

In [None]:
def map_query_to_range(query, column, data=None):
    """
    Translate one query term into an inclusive (low, high) range for `column`.

    Parameters
    ----------
    query : str
        'low', 'medium' or 'high' (case-insensitive, surrounding whitespace
        ignored), or the text of a finite number such as '300', '2.5' or '-1'.
    column : str
        Name of the column whose quantiles define the level ranges.
    data : pandas.DataFrame, optional
        Frame to read the quantiles from. Defaults to the notebook-level `df`.

    Returns
    -------
    tuple
        (low, high) bounds: the 0-25%, 25-75% or 75-100% quantile span for
        'low' / 'medium' / 'high'; (value, value) for a number; (None, None)
        when the term is not recognised.

    Notes
    -----
    Bounds are expressed in whatever units `data[column]` holds. In this
    notebook the nutrient columns of `df` are overwritten with StandardScaler
    output before this function is used, so a numeric term is compared against
    standardized scores rather than grams / kcal.
    """
    if data is None:
        data = df  # notebook-level DataFrame

    term = str(query).strip().lower()

    # Quantile span for each named level.
    quantile_bounds = {
        'low': (0, 0.25),
        'medium': (0.25, 0.75),
        'high': (0.75, 1),
    }
    if term in quantile_bounds:
        lower_q, upper_q = quantile_bounds[term]
        return data[column].quantile(lower_q), data[column].quantile(upper_q)

    # Anything else must be a number. float() accepts decimals and signs,
    # which str.isnumeric() silently rejected.
    try:
        value = float(term)
    except ValueError:
        return None, None

    # 'nan' / 'inf' parse as floats but are not usable as an exact value.
    if not np.isfinite(value):
        return None, None
    return value, value

Step 12: Function to parse user query and filter the data based on nutritional conditions

In [None]:
# Named levels understood in a query.
_LEVELS = ('low', 'medium', 'high')

# An unsigned integer or decimal number.
_NUMBER = r'\d+(?:\.\d+)?'

# Column name -> regex fragment matching how the nutrient may be written.
_NUTRIENT_KEYWORDS = {
    'Calories (kcal)': r'calories?',
    'Protein (g)': r'proteins?',
    'Carbohydrates (g)': r'carb(?:ohydrate)?s?',
    'Fat (g)': r'fats?',
    'Fiber (g)': r'fib(?:er|re)',
    'Sugars (g)': r'sugars?',
    'Sodium (mg)': r'sodium',
    'Cholesterol (mg)': r'cholesterol',
}


def _extract_conditions(query):
    """Return {column: 'low' | 'medium' | 'high' | '<number>'} parsed from the query text."""
    # Clauses are separated by commas, semicolons or the word "and"; matching
    # inside one clause at a time keeps a number from one condition from being
    # attached to the nutrient named in the next.
    clauses = re.split(r'[,;]|\band\b', query.lower())
    level_or_number = '|'.join(_LEVELS) + '|' + _NUMBER

    conditions = {}
    for nutrient, keyword in _NUTRIENT_KEYWORDS.items():
        # "<level or number> <nutrient>", e.g. "low fat", "300 calories".
        value_first = re.compile(rf'({level_or_number})\s*{keyword}')
        # "[exact] <nutrient> <number>", e.g. "exact calories 300".
        value_last = re.compile(rf'{keyword}\s*[:=]?\s*({_NUMBER})')
        for clause in clauses:
            match = value_first.search(clause) or value_last.search(clause)
            if match:
                # The first clause that mentions the nutrient wins.
                conditions[nutrient] = match.group(1)
                break
    return conditions


def parse_query_and_filter(query):
    """
    Parse nutritional conditions out of a free-text query and filter `df` by them.

    Conditions are best written comma-separated, for example:
    'low fat, high protein, exact calories 300'

    Each condition is a level ('low', 'medium', 'high') or a number, written
    either before the nutrient ('300 calories') or after it ('calories 300',
    'exact calories 300'). Matching is case-insensitive; singular/plural
    nutrient names and 'carbs' are accepted. Nutrients not mentioned in the
    query are left unconstrained.

    Parameters
    ----------
    query : str
        The user's query text.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of the notebook-level `df` containing the rows that
        satisfy every recognised condition (all rows when none is recognised).

    Notes
    -----
    Values are compared in the units stored in `df`. In this notebook the
    nutrient columns are standardized before this function is used, so a
    numeric condition is matched against standardized scores.
    """
    filtered_df = df.copy()

    for nutrient, condition in _extract_conditions(query).items():
        if condition in _LEVELS:
            low, high = map_query_to_range(condition, nutrient)
        else:
            # A number is an exact match; resolved here so decimals are supported.
            low = high = float(condition)

        if low is not None and high is not None:
            # Series.between is inclusive on both ends by default.
            filtered_df = filtered_df[filtered_df[nutrient].between(low, high)]

    return filtered_df

Step 14: Test the user query system

In [None]:
# Ask for a free-text query, then keep only the foods that satisfy it.
query_prompt = "Please enter your query (e.g., 'low fat, high protein, exact calories 300'): "
user_query = input(query_prompt)
filtered_foods = parse_query_and_filter(query=user_query)

Display filtered foods

In [None]:
# Columns shown for each matching food item.
display_columns = [
    'Food_Item',
    'Calories (kcal)',
    'Protein (g)',
    'Fat (g)',
    'Fiber (g)',
    'Sugars (g)',
    'Sodium (mg)',
    'Cholesterol (mg)',
]

print("Filtered Foods based on your query:")
print(filtered_foods[display_columns])

Step 15: Visualizing the filtered foods based on PCA (Optional)

In [None]:
# Recompute the 2-D PCA projection of the nutrient features for this plot.
pca_for_filter_plot = PCA(n_components=2)
X_pca = pca_for_filter_plot.fit_transform(df[numeric_columns])

In [None]:
# Background layer: every food item in the 2-D PCA projection, drawn in grey.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c='gray', alpha=0.5, label='Other Foods')

Visualize filtered foods (highlight in green)

In [None]:
# Build the complete figure inside this cell. With Jupyter's inline backend a
# figure is rendered and closed when its cell finishes, so highlights added
# from a separate cell would be drawn on a fresh figure without the grey
# background points.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X_pca[:, 0], X_pca[:, 1], c='gray', alpha=0.5, label='Other Foods')

# X_pca is a plain array addressed by row position, while filtered_foods
# carries index labels from df; translate labels to positions so the lookup
# stays correct even if df's index is not 0..n-1.
filtered_rows = df.index.get_indexer(filtered_foods.index)
ax.scatter(
    X_pca[filtered_rows, 0],
    X_pca[filtered_rows, 1],
    c='green',
    marker='o',
    label='Filtered Foods',
)
ax.set_title("Filtered Foods Based on User Query")
ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.legend()
plt.show()