In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


# Load the product table from the first worksheet. Opening the workbook in a
# `with` block closes the underlying file handle as soon as the sheet has been
# read instead of leaving it open for the rest of the session; `xls` remains
# bound (to the now-closed workbook) afterwards.
with pd.ExcelFile("SHORT VERSION OF FOOD NUTRITUION.xlsx") as xls:
    df = pd.read_excel(xls, sheet_name="Sheet1")

# Data Cleaning
print("Cleaning and preprocessing data...")
# Replace missing text with "" so that the keyword search and the string
# concatenation performed on these two columns never encounter NaN.
df["categories_en"] = df["categories_en"].fillna("")
df["traces_en"] = df["traces_en"].fillna("")

# Gluten and dairy keywords (alternative spellings included). A product whose
# category or trace text mentions any of them is labelled as an allergen.
allergen_keywords = [
    "gluten", "wheat", "barley", "rye",
    "dairy", "milk", "cheese", "butter",
    "casein", "lactose", "yogurt", "cream",
    "custard", "bread", "cracker", "pasta",
    "skimmed milk", "whole milk", "milk solids",
    "buttermilk", "margarine", "sour cream",
    "ghee", "cheddar", "mozzarella", "parmesan",
    "ricotta", "cottage cheese", "feta",
    "spelt", "oats", "couscous", "semolina",
    "farro", "malt", "seitan", "durum",
]

# Every keyword is wrapped in \b...\b so it only matches as a whole word;
# the alternatives are then OR-ed together into one regular expression.
# NOTE(review): whole-word matching also skips plural forms such as "breads"
# or "cheeses" - confirm this is intended for the category vocabulary in use.
pattern = "|".join(fr"\b{k}\b" for k in allergen_keywords)

# Case-insensitive search of both text columns; a hit in either one yields 1.
_hit_in_categories = df["categories_en"].str.contains(pattern, case=False, na=False)
_hit_in_traces = df["traces_en"].str.contains(pattern, case=False, na=False)
df["allergen_label"] = (_hit_in_categories | _hit_in_traces).astype(int)

# A classifier cannot be trained unless both labels (0 and 1) are present.
if not df["allergen_label"].nunique() >= 2:
    raise ValueError("Dataset contains only one class. Ensure dataset balance before training.")

# Features: the two text columns joined by a single space.
# Labels: the keyword-derived 0/1 allergen flag.
X = df["categories_en"] + " " + df["traces_en"]
y = df["allergen_label"]

# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Improved SVM model with refined hyperparameters
print("Training optimized SVM model and evaluating accuracy...")
svm_pipeline = Pipeline([
    (
        "tfidf",
        # Character n-grams of 1-4 characters taken inside word boundaries.
        # `stop_words` is deliberately not passed: scikit-learn only applies it
        # when analyzer="word", so with "char_wb" it had no effect on the
        # features and merely triggered a UserWarning at fit time.
        TfidfVectorizer(
            analyzer="char_wb",
            ngram_range=(1, 4),
            max_features=15000,
            sublinear_tf=True,
        ),
    ),
    (
        "classifier",
        # probability=True keeps predict_proba() available on the fitted
        # pipeline, at the price of a slower fit (scikit-learn runs an internal
        # cross-validation for it). The code below only calls predict().
        SVC(kernel="rbf", C=10.0, gamma="scale", probability=True),
    ),
])
svm_pipeline.fit(X_train, y_train)

# Accuracy on the held-out 20%. The labels were derived from this same text by
# the keyword rule, so the score reflects how closely the SVM reproduces that
# rule rather than agreement with independently verified allergen data.
y_pred = svm_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Optimized SVM Accuracy: {accuracy:.4f}")

# Score every row of the table (training and test rows alike) with the fitted
# pipeline and store the result in a helper column on df.
_all_text = df["categories_en"] + " " + df["traces_en"]
df["allergen_prediction"] = svm_pipeline.predict(_all_text)

# Keep only the rows predicted as allergen-free (class 0); the helper column
# is dropped from the filtered copy but remains on df itself.
_predicted_free = df["allergen_prediction"] == 0
filtered_df = df.loc[_predicted_free].drop(columns=["allergen_prediction"])

# Write the filtered rows to CSV without the index column.
filtered_file_path = "filtered_allergy_free_foods_svm_final_v4.csv"
filtered_df.to_csv(filtered_file_path, index=False)
print(f"Filtered dataset saved as {filtered_file_path}. Upload and download it from Google Colab!")


Cleaning and preprocessing data...
Training optimized SVM model and evaluating accuracy...




Optimized SVM Accuracy: 0.9862
Filtered dataset saved as filtered_allergy_free_foods_svm_final_v4.csv. Upload and download it from Google Colab!


# New Section