In [1]:
pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

In [3]:
df = pd.read_csv("../data/recipes.csv")

In [4]:
df.drop(columns=["Unnamed: 0", "prep_time", "cook_time", "total_time", "servings",
    "yield", "rating", "url", "nutrition", "img_src"], inplace = True)

In [5]:
df = df[['ingredients', 'directions','timing','cuisine_path', 'recipe_name']].dropna().drop_duplicates()

In [6]:
cuisine_counts = df['cuisine_path'].value_counts()
threshold = 4
df['cuisine_grouped'] = df['cuisine_path'].apply(lambda x: x if cuisine_counts[x] > threshold else 'Other')

In [7]:
df['combined_text'] = df['ingredients'] + ' ' + df['directions'] + ' ' + df['timing']

In [8]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\d+', '', text)  # remove numbers
    return text

df['combined_text'] = df['combined_text'].apply(clean_text)


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(df['combined_text'])

In [10]:
y_cuisine = df["cuisine_grouped"]
y_recipe = df["recipe_name"]

In [11]:
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_cuisine, test_size=0.2, random_state=42)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_recipe, test_size=0.2, random_state=42)

In [12]:
cuisine_model = RandomForestClassifier()
cuisine_model.fit(X_train_c, y_train_c)

recipe_model = RandomForestClassifier()
recipe_model.fit(X_train_r, y_train_r)

In [13]:
pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
cat_model = CatBoostClassifier(iterations=200, verbose=100, task_type='GPU')
cat_model.fit(X_train_c, y_train_c)

Learning rate set to 0.235039
0:	learn: 2.5421158	total: 395ms	remaining: 1m 18s
100:	learn: 0.7977797	total: 18.6s	remaining: 18.3s
199:	learn: 0.5069335	total: 34.9s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x26b7f2f8dd0>

: 

In [None]:
cat_model = CatBoostClassifier(iterations=200, verbose=100, task_type='GPU')
cat_model.fit(X_train_r, y_train_r)

Learning rate set to 0.235039


In [None]:
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train_c, y_train_c)

In [None]:
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train_r, y_train_r)

In [None]:
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(X_train_c, y_train_c)

In [None]:
svm_rbf = SVC(kernel='rbf')
svm_rbf.fit(X_train_r, y_train_r)

In [None]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_c, y_train_c)

In [None]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_r, y_train_r)

In [None]:
models = {
    "CatBoost": cat_model,
    "Linear SVM": svm_linear,
    "RBF SVM": svm_rbf,
    "Logistic Regression": log_reg
}

for name, model in models.items():
    y_pred = model.predict(X_test_c)
    acc = accuracy_score(y_test_c, y_pred)
    print(f"{name} Accuracy (Cuisine Prediction): {acc:.2f}")

In [None]:
models = {
    "CatBoost": cat_model,
    "Linear SVM": svm_linear,
    "RBF SVM": svm_rbf,
    "Logistic Regression": log_reg
}

for name, model in models.items():
    y_pred = model.predict(X_test_r)
    acc = accuracy_score(y_test_r, y_pred)
    print(f"{name} Accuracy (Recipe Prediction): {acc:.2f}")

In [None]:
    from sklearn.ensemble import VotingClassifier

    ensemble_model = VotingClassifier(estimators=[
        ('svm_linear', svm_linear),
        ('svm_rbf', svm_rbf),
        ('log_reg', log_reg)
    ], voting='hard')
    ensemble_model.fit(X_train_c, y_train_c)

    y_pred_ensemble = ensemble_model.predict(X_test_c)
    ensemble_acc = accuracy_score(y_test_c, y_pred_ensemble)
    print(f"Ensemble Model Accuracy (Cuisine Prediction): {ensemble_acc:.2f}")


In [None]:
    from sklearn.ensemble import VotingClassifier

    ensemble_model = VotingClassifier(estimators=[
        ('svm_linear', svm_linear),
        ('svm_rbf', svm_rbf),
        ('log_reg', log_reg)
    ], voting='hard')
    ensemble_model.fit(X_train_r, y_train_r)

    y_pred_ensemble = ensemble_model.predict(X_test_r)
    ensemble_acc = accuracy_score(y_test_r, y_pred_ensemble)
    print(f"Ensemble Model Accuracy (recipe Prediction): {ensemble_acc:.2f}")


In [None]:
import matplotlib.pyplot as plt
df['cuisine_path'].value_counts().plot(kind='barh', figsize=(10,6))
plt.title('Cuisine Distribution')
plt.show()