In [45]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report   

In [46]:
# Load the raw recipes table from the CSV stored in the sibling data directory.
recipes_csv_path = "../data/recipes.csv"
df = pd.read_csv(filepath_or_buffer=recipes_csv_path)

In [47]:
# Display the column labels of the loaded frame to see which fields are available.
df.columns

Index(['Unnamed: 0', 'recipe_name', 'prep_time', 'cook_time', 'total_time',
       'servings', 'yield', 'ingredients', 'directions', 'rating', 'url',
       'cuisine_path', 'nutrition', 'timing', 'img_src'],
      dtype='object')

In [48]:
# def time_to_minutes(time_str):
#     """Convert time strings like '1 hrs 30 mins' into total minutes."""
#     import re
#     if pd.isna(time_str):
#         return np.nan
#     hours = re.findall(r'(\d+)\s*hrs?', time_str)
#     minutes = re.findall(r'(\d+)\s*mins?', time_str)
#     total_minutes = int(hours[0]) * 60 if hours else 0
#     total_minutes += int(minutes[0]) if minutes else 0
#     return total_minutes

# df['prep_time'] = df['prep_time'].apply(time_to_minutes)
# df['cook_time'] = df['cook_time'].apply(time_to_minutes)
# df['total_time'] = df['total_time'].apply(time_to_minutes)

In [49]:
def time_to_minutes(time_str):
    """Convert a duration such as '1 hrs 30 mins' into total minutes.

    Parameters
    ----------
    time_str : str, int, float, or missing value
        Text containing any of "<n> day(s)", "<n> hr(s)" and "<n> min(s)".
        A value that is already numeric is treated as a minute count.

    Returns
    -------
    int or float
        ``np.nan`` for missing input, the unchanged number for numeric input,
        otherwise the summed minutes (0 when no recognised unit is present).
    """
    if pd.isna(time_str):
        return np.nan

    # Already-converted values pass through untouched. Without this guard,
    # re-running the conversion on a numeric column would stringify e.g. 90,
    # find no unit in "90", and silently replace every duration with 0.
    # bool is excluded on purpose: it is an int subclass but not a duration.
    if isinstance(time_str, (int, float, np.integer, np.floating)) and not isinstance(time_str, bool):
        return time_str

    text = str(time_str)  # tolerate non-string objects instead of raising TypeError

    # Minutes contributed by one unit of each recognised component.
    unit_patterns = (
        (r'(\d+)\s*days?', 24 * 60),
        (r'(\d+)\s*hrs?', 60),
        (r'(\d+)\s*mins?', 1),
    )

    total_minutes = 0
    for pattern, minutes_per_unit in unit_patterns:
        found = re.findall(pattern, text)
        if found:
            # Only the first occurrence of each unit is counted.
            total_minutes += int(found[0]) * minutes_per_unit
    return total_minutes

# Convert each duration column from its textual form to minutes via time_to_minutes.
for duration_column in ('prep_time', 'cook_time', 'total_time'):
    df[duration_column] = df[duration_column].apply(time_to_minutes)

In [50]:
# Derive a normalised text column from the raw ingredient strings:
# lowercase, keep only ASCII letters and whitespace, squeeze whitespace runs,
# and trim both ends.
df['cleaned_ingredients'] = (
    df['ingredients']
    .str.lower()
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Preview the raw text next to its cleaned counterpart.
print(df[['ingredients', 'cleaned_ingredients']].head())


                                         ingredients  \
0  3 tablespoons butter, 2 pounds Granny Smith ap...   
1  8 small Granny Smith apples, or as needed, ½ c...   
2  4  apples - peeled, cored and chopped, ¾ cup w...   
3  10 cups all-purpose apples, peeled, cored and ...   
4  18 cups thinly sliced apples, 3 tablespoons le...   

                                 cleaned_ingredients  
0  tablespoons butter pounds granny smith apples ...  
1  small granny smith apples or as needed cup uns...  
2  apples peeled cored and chopped cup water cup ...  
3  cups allpurpose apples peeled cored and sliced...  
4  cups thinly sliced apples tablespoons lemon ju...  


In [51]:
# Remove, in place, every row that is missing the target or either text field.
required_columns = ['recipe_name', 'cleaned_ingredients', 'directions']
df.dropna(axis=0, how='any', subset=required_columns, inplace=True)

In [52]:
# Keep only recipe names that occur at least twice; singletons are discarded.
class_counts = df['recipe_name'].value_counts()
repeated_names = class_counts.index[class_counts.to_numpy() > 1]
df = df.loc[df['recipe_name'].isin(repeated_names)]

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned ingredient text: vocabulary capped at 5000 terms,
# with scikit-learn's built-in English stop-word list removed.
tfidf_options = {"max_features": 5000, "stop_words": "english"}
vectorizer_ingredients = TfidfVectorizer(**tfidf_options)
X_ingredients = vectorizer_ingredients.fit_transform(df['cleaned_ingredients'])

# vectorizer_directions = TfidfVectorizer(max_features=5000, stop_words='english')
# X_directions = vectorizer_directions.fit_transform(df['directions'])

In [54]:
# # Combine features
# X = np.hstack((X_ingredients.toarray(), X_directions.toarray()))
# y = df['recipe_name']

In [55]:
# Features are the ingredient TF-IDF matrix; the target is the recipe name.
X, y = X_ingredients, df['recipe_name']

In [56]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified, so a class may end up in only one
# of the two partitions -- confirm whether `stratify=y` was intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [57]:
# Scale each feature using statistics learned from the training split only;
# with_mean=False turns off centring. The test split reuses the same fit.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [58]:
# Candidate classifiers, all constructed with the same seed.
model_seed = 42
logistic_model = LogisticRegression(
    max_iter=1000,
    random_state=model_seed,
)
svc_model = SVC(
    kernel='linear',
    probability=True,
    random_state=model_seed,
)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=model_seed,
)

In [59]:
# Display name -> estimator, in the order the results are reported.
models = {
    "Logistic Regression": logistic_model,
    "SVM": svc_model,
    "Random Forest": rf_model,
}

# One seeded 5-fold stratified splitter is reused for every model; a fixed
# integer seed yields the same folds each time it is asked to split.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print("Cross-Validation Results:")
for model_name in models:
    model = models[model_name]
    scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    print(f"{model_name}: {scores.mean():.4f}")

Cross-Validation Results:




Logistic Regression: 0.9097




SVM: 0.9097




Random Forest: 0.9097


In [60]:
# Fit the selected estimator on the whole training split, then predict the
# held-out rows. Swap in whichever model cross-validation favoured.
best_model = logistic_model
y_pred = best_model.fit(X_train, y_train).predict(X_test)

In [61]:
# Report held-out performance: overall accuracy, then per-class metrics.
print("\nTest Set Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
# Some labels are never predicted (or never appear in y_test), which makes
# their precision/recall undefined. zero_division=0 reports those as 0.0,
# the same value the default produces, without emitting an
# UndefinedMetricWarning for every metric.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Test Set Results:
Accuracy: 0.8947
Classification Report:
                                                               precision    recall  f1-score   support

                                        Amazing Apple Butter       1.00      1.00      1.00         1
                                                 Apple Crisp       1.00      1.00      1.00         1
                              Apple Crisp - Perfect and Easy       1.00      1.00      1.00         2
                                             Apple Crumb Pie       1.00      1.00      1.00         2
                                           Apple Pie Filling       1.00      1.00      1.00         3
                                             Apple Turnovers       1.00      1.00      1.00         1
                                       Apple-Cranberry Crisp       1.00      1.00      1.00         1
                                    Apple-Cranberry Crostada       1.00      1.00      1.00         2
                     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [62]:
import joblib

from sklearn.pipeline import make_pipeline

# Save the vectorizer and model.
# The classifier was fitted on features that had gone through `scaler`, so the
# scaler is bundled with it: whoever loads "recipe_model.pkl" and calls
# .predict() on raw vectorizer output then gets the same scaling that was used
# during training. Both steps are already fitted and are not refitted here.
joblib.dump(vectorizer_ingredients, "vectorizer_ingredients.pkl")
inference_model = make_pipeline(scaler, best_model)
joblib.dump(inference_model, "recipe_model.pkl")

['recipe_model.pkl']

In [63]:
# Sanity check: the model's input width should equal the vectorizer's vocabulary size.
model_feature_count = best_model.n_features_in_
vectorizer_feature_count = len(vectorizer_ingredients.get_feature_names_out())
print("Model was trained on:", model_feature_count, "features")
print("Vectorizer outputs:", vectorizer_feature_count, "features")

Model was trained on: 317 features
Vectorizer outputs: 317 features


In [64]:
import joblib
import re

# Restore the persisted vectorizer and model from disk.
# NOTE: joblib files are pickles -- only load files from a trusted source.
vectorizer_ingredients = joblib.load(filename="vectorizer_ingredients.pkl")
best_model = joblib.load(filename="recipe_model.pkl")

vocabulary_size = len(vectorizer_ingredients.get_feature_names_out())
print(f"Number of features in vectorizer: {vocabulary_size}")

# Function to preprocess user input
def preprocess_ingredients_input(ingredients, vectorizer_ingredients):
    """Clean a user-supplied ingredient string and vectorize it.

    The cleanup follows the one used for the training text (lowercase, keep
    only ASCII letters and whitespace, collapse whitespace runs), with one
    extra step: list separators are turned into spaces first.

    Parameters
    ----------
    ingredients : str
        Free text, typically a comma-separated ingredient list.
    vectorizer_ingredients : object with a ``transform(list_of_str)`` method
        The fitted vectorizer used to encode the cleaned text.

    Returns
    -------
    Whatever ``vectorizer_ingredients.transform`` returns for the single
    cleaned document.
    """
    text = ingredients.lower()
    # Separators must become spaces *before* punctuation is stripped;
    # otherwise "apple,sugar" collapses into the single unknown token
    # "applesugar" and the resulting vector carries no information.
    text = re.sub(r'[,;]', ' ', text)
    # Every other non-letter is deleted, not spaced, so that e.g.
    # "all-purpose" still becomes "allpurpose" as in the training text.
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    cleaned_ingredients = re.sub(r'\s+', ' ', text).strip()
    return vectorizer_ingredients.transform([cleaned_ingredients])

# Example: read an ingredient list interactively and vectorize it.
user_ingredients = input("Enter the ingredients (comma-separated): ")
user_vector = preprocess_ingredients_input(
    ingredients=user_ingredients,
    vectorizer_ingredients=vectorizer_ingredients,
)

# Predict the recipe name for that single input and show it.
# NOTE(review): the classifier was fitted on StandardScaler-scaled features,
# but no scaler is applied to user_vector in this cell -- confirm that the
# loaded model performs the scaling itself.
predicted_recipe = best_model.predict(user_vector)
top_prediction = predicted_recipe[0]
print(f"Predicted Recipe Name: {top_prediction}")

Number of features in vectorizer: 317
Predicted Recipe Name: Roquefort Pear Salad
