In [1]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from transformers import BertTokenizer, BertModel
import spacy






In [2]:
# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load dataset
file_path = 'full_format_recipes.json' 
data = pd.read_json(file_path)




In [4]:

# Basic exploration
print(f"Dataset shape: {data.shape}")
print("Columns:", data.columns)

Dataset shape: (20130, 11)
Columns: Index(['directions', 'fat', 'date', 'categories', 'calories', 'desc',
       'protein', 'rating', 'title', 'ingredients', 'sodium'],
      dtype='object')


In [5]:
# Focus on relevant columns
StringData = data[['directions', 'desc', 'rating','categories','title']].copy()
StringData.dropna(inplace=True)



In [6]:
# Analyze relationships ##TODO: SMTH better than this shit
print(StringData.groupby('rating').size())


rating
0.000    1153
1.250     113
1.875      70
2.500     360
3.125     979
3.750    3322
4.375    5438
5.000    2049
dtype: int64


In [7]:
# Text preprocessing function
nlp = spacy.load('en_core_web_sm')
def preprocess_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop and token.is_alpha])


In [8]:


# Apply preprocessing
StringData['processed_directions'] = StringData['directions'].apply(lambda x: preprocess_text(" ".join(x) if isinstance(x, list) else x))
StringData['processed_desc'] = StringData['desc'].apply(lambda x: preprocess_text(" ".join(x) if isinstance(x, list) else x))
StringData['processed_categories'] = StringData['categories'].apply(lambda x: preprocess_text(" ".join(x) if isinstance(x, list) else x))
StringData['processed_title'] = StringData['title'].apply(lambda x: preprocess_text(" ".join(x) if isinstance(x, list) else x))

# TF-IDF vectorization for individual columns
vectorizer = TfidfVectorizer(max_features=5000)
categories_tfidf = vectorizer.fit_transform(data['processed_categories'])
desc_tfidf = vectorizer.fit_transform(data['processed_desc'])
directions_tfidf = vectorizer.fit_transform(data['processed_directions'])
title_tfidf = vectorizer.fit_transform(data['processed_title'])


In [9]:

# Display a few samples of the processed data
print(StringData[['processed_directions', 'processed_desc', 'processed_categories', 'processed_title']].head())


                                processed_directions  \
1  combine ingredient heavy medium saucepan add s...   
3  heat oil heavy large skillet medium high heat ...   
5  Mix basil mayonnaise butter processor basil fi...   
6  cook potato carrot large pot boiling salt wate...   
7  stir sugar chili powder whisk egg white water ...   

                                      processed_desc  \
1  use ingredient find boudin blanc classic frenc...   
3  sicilian style tomato sauce ton Mediterranean ...   
5                              recipe prepare minute   
6  serve newfangle main course salad crisp flatbr...   
7                 pop mouth burst bittersweet flavor   

                                processed_categories  \
1  Food Processor Onion Pork Bake Bastille Day Ne...   
3  fish Olive Tomato Sauté Low Fat Low Cal High F...   
5  Sandwich Food Processor Tomato kid Friendly Qu...   
6  Salad Potato Easter Low Fat Quick Easy Ham Asp...   
7  Egg Fruit Cook Cocktail Party Vegetarian Wi

In [12]:
# Combine all features into a single DataFrame
numeric_features = data[['fat', 'protein', 'calories', 'sodium']]
scaler = StandardScaler()
numeric_features = scaler.fit_transform(numeric_features)


vectorizer = TfidfVectorizer(max_features=5000)  # Use top 5000 features
categories_tfidf = vectorizer.fit_transform(StringData['processed_categories'])
desc_tfidf = vectorizer.fit_transform(StringData['processed_desc'])
directions_tfidf = vectorizer.fit_transform(StringData['processed_directions'])
title_tfidf = vectorizer.fit_transform(StringData['processed_title'])

In [13]:
#Convert data to X and Y
X = torch.tensor(
    pd.concat(
        [pd.DataFrame(categories_tfidf.toarray()), pd.DataFrame(desc_tfidf.toarray()), pd.DataFrame(directions_tfidf.toarray()), pd.DataFrame(title_tfidf.toarray()), pd.DataFrame(numeric_features)],
        axis=1
    ).values,
    dtype=torch.float32
)
y = torch.tensor(data['rating'].values, dtype=torch.float32).unsqueeze(1)  # Target variable



In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [15]:
# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

SyntaxError: cannot delete function call (261873487.py, line 1)

In [None]:
RF = RandomForestRegressor(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)


TypeError: 'RandomForestRegressor' object is not callable

In [None]:

# Predictions and evaluation
rf_predictions = rf_pipeline.predict(X_test)
print("Random Forest MSE:", mean_squared_error(y_test, rf_predictions))
print("Random Forest R2 Score:", r2_score(y_test, rf_predictions))

Random Forest MSE: 1.3997153112293332
Random Forest R2 Score: 0.1760447856207914


In [23]:
# Using BERT embeddings for contextual embeddings
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

In [24]:
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512).to(device)
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().cpu().numpy()


In [25]:

# Generate BERT embeddings for a small subset (for demonstration purposes)##TODO:Don't preprocess
#StringData_reduced100 = StringData[:100]  # Use smaller subset to avoid memory issues
sample_embeddings = np.vstack(StringData['processed_directions'].apply(get_bert_embedding))

In [26]:
# Split embeddings
y_sample = StringData['rating']
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(
    sample_embeddings, y_sample, test_size=0.2, random_state=42)



In [27]:
# Train a neural network using PyTorch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    def __init__(self, input_size):
        super(SimpleNN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        return self.fc(x)

input_size = sample_embeddings.shape[1]
model = SimpleNN(input_size).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [28]:
# Convert data to tensors
X_train_tensor = torch.tensor(X_train_bert, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_bert.values, dtype=torch.float32).view(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test_bert, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test_bert.values, dtype=torch.float32).view(-1, 1).to(device)

In [None]:
# Training loop
for epoch in range(50):
    model.train()
    optimizer.zero_grad()
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")



Epoch 10, Loss: 1.933030128479004
Epoch 20, Loss: 2.492696523666382
Epoch 30, Loss: 2.001760959625244
Epoch 40, Loss: 1.7109516859054565
Epoch 50, Loss: 1.6753886938095093


In [30]:
# Evaluation

model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
    test_loss = criterion(test_predictions, y_test_tensor)
    print("Neural Network Test Loss:", test_loss.item())


Neural Network Test Loss: 1.9380100965499878
