In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

from transformers import BertTokenizer, BertModel
import nltk
import spacy






In [None]:
# Load dataset
# Choose the compute device first (the first CUDA GPU when one is visible,
# otherwise the CPU), then read the JSON file from the working directory.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

file_path = 'full_format_recipes.json'
data = pd.read_json(file_path)




In [None]:

# Basic exploration
# Sanity-check the freshly loaded frame: its (rows, columns) shape and the
# labels of the columns it exposes.
print(f"Dataset shape: {data.shape}")
column_index = data.columns
print("Columns:", column_index)



In [None]:
# Focus on relevant columns
# Keep only the two text fields and the target, and drop rows missing any of
# them. This is done as one chained expression that produces a fresh,
# independent frame: calling dropna(inplace=True) on the result of a column
# selection (and later adding columns to it) is what makes pandas emit
# SettingWithCopyWarning. The explicit .copy() makes ownership unambiguous.
data = data[['directions', 'desc', 'rating']].dropna().copy()



In [None]:
# Analyze relationships
# Count how many rows carry each distinct rating value.
rating_counts = data.groupby('rating').size()
print(rating_counts)




In [None]:
# Text preprocessing function
# Loaded spaCy pipelines, keyed by model name, so each model is read from disk
# at most once per kernel session instead of once per processed text.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for `model_name`, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def preprocess_text(text, nlp=None):
    """Lemmatize `text`, keeping only alphabetic, non-stop-word tokens.

    Args:
        text: The string to clean.
        nlp: Optional callable that turns a string into an iterable of tokens
            exposing `lemma_`, `is_stop` and `is_alpha`. When omitted, the
            cached 'en_core_web_sm' spaCy pipeline is used.

    Returns:
        The lemmas of the retained tokens joined by single spaces (an empty
        string when no token survives the filter).
    """
    if nlp is None:
        # Loading the model is by far the most expensive step; reuse it.
        nlp = _get_spacy_pipeline()
    doc = nlp(text)
    return " ".join(
        token.lemma_ for token in doc if not token.is_stop and token.is_alpha
    )



In [None]:

# Apply preprocessing
def _clean_field(value):
    """Join list-valued entries with single spaces, then run preprocess_text."""
    if isinstance(value, list):
        value = " ".join(value)
    return preprocess_text(value)


# Each raw text column gets a cleaned companion column.
for source_col, target_col in (('directions', 'processed_directions'),
                               ('desc', 'processed_desc')):
    data[target_col] = data[source_col].apply(_clean_field)




In [None]:
# Train-test split
# Hold out 20% of the rows; the fixed random_state keeps the partition
# identical from run to run.
features = data['processed_directions']
targets = data['rating']
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)


In [None]:

# TF-IDF Representation
# The vocabulary cap is handed to the vectorizer through max_features.
max_tfidf_features = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features)


In [None]:
# Model pipeline for Random Forest
# Vectorizer and regressor are chained so that a single fit() call on the
# training texts fits both stages.
forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
pipeline_steps = [
    ('tfidf', tfidf_vectorizer),
    ('regressor', forest_regressor),
]
rf_pipeline = Pipeline(pipeline_steps)

rf_pipeline.fit(X_train, y_train)



In [None]:

# Predictions and evaluation
# Score the held-out texts, then report mean squared error and R^2 against
# the true ratings.
rf_predictions = rf_pipeline.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)
print("Random Forest MSE:", rf_mse)
print("Random Forest R2 Score:", rf_r2)

In [None]:
# Using BERT embeddings for contextual embeddings
# Tokenizer and encoder are both loaded from the same checkpoint name.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertModel.from_pretrained(bert_checkpoint)



In [None]:
def get_bert_embedding(text):
    """Embed `text` as the mean of BERT's final-layer token vectors.

    Args:
        text: A string (or a list of strings) accepted by the module-level
            `tokenizer`. Inputs longer than 512 tokens are truncated.

    Returns:
        A NumPy array with one row per input text and one column per hidden
        unit of `bert_model`.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Run on whatever device the encoder currently lives on, so the function
    # keeps working if bert_model is moved to a GPU.
    model_device = next(bert_model.parameters()).device
    inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}

    # Inference only: skipping autograd avoids building a graph for every text.
    with torch.no_grad():
        outputs = bert_model(**inputs)

    hidden = outputs.last_hidden_state
    # Average over real tokens only. For a single text there is no padding, so
    # this equals a plain mean; for a batch it keeps pad positions from
    # diluting the shorter texts' embeddings.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

    # .cpu() is a no-op on CPU tensors and required before .numpy() on CUDA ones.
    return pooled.cpu().numpy()


In [None]:

# Generate BERT embeddings for a small subset (for demonstration purposes)
# Only the first 100 rows are embedded (smaller subset to avoid memory
# issues); the per-text results are stacked row-wise into one matrix.
sample_data = data[:100]
per_text_embeddings = [
    get_bert_embedding(text) for text in sample_data['processed_directions']
]
sample_embeddings = np.vstack(per_text_embeddings)

In [None]:
# Split embeddings
# Targets are the ratings of the same sampled rows the embeddings were built
# from; 20% is held out with a fixed random_state.
y_sample = sample_data['rating']
bert_split = train_test_split(
    sample_embeddings, y_sample, test_size=0.2, random_state=42
)
X_train_bert, X_test_bert, y_train_bert, y_test_bert = bert_split


In [None]:
# Train a neural network using PyTorch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    """Feed-forward regressor: Linear -> ReLU -> Linear with one output unit.

    Args:
        input_size: Number of features in the last dimension of the input.
        hidden_size: Width of the hidden layer. Defaults to 128, the value
            that used to be hard-coded, so `SimpleNN(input_size)` builds the
            same architecture as before.
    """

    def __init__(self, input_size, hidden_size=128):
        super().__init__()
        # The attribute name `fc` and the layer order are kept so parameter
        # names (fc.0.*, fc.2.*) stay the same.
        self.fc = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        """Return one prediction per input row (last dimension of size 1)."""
        return self.fc(x)

# Seed PyTorch before the model is built: the Linear layers are initialised
# randomly, so without a seed the training curve and test loss differ on
# every run. 42 matches the random_state used for the sklearn splits/models.
torch.manual_seed(42)

# The network's input width is the number of columns in the embedding matrix.
input_size = sample_embeddings.shape[1]
model = SimpleNN(input_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Convert data to tensors
def _as_column(values):
    """Build a float32 tensor from `values` and reshape it to (n, 1)."""
    return torch.tensor(values, dtype=torch.float32).view(-1, 1)


# Features stay as float32 matrices; targets become (n, 1) columns so they
# line up with the model's single output unit in the loss.
X_train_tensor = torch.tensor(X_train_bert, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_bert, dtype=torch.float32)
y_train_tensor = _as_column(y_train_bert.values)
y_test_tensor = _as_column(y_test_bert.values)

In [None]:
# Training loop
# Full-batch training: every epoch is one forward/backward pass over all
# training rows followed by a single optimizer step.
num_epochs = 50
log_every = 10

# Nothing inside the loop leaves training mode, so setting it once suffices.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Only report on every `log_every`-th epoch (1-based numbering).
    if (epoch + 1) % log_every:
        continue
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")



In [None]:
# Evaluation
# Switch to evaluation mode and score the held-out embeddings without
# tracking gradients; the loss reported is the same MSE criterion used in
# training.
model.eval()
with torch.no_grad():
    test_predictions = model(X_test_tensor)
    test_loss = criterion(test_predictions, y_test_tensor)

print("Neural Network Test Loss:", test_loss.item())