In [1]:
#Task 12
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Load the interactions dataset with user ratings
interactions_df = pd.read_csv('RAW_interactions_filtered.csv')

# Split data into training and test sets (80% training, 20% testing for each user)
train_data = []
test_data = []

for user_id, user_ratings in interactions_df.groupby('user_id'):
    if len(user_ratings) < 5:  # threshold that makes sense for data
        continue  # Skip or handle users with very few ratings
    # Split ratings for each user
    user_train, user_test = train_test_split(user_ratings, test_size=0.2, random_state=42)
    train_data.append(user_train)
    test_data.append(user_test)

# Concatenate user train and test sets into single DataFrames
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

def predict_rating_c(user_id, recipe_id, train_ratings, similarity_matrix):
    # Filter the user's ratings from the training set
    user_train_ratings = train_ratings[train_ratings['user_id'] == user_id]
    rated_recipes = user_train_ratings['recipe_id'].values
    ratings = user_train_ratings['rating'].values
    
    # Find similarity scores between target recipe and all recipes user rated in training set
    similarities = similarity_matrix.loc[recipe_id, rated_recipes].values
    
    # Weighted average of the ratings based on similarity
    if similarities.sum() > 0:
        predicted_rating = np.dot(similarities, ratings) / similarities.sum()
    else:
        predicted_rating = np.mean(ratings)  # Default to average rating if no similarities

    return predicted_rating

# Load the dataset
recipes_df = pd.read_csv('RAW_recipes_filtered.csv')

# Preprocess the ingredients column
# Join the list of ingredients into a single string for each recipe
recipes_df['ingredients_text'] = recipes_df['ingredients'].apply(lambda x: ' '.join(eval(x)) if pd.notna(x) else '')

# Initialize TF-IDF Vectorizer
tfidf_vectorizer_in = TfidfVectorizer()
tfidf_matrix_in= tfidf_vectorizer_in.fit_transform(recipes_df['ingredients_text'])

# Compute the cosine similarity matrix
cosine_sim_matrix_in = cosine_similarity(tfidf_matrix_in)

# Convert the similarity matrix to a DataFrame for easy lookup
similarity_df_in = pd.DataFrame(cosine_sim_matrix_in, index=recipes_df['id'], columns=recipes_df['id'])

# Load the dataset
recipes_df_1 = pd.read_csv('RAW_recipes_filtered.csv')

# Preprocess the descriptions by filling NaNs with an empty string
recipes_df_1['description'] = recipes_df_1['description'].fillna('')

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer_re = TfidfVectorizer(stop_words='english')
tfidf_matrix_re= tfidf_vectorizer_re.fit_transform(recipes_df_1['description'])

# Compute the cosine similarity matrix based on descriptions
cosine_sim_matrix_re = cosine_similarity(tfidf_matrix_re)

# Convert the similarity matrix to a DataFrame for easy lookup
similarity_df_re = pd.DataFrame(cosine_sim_matrix_re, index=recipes_df_1['id'], columns=recipes_df_1['id'])


# Load the dataset
recipes_df_tag= pd.read_csv('RAW_recipes_filtered.csv')

# Preprocess the tags column
# Join the list of tags into a single string for each recipe
recipes_df_tag['tags_text'] = recipes_df_tag['tags'].apply(lambda x: ' '.join(eval(x)) if pd.notna(x) else '')

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer_tag = TfidfVectorizer()
tfidf_matrix_tag= tfidf_vectorizer_tag.fit_transform(recipes_df_tag['tags_text'])

# Compute the cosine similarity matrix based on tags
cosine_sim_matrix_tag = cosine_similarity(tfidf_matrix_tag)

# Convert the similarity matrix to a DataFrame for easy lookup
similarity_df_tag = pd.DataFrame(cosine_sim_matrix_tag, index=recipes_df_tag['id'], columns=recipes_df_tag['id'])
# Save the dataset similarity_df


ingredient_similarity_df = similarity_df_in
description_similarity_df = similarity_df_tag
review_similarity_df = similarity_df_re

def calculate_user_similarity(train_df):
    # Pivot the training data to get a user-item rating matrix
    user_item_matrix = train_df.pivot(index='user_id', columns='recipe_id', values='rating')
    user_means = user_item_matrix.mean(axis=1)

    # Subtract the user's average rating from each rating to center data
    user_item_matrix_centered = user_item_matrix.sub(user_means, axis=0)

    # Calculate the Pearson correlation coefficient matrix between users
    user_similarity = user_item_matrix_centered.T.corr(method='pearson').fillna(0)
    return user_similarity

user_similarity = calculate_user_similarity(train_df)


def predict_rating(user_id, recipe_id, train_df, user_similarity, k=5):
    # Filter the user's ratings from the training set
    user_train_ratings = train_df[train_df['recipe_id'] == recipe_id]
    if user_train_ratings.empty:
        return np.nan  # If no ratings, we can't predict

    # Find similar users who have rated this recipe
    similarities = user_similarity[user_id]
    ratings_by_similar_users = user_train_ratings.set_index('user_id')['rating']

    # Keep only the top k most similar users who have rated the recipe
    similar_users = similarities[ratings_by_similar_users.index].nlargest(k)
    if similar_users.sum() == 0:
        return np.nan  # If no similar users, we can't predict

    # Calculate weighted average rating
    weighted_ratings = ratings_by_similar_users.loc[similar_users.index] * similar_users
    predicted_rating = weighted_ratings.sum() / similar_users.sum()
    return predicted_rating


# Define a function for the content-based prediction as done in Task 10
def content_based_predict(user_id, recipe_id, train_ratings, ingredient_sim, desc_sim, review_sim, alpha1=0.4, alpha2=0.3, alpha3=0.3):
    # Predict using a weighted combination of ingredient, description, and review-based similarities
    content_score = (alpha1 * predict_rating_c(user_id, recipe_id, train_ratings, ingredient_sim) +
                     alpha2 * predict_rating_c(user_id, recipe_id, train_ratings, desc_sim) +
                     alpha3 * predict_rating_c(user_id, recipe_id, train_ratings, review_sim))
    return content_score

# Define a function for the collaborative filtering prediction as done in Task 11
def collaborative_filtering_predict(user_id, recipe_id, train_ratings, user_similarity, k=5):
    return predict_rating(user_id, recipe_id, train_ratings, user_similarity, k)

# Define the hybrid prediction function
def hybrid_predict(user_id, recipe_id, train_ratings, ingredient_sim, desc_sim, review_sim, user_similarity, alpha=0.5):
    # Content-based prediction
    content_score = content_based_predict(user_id, recipe_id, train_ratings, ingredient_sim, desc_sim, review_sim)
    
    # Collaborative filtering prediction
    collaborative_score = collaborative_filtering_predict(user_id, recipe_id, train_ratings, user_similarity)
    
    # Combine predictions
    if np.isnan(content_score) and not np.isnan(collaborative_score):
        return collaborative_score
    elif np.isnan(collaborative_score) and not np.isnan(content_score):
        return content_score
    elif np.isnan(content_score) and np.isnan(collaborative_score):
        return np.nan  # No prediction if both are NaN

    # Weighted average of content and collaborative predictions
    hybrid_score = alpha * content_score + (1 - alpha) * collaborative_score
    return hybrid_score


# Evaluate the hybrid model
def evaluate_hybrid_model(test_df, train_df, ingredient_sim, desc_sim, review_sim, user_similarity, alpha=0.5):
    true_ratings = []
    predicted_ratings = []

    for _, row in test_df.iterrows():
        user_id = row['user_id']
        recipe_id = row['recipe_id']
        true_rating = row['rating']
        
        # Predict rating based on the hybrid model
        predicted_rating = hybrid_predict(user_id, recipe_id, train_df, ingredient_sim, desc_sim, review_sim, user_similarity, alpha)
        
        if not np.isnan(predicted_rating):
            # Store true and predicted ratings
            true_ratings.append(true_rating)
            predicted_ratings.append(predicted_rating)
    
    # Calculate evaluation metrics
    mae = mean_absolute_error(true_ratings, predicted_ratings).round(2)
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings)).round(2)
    return mae, rmse

# Run evaluation for the hybrid model with a chosen alpha
alpha = 0.5  # Adjust this value to balance content and collaborative filtering
mae, rmse = evaluate_hybrid_model(test_df, train_df, ingredient_similarity_df, description_similarity_df, review_similarity_df, user_similarity, alpha=alpha)
print(f"Hybrid Recommendation Model - MAE: {mae:.4f}, RMSE: {rmse:.4f}")

# Create a DataFrame to save the results
results_df = pd.DataFrame({
    'Model Based': ['Hybrid'],
    'MAE': [mae],
    'RMSE': [rmse]
})

# Save the DataFrame to a CSV file
results_df.to_csv('/evaluation_results_12.csv', index=False)



Hybrid Recommendation Model - MAE: 0.5400, RMSE: 1.0400
