In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import os

# --- Names shared with the recommendation cells ---
# This cell defines the following module-level names, which the later cells read:
#   scaler_recommender, all_features_cols_recommender,
#   original_categorical_cols_recommender, original_binary_cols_recommender,
#   original_all_cuisines_recommender, numerical_cols_to_scale,
#   df_processed_for_recommender (the final processed DataFrame).
# (A `global` statement at module level has no effect, so none is used here.)

print("--- CELL 1: Data Preprocessing for Recommendation System ---")

# --- 1. Load the Dataset ---
print("--- Step 1: Loading the Dataset ---")
file_path = 'Dataset .csv'
if not os.path.exists(file_path):
    # Fail loudly: exit() would end the run with a success status and is not
    # guaranteed to exist outside the interactive/site environment.
    raise FileNotFoundError(
        f"'{file_path}' not found. Please ensure the dataset file is in the same directory."
    )

df = pd.read_csv(file_path)
print("Dataset loaded successfully!")

print(f"Initial dataset shape: {df.shape}")
print("Initial 5 rows:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))


# --- 2. Handle Missing Values ---
print("\n--- Step 2: Handling Missing Values ---")
print("Missing values before handling:")
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0].to_markdown(numalign="left", stralign="left"))

# Rows without a cuisine cannot be multi-label encoded, so they are removed.
df = df.dropna(subset=['Cuisines'])
print(f"Dataset shape after dropping rows with missing 'Cuisines': {df.shape}")
print("Missing values after handling:")
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0].to_markdown(numalign="left", stralign="left"))


# --- 3. Encode Categorical Variables ---
print("\n--- Step 3: Encoding Categorical Variables ---")

# Convert binary 'Yes'/'No' columns to 1/0 (anything other than 'Yes' becomes 0).
original_binary_cols_recommender = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
for col in original_binary_cols_recommender:
    df[col] = df[col].eq('Yes').astype(int)
print("Binary 'Yes'/'No' columns converted to 1/0.")
print(df[original_binary_cols_recommender].head().to_markdown(index=False, numalign="left", stralign="left"))


# Store all unique cuisines for consistent multi-label encoding of user profiles.
original_all_cuisines_recommender = sorted(df['Cuisines'].str.split(', ').explode().unique())


# Identify columns to drop (less relevant for content-based similarity or redundant).
# 'Restaurant ID', 'Restaurant Name' and 'Aggregate rating' are kept for the output.
columns_to_drop_recommender = [
    'Address',
    'Locality',
    'Locality Verbose',
    'Switch to order menu',  # Only 'No' values, not useful
    'Rating color',          # Highly correlated with the rating
    'Rating text',           # Highly correlated with the rating
]
df = df.drop(columns=columns_to_drop_recommender)
print(f"Dropped less relevant/redundant columns. Current shape: {df.shape}")


# One-hot encode nominal categorical columns.
# drop_first is left off on purpose: the recommender looks cities up through
# their 'City_<name>' indicator column, and dropping the first level would make
# that one city impossible to filter on. dtype=int keeps the feature matrix
# purely numeric regardless of the pandas version's default dummy dtype.
original_categorical_cols_recommender = ['Country Code', 'City', 'Currency']
df = pd.get_dummies(df, columns=original_categorical_cols_recommender, dtype=int)
print(f"Nominal categorical columns one-hot encoded. Current shape: {df.shape}")

# Safety net: get_dummies already removes the encoded source columns, so this
# only has an effect if one of them somehow survived.
df = df.drop(columns=[col for col in original_categorical_cols_recommender if col in df.columns], errors='ignore')
print("Explicitly dropped original categorical columns after one-hot encoding.")


# Handle 'Cuisines' with multi-label one-hot encoding (one column per cuisine).
cuisine_dummies = df['Cuisines'].str.get_dummies(sep=', ')
df = pd.concat([df, cuisine_dummies], axis=1)
df = df.drop(columns=['Cuisines'])
print(f"'Cuisines' column multi-label encoded. Current shape: {df.shape}")


# --- 4. Identify Features for Similarity Calculation and Scaling ---
# ID, name and rating identify/describe the result; they are not similarity features.
non_feature_cols = ['Restaurant ID', 'Restaurant Name', 'Aggregate rating']
features_for_similarity = [col for col in df.columns if col not in non_feature_cols]
numerical_cols_to_scale = ['Longitude', 'Latitude', 'Average Cost for two', 'Price range', 'Votes', 'Has Table booking', 'Has Online delivery', 'Is delivering now']
# Keep only the columns that exist in df and are numeric.
numerical_cols_to_scale = [col for col in numerical_cols_to_scale if col in df.columns and pd.api.types.is_numeric_dtype(df[col])]

# All feature columns, stored for consistent user profile creation.
all_features_cols_recommender = df.drop(columns=non_feature_cols, errors='ignore').columns


# --- 5. Feature Scaling ---
print("\n--- Step 5: Feature Scaling ---")
scaler_recommender = StandardScaler()
# Fit and transform only the numerical features. NOTE: after this step 'Votes'
# (and every other column in numerical_cols_to_scale) holds standardized values,
# not raw counts.
df[numerical_cols_to_scale] = scaler_recommender.fit_transform(df[numerical_cols_to_scale])
print("Numerical features scaled using StandardScaler.")

# Final DataFrame for the recommender, including ID, Name, Rating, and all processed features.
df_processed_for_recommender = df.copy()

print("\nPreprocessing for Recommendation System complete!")
print(f"Final processed DataFrame shape: {df_processed_for_recommender.shape}")
print("First 5 rows of processed data (features and IDs):")
print(df_processed_for_recommender.head().to_markdown(index=False, numalign="left", stralign="left"))


--- CELL 1: Data Preprocessing for Recommendation System ---
--- Step 1: Loading the Dataset ---
Dataset loaded successfully!
Initial dataset shape: (9551, 21)
Initial 5 rows:
| Restaurant ID   | Restaurant Name        | Country Code   | City             | Address                                                                 | Locality                                   | Locality Verbose                                             | Longitude   | Latitude   | Cuisines                         | Average Cost for two   | Currency         | Has Table booking   | Has Online delivery   | Is delivering now   | Switch to order menu   | Price range   | Aggregate rating   | Rating color   | Rating text   | Votes   |
|:----------------|:-----------------------|:---------------|:-----------------|:------------------------------------------------------------------------|:-------------------------------------------|:-------------------------------------------------------------|:------------|:-----

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Banner printed when the cell defining the recommendation functions starts running.
print("--- CELL 2: Recommendation System Core Logic ---")

def create_user_profile(user_preferences, df_template, scaler_obj, ohe_cat_cols, bin_cols, all_cuisines_list, num_cols_to_scale):
    """
    Creates a preprocessed user profile vector based on preferences,
    matching the feature space of the restaurant data.

    Args:
        user_preferences (dict): Dictionary of user preferences.
                                 Keys should match original column names.
                                 'Cuisines' may be a list/tuple/set of names or a
                                 comma-separated string.
                                 Keys that are not feature columns (e.g. 'Min_Rating')
                                 are discarded when aligning to the template.
        df_template (pd.DataFrame | pd.Index | list): The processed restaurant feature
                                 frame, or just its column labels. Only the column
                                 labels are used; they fix the order and content of
                                 the returned vector.
        scaler_obj: Fitted scaler exposing ``transform``; it is called with the columns
                    of ``num_cols_to_scale`` in that order.
        ohe_cat_cols (list): Original categorical columns that were one-hot encoded.
        bin_cols (list): Original binary columns; 'Yes' maps to 1, anything else
                         (including a missing key) maps to 0.
        all_cuisines_list (list): All known cuisines for multi-label encoding.
        num_cols_to_scale (list): Numerical columns that were scaled.

    Returns:
        pd.Series: Float vector indexed by the template's column labels.
    """
    template_columns = pd.Index(getattr(df_template, 'columns', df_template))

    # Everything except 'Cuisines' becomes a one-row frame; cuisines are encoded separately.
    row = {key: value for key, value in user_preferences.items() if key != 'Cuisines'}

    # 1. Binary columns: 'Yes' -> 1, otherwise 0 ('No' is assumed if not provided).
    for col in bin_cols:
        row[col] = int(row.get(col) == 'Yes')

    # Numeric features the user actually expressed an opinion on. Binary columns
    # always count as specified because of the explicit 'No' default above.
    specified = {col for col in num_cols_to_scale if row.get(col) is not None}

    user_df = pd.DataFrame([row]) if row else pd.DataFrame(index=[0])

    # 2. Cuisines: multi-label indicator row over the known cuisines.
    cuisine_flags = {cuisine: 0 for cuisine in all_cuisines_list}
    requested = user_preferences.get('Cuisines')
    if isinstance(requested, str):
        requested = requested.split(',')
    if isinstance(requested, (list, tuple, set)):
        for name in requested:
            name = name.strip()
            if name in cuisine_flags:  # unknown cuisines are ignored
                cuisine_flags[name] = 1
    cuisine_df = pd.DataFrame([cuisine_flags]) if cuisine_flags else pd.DataFrame(index=[0])

    # 3. One-hot encode the nominal columns the user supplied.
    # drop_first must stay off: with a single row the only level IS the first
    # level, so dropping it would erase the preference entirely. Levels the
    # template does not have are removed by the reindex below.
    cols_to_encode = [col for col in ohe_cat_cols if col in user_df.columns]
    if cols_to_encode:
        user_df = pd.get_dummies(user_df, columns=cols_to_encode, dtype=int)

    # Align with the restaurant feature space: missing columns are added as 0,
    # columns unknown to the template are dropped.
    profile = pd.concat([user_df, cuisine_df], axis=1)
    aligned = profile.reindex(columns=template_columns, fill_value=0)

    # 4. Scale numerical columns.
    scaled_cols = [col for col in num_cols_to_scale if col in aligned.columns]
    if scaled_cols:
        # Always feed the scaler its full column set in fitted order, even if the
        # template lacks some of those columns.
        fitted_order = list(num_cols_to_scale)
        scaler_input = aligned.reindex(columns=fitted_order, fill_value=0).astype(float)
        scaled = pd.DataFrame(
            scaler_obj.transform(scaler_input),
            columns=fitted_order,
            index=aligned.index,
        )
        for col in scaled_cols:
            # An unspecified numeric feature is set to 0 in scaled space (the
            # dataset mean). Scaling the raw placeholder 0 would instead yield
            # -mean/scale and distort the similarity scores.
            aligned[col] = scaled[col] if col in specified else 0.0

    return aligned.iloc[0].astype(float)  # Return as a Series


def _scaled_threshold(raw_value, column, scaler_obj, num_cols_to_scale):
    """Convert a raw value of `column` into the scaler's transformed units.

    Returns `raw_value` unchanged when `column` is not one of the scaled columns.
    """
    fitted_cols = list(num_cols_to_scale)
    if column not in fitted_cols:
        # Fall back to the names recorded by the scaler itself, if any.
        fitted_cols = list(getattr(scaler_obj, 'feature_names_in_', []))
        if column not in fitted_cols:
            return raw_value
    position = fitted_cols.index(column)
    value = float(raw_value)
    # mean_/scale_ may be None when centering/scaling was disabled on the scaler.
    mean = getattr(scaler_obj, 'mean_', None)
    scale = getattr(scaler_obj, 'scale_', None)
    if mean is not None:
        value = value - mean[position]
    if scale is not None:
        value = value / scale[position]
    return value


def get_recommendations(user_preferences, df_restaurants_features, scaler_obj, ohe_cat_cols, bin_cols, all_cuisines_list, num_cols_to_scale, top_n=10):
    """
    Recommends restaurants based on user preferences using content-based filtering.

    'Min_Rating', 'City' and 'Votes' are treated as hard filters and removed from
    the preferences before the similarity profile is built; all remaining keys
    feed the profile vector.

    Args:
        user_preferences (dict): Dictionary of user preferences (not modified).
                                 'Min_Rating' and 'Votes' are minimums in raw,
                                 unscaled units; 'City' must match a 'City_<name>'
                                 column of the restaurant frame.
        df_restaurants_features (pd.DataFrame): The preprocessed DataFrame of all restaurants
                                                with features, ID, Name, and Aggregate rating.
        scaler_obj (StandardScaler): The fitted scaler used for numerical features.
        ohe_cat_cols (list): List of original categorical columns for one-hot encoding.
        bin_cols (list): List of original binary columns for 0/1 conversion.
        all_cuisines_list (list): List of all unique cuisines for multi-label encoding.
        num_cols_to_scale (list): Numerical columns that were scaled, in the order
                                  the scaler was fitted on them.
        top_n (int): Number of top recommendations to return.

    Returns:
        pd.DataFrame: Up to `top_n` rows with 'Restaurant Name', 'Aggregate rating'
                      and 'Similarity Score', best match first. Empty (same columns)
                      when no restaurant survives the filters or the profile is all zeros.
    """
    result_columns = ['Restaurant Name', 'Aggregate rating', 'Similarity Score']

    # 1. Filter restaurants based on explicit user criteria.
    filtered_df = df_restaurants_features.copy()
    profile_prefs = dict(user_preferences)  # private copy; filter keys are popped from it

    if 'Min_Rating' in profile_prefs:
        min_rating = profile_prefs.pop('Min_Rating')
        filtered_df = filtered_df[filtered_df['Aggregate rating'] >= min_rating]
        print(f"Filtered for restaurants with Min_Rating >= {min_rating}. Remaining: {len(filtered_df)} restaurants.")

    # City is a hard filter, applied through its one-hot indicator column.
    if 'City' in profile_prefs:
        city_pref = profile_prefs.pop('City')
        city_col_name = f'City_{city_pref}'
        if city_col_name not in filtered_df.columns:
            print(f"Warning: City '{city_pref}' not found in the dataset after other filters. No restaurants remaining.")
            return pd.DataFrame(columns=result_columns)
        filtered_df = filtered_df[filtered_df[city_col_name] == 1].copy()
        print(f"Filtered for City: {city_pref}. Remaining: {len(filtered_df)} restaurants.")

    # Minimum number of votes. The 'Votes' column holds standardized values when
    # it is among the scaled columns, so the raw threshold has to be mapped into
    # the same units first; comparing it directly would reject every restaurant.
    if 'Votes' in profile_prefs:
        min_votes = profile_prefs.pop('Votes')
        votes_cutoff = _scaled_threshold(min_votes, 'Votes', scaler_obj, num_cols_to_scale)
        filtered_df = filtered_df[filtered_df['Votes'] >= votes_cutoff].copy()
        print(f"Filtered for restaurants with at least {min_votes} votes. Remaining: {len(filtered_df)} restaurants.")

    # Ensure there are restaurants left after filtering.
    if filtered_df.empty:
        print("No restaurants found matching the filtering criteria.")
        return pd.DataFrame(columns=result_columns)

    # 2. Create the user profile vector over the feature columns only.
    non_feature_cols = {'Restaurant ID', 'Restaurant Name', 'Aggregate rating', 'Similarity Score'}
    restaurant_feature_cols = [col for col in filtered_df.columns if col not in non_feature_cols]

    user_profile_vector = create_user_profile(
        profile_prefs,
        filtered_df[restaurant_feature_cols],  # template: fixes column order of the profile
        scaler_obj,
        ohe_cat_cols,
        bin_cols,
        all_cuisines_list,
        num_cols_to_scale
    )

    # Explicit float casts keep mixed bool/int/float columns from producing
    # object arrays.
    user_profile_array = user_profile_vector.to_numpy(dtype=float).reshape(1, -1)
    restaurant_features_array = filtered_df[restaurant_feature_cols].to_numpy(dtype=float)

    # 3. Calculate cosine similarity; a zero vector has no direction to compare.
    if np.linalg.norm(user_profile_array) == 0:
        print("User profile is all zeros. Cannot calculate meaningful similarity.")
        return pd.DataFrame(columns=result_columns)

    similarity_scores = cosine_similarity(user_profile_array, restaurant_features_array).flatten()

    # 4. Attach the scores, 5. rank by them, 6. return the top N.
    filtered_df['Similarity Score'] = similarity_scores
    recommended_restaurants = filtered_df.sort_values(by='Similarity Score', ascending=False)
    return recommended_restaurants[result_columns].head(top_n)

# Confirmation printed once both function definitions above have been executed.
print("Recommendation system core logic loaded.")

--- CELL 2: Recommendation System Core Logic ---
Recommendation system core logic loaded.


In [None]:
import pandas as pd # Ensure pandas is imported if running this cell independently
# Assuming df_processed_for_recommender, scaler_recommender, etc. are global from Cell 1 and 2

print("--- CELL 3: Test the Recommendation System ---")

# Sample preference sets. 'Min_Rating', 'City' and 'Votes' act as hard filters
# inside get_recommendations; the remaining keys shape the similarity profile.
user_pref_1 = {
    'City': 'New Delhi',
    'Cuisines': ['Italian', 'Pizza'],
    'Price range': 2,
    'Has Online delivery': 'Yes',
    'Min_Rating': 3.8,  # only well-rated places
    'Votes': 100,       # require a minimum amount of feedback
}

user_pref_2 = {
    'City': 'London',
    'Cuisines': ['Cafe', 'Desserts', 'European'],
    'Average Cost for two': 50,  # NOTE(review): unit presumably follows 'Currency' - confirm
    'Currency': 'Pounds(GBP)',
    'Has Table booking': 'Yes',
    'Min_Rating': 4.0,
}

user_pref_3 = {
    'City': 'Bangalore',
    'Cuisines': ['North Indian', 'Biryani'],
    'Price range': 3,
    'Has Online delivery': 'No',  # explicitly no online delivery
    'Min_Rating': 3.0,
}


def _run_demo(banner, preferences):
    """Print `banner`, fetch recommendations for `preferences`, print and return them."""
    print(banner)
    results = get_recommendations(
        preferences,
        df_processed_for_recommender,
        scaler_recommender,
        original_categorical_cols_recommender,
        original_binary_cols_recommender,
        original_all_cuisines_recommender,
        numerical_cols_to_scale,
    )
    if results.empty:
        print("No recommendations found for this preference.")
    else:
        print(results.to_markdown(index=False, numalign="left", stralign="left"))
    return results


recommendations_1 = _run_demo(
    "\n--- Testing with User Preferences 1 (New Delhi, Italian/Pizza, High Rating) ---",
    user_pref_1,
)

recommendations_2 = _run_demo(
    "\n--- Testing with User Preferences 2 (London, Cafe/Desserts, Very High Rating) ---",
    user_pref_2,
)

recommendations_3 = _run_demo(
    "\n--- Testing with User Preferences 3 (Bangalore, North Indian/Biryani, Moderate Rating, No Online Delivery) ---",
    user_pref_3,
)

print("\nRecommendation system testing complete.")


--- CELL 3: Test the Recommendation System ---

--- Testing with User Preferences 1 (New Delhi, Italian/Pizza, High Rating) ---
Filtered for restaurants with Min_Rating >= 3.8. Remaining: 2109 restaurants.
Filtered for City: New Delhi. Remaining: 688 restaurants.
Filtered for restaurants with at least 100 votes. Remaining: 0 restaurants.
No restaurants found matching the filtering criteria.
No recommendations found for this preference.

--- Testing with User Preferences 2 (London, Cafe/Desserts, Very High Rating) ---
Filtered for restaurants with Min_Rating >= 4.0. Remaining: 1378 restaurants.
Filtered for City: London. Remaining: 19 restaurants.
| Restaurant Name   | Aggregate rating   | Similarity Score   |
|:------------------|:-------------------|:-------------------|
| Masala Zone       | 4.1                | 0.184763           |
| Roti Chai         | 4.5                | 0.0777129          |
| Gymkhana          | 4.7                | -0.0127951         |
| Nobu              | 4.4