In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import os

# --- Shared preprocessing state ---
# The following names are assigned at notebook (module) scope further down and
# are therefore already global: scaler_cuisine_classifier, mlb_cuisines,
# X_cuisine_train_cols, original_categorical_cols_cuisine and
# original_binary_cols_cuisine. A `global` statement at module level has no
# effect, so none is declared here.

print("--- CELL 1: Data Preprocessing for Cuisine Classification ---")

# --- 1. Load the Dataset ---
print("--- Step 1: Loading the Dataset ---")
file_path = 'Dataset .csv'
if not os.path.exists(file_path):
    # Raise instead of print + exit(): the failure is reported as a normal
    # traceback and the interpreter/kernel is not asked to shut down.
    raise FileNotFoundError(
        f"Error: '{file_path}' not found. Please ensure the dataset file is in the same directory."
    )

# Read the exact path that was just checked (previously a different,
# hard-coded absolute path was read, so the check above guarded nothing).
df = pd.read_csv(file_path)
print("Dataset loaded successfully!")

print(f"Initial dataset shape: {df.shape}")
print("Initial 5 rows:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))


# --- 2. Handle Missing Values ---
print("\n--- Step 2: Handling Missing Values ---")

# Per-column null counts; only columns that actually contain nulls are shown.
null_counts_before = df.isnull().sum()
print("Missing values before handling:")
print(null_counts_before[null_counts_before > 0].to_markdown(numalign="left", stralign="left"))

# 'Cuisines' is the target variable, so rows lacking it are removed in place.
df.dropna(subset=['Cuisines'], inplace=True)
print(f"Dataset shape after dropping rows with missing 'Cuisines': {df.shape}")

# Same report again, recomputed on the filtered frame.
null_counts_after = df.isnull().sum()
print("Missing values after handling:")
print(null_counts_after[null_counts_after > 0].to_markdown(numalign="left", stralign="left"))


# --- 3. Encode Categorical Variables ---
print("\n--- Step 3: Encoding Categorical Variables ---")

# Convert binary 'Yes'/'No' columns to 1/0.
# A vectorized comparison replaces the row-by-row apply(lambda ...); any value
# other than the exact string 'Yes' (including NaN) still maps to 0.
original_binary_cols_cuisine = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
for col in original_binary_cols_cuisine:
    df[col] = (df[col] == 'Yes').astype(int)
print("Binary 'Yes'/'No' columns converted to 1/0.")
print(df[original_binary_cols_cuisine].head().to_markdown(index=False, numalign="left", stralign="left"))


# Drop high cardinality and redundant columns (excluding 'Cuisines' as it's the target)
columns_to_drop_for_classification = [
    'Restaurant Name',
    'Address',
    'Locality',
    'Locality Verbose',
    'Switch to order menu', # Only 'No' values, not useful
    'Restaurant ID',        # Identifier, not a feature
    # NOTE(review): the earlier comment called this "a feature here", but the
    # column is dropped and therefore NOT used as a feature. Confirm that
    # excluding it is intended.
    'Aggregate rating'      # Target for Task 1; removed from this task's inputs
]
df.drop(columns=columns_to_drop_for_classification, inplace=True)
print(f"Dropped high cardinality/redundant columns. Current shape: {df.shape}")


# One-Hot Encode other nominal categorical features.
# drop_first=True omits one indicator column per original column.
original_categorical_cols_cuisine = ['Country Code', 'City', 'Currency', 'Rating color', 'Rating text']
df = pd.get_dummies(df, columns=original_categorical_cols_cuisine, drop_first=True)
print(f"Nominal categorical features one-hot encoded. Current shape: {df.shape}")


# --- 4. Prepare Target Variable (Cuisines) for Multi-label Classification ---
print("\n--- Step 4: Preparing Target Variable (Cuisines) ---")


def _cuisine_names(raw_value):
    """Split a comma-separated cuisine string into stripped, non-empty names."""
    stripped_pieces = (piece.strip() for piece in raw_value.split(','))
    return [name for name in stripped_pieces if name]


# One list of cuisine names per row, derived from the raw 'Cuisines' string.
df['Cuisines_List'] = df['Cuisines'].apply(_cuisine_names)

# The binarizer learns the label vocabulary and emits one 0/1 column per label.
mlb_cuisines = MultiLabelBinarizer()
y = mlb_cuisines.fit_transform(df['Cuisines_List'])

# DataFrame view of y with the learned class names as headers, for inspection only.
y_df = pd.DataFrame(y, columns=mlb_cuisines.classes_)

print(f"Shape of multi-label target (y): {y.shape}")
print(f"Number of unique cuisines: {len(mlb_cuisines.classes_)}")
print("First 5 rows of multi-label target (y):")
target_preview = y_df.head()
print(target_preview.to_markdown(index=False, numalign="left", stralign="left"))


# --- 5. Separate Features (X) ---
print("\n--- Step 5: Separating Features (X) ---")
# Every column is a feature except the raw target string and its parsed list.
target_related_cols = ['Cuisines', 'Cuisines_List']
X = df.drop(columns=target_related_cols)

print(f"Shape of features (X): {X.shape}")
print("First 5 rows of X (before scaling numerical features):")
feature_preview = X.head()
print(feature_preview.to_markdown(index=False, numalign="left", stralign="left"))


# --- 6. Splitting the Data into Training and Testing Sets ---
print("\n--- Step 6: Splitting Data into Training and Testing Sets ---")
# The split now happens BEFORE scaling. Fitting the scaler on the full dataset
# (as was done previously) lets test-set statistics leak into the training
# features; the scaler must only ever see the training rows when it is fitted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies so the column assignments below act on independent frames
# rather than on subsets derived from X.
X_train = X_train.copy()
X_test = X_test.copy()
X_cuisine_train_cols = X_train.columns # Store column names for consistent prediction

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


# --- 7. Feature Scaling ---
print("\n--- Step 7: Feature Scaling ---")
# Identify numerical columns to scale (excluding binary and one-hot encoded features).
# The filter keeps only candidates that exist in X and have a numeric dtype.
numerical_cols_to_scale = ['Longitude', 'Latitude', 'Average Cost for two', 'Price range', 'Votes']
numerical_cols_to_scale = [col for col in numerical_cols_to_scale if col in X.columns and pd.api.types.is_numeric_dtype(X[col])]

# Fit on the training rows only, then apply the same fitted transform to the
# test rows. X itself is left unscaled.
scaler_cuisine_classifier = StandardScaler()
X_train[numerical_cols_to_scale] = scaler_cuisine_classifier.fit_transform(X_train[numerical_cols_to_scale])
X_test[numerical_cols_to_scale] = scaler_cuisine_classifier.transform(X_test[numerical_cols_to_scale])
print("Numerical features scaled using StandardScaler.")
print("First 5 rows of X_train (after scaling numerical features):")
print(X_train.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nPreprocessing for Cuisine Classification complete! Data is ready for model training.")


--- CELL 1: Data Preprocessing for Cuisine Classification ---
--- Step 1: Loading the Dataset ---
Dataset loaded successfully!
Initial dataset shape: (9551, 21)
Initial 5 rows:
| Restaurant ID   | Restaurant Name        | Country Code   | City             | Address                                                                 | Locality                                   | Locality Verbose                                             | Longitude   | Latitude   | Cuisines                         | Average Cost for two   | Currency         | Has Table booking   | Has Online delivery   | Is delivering now   | Switch to order menu   | Price range   | Aggregate rating   | Rating color   | Rating text   | Votes   |
|:----------------|:-----------------------|:---------------|:-----------------|:------------------------------------------------------------------------|:-------------------------------------------|:-------------------------------------------------------------|:------------|:----

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
import os


# Load the dataset
file_path = 'Dataset .csv'
if not os.path.exists(file_path):
    # Raise instead of print + exit(): the failure is reported as a normal
    # traceback and the interpreter/kernel is not asked to shut down.
    raise FileNotFoundError(
        f"Error: '{file_path}' not found. Please ensure the dataset file is in the same directory."
    )
df = pd.read_csv(file_path)

# Handle Missing Values: rows without the 'Cuisines' target are removed.
df.dropna(subset=['Cuisines'], inplace=True)

# Convert binary 'Yes'/'No' columns to 1/0 with a vectorized comparison;
# anything other than the exact string 'Yes' (including NaN) maps to 0.
original_binary_cols_cuisine = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
for col in original_binary_cols_cuisine:
    df[col] = (df[col] == 'Yes').astype(int)

# Drop high cardinality and redundant columns
columns_to_drop_for_classification = [
    'Restaurant Name', 'Address', 'Locality', 'Locality Verbose',
    'Switch to order menu', 'Restaurant ID', 'Aggregate rating'
]
df.drop(columns=columns_to_drop_for_classification, inplace=True)

# One-Hot Encode other nominal categorical features
original_categorical_cols_cuisine = ['Country Code', 'City', 'Currency', 'Rating color', 'Rating text']
df = pd.get_dummies(df, columns=original_categorical_cols_cuisine, drop_first=True)

# Prepare Target Variable (Cuisines) for Multi-label Classification:
# split each comma-separated string into stripped, non-empty cuisine names,
# then binarize into one 0/1 column per distinct name.
df['Cuisines_List'] = df['Cuisines'].apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])
mlb_cuisines = MultiLabelBinarizer()
y = mlb_cuisines.fit_transform(df['Cuisines_List'])

# Separate Features (X)
X = df.drop(columns=['Cuisines', 'Cuisines_List'])

# Splitting the Data.
# Done BEFORE scaling so the scaler is fitted on training rows only; fitting
# it on the full dataset leaks test-set statistics into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies so the column assignments below act on independent frames.
X_train = X_train.copy()
X_test = X_test.copy()
X_cuisine_train_cols = X_train.columns

# Feature Scaling: keep only candidate columns that exist and are numeric,
# fit on train, then apply the same fitted transform to test.
numerical_cols_to_scale = ['Longitude', 'Latitude', 'Average Cost for two', 'Price range', 'Votes']
numerical_cols_to_scale = [col for col in numerical_cols_to_scale if col in X.columns and pd.api.types.is_numeric_dtype(X[col])]
scaler_cuisine_classifier = StandardScaler()
X_train[numerical_cols_to_scale] = scaler_cuisine_classifier.fit_transform(X_train[numerical_cols_to_scale])
X_test[numerical_cols_to_scale] = scaler_cuisine_classifier.transform(X_test[numerical_cols_to_scale])

print("--- Preprocessing complete. Data ready for model training. ---")

# --- Model Selection and Training ---
print("\n--- CELL 2: Model Selection and Training (Cuisine Classification) ---")

# Base estimator that OneVsRestClassifier fits once per label column of y_train.
# random_state pins the forest's randomness; n_jobs=-1 uses all available cores.
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)


print("\nTraining OneVsRestClassifier with RandomForestClassifier base estimator...")
# Assigned at notebook (module) scope, so the model is already global and
# available for evaluation; a `global` statement here would be a no-op.
cuisine_classifier_model = OneVsRestClassifier(base_classifier)
cuisine_classifier_model.fit(X_train, y_train)
print("Cuisine Classification Model trained successfully!")

print("\nModel is trained and ready for evaluation.")


--- Preprocessing complete. Data ready for model training. ---

--- CELL 2: Model Selection and Training (Cuisine Classification) ---

Training OneVsRestClassifier with RandomForestClassifier base estimator...




Cuisine Classification Model trained successfully!

Model is trained and ready for evaluation.


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import os
import numpy as np # For thresholding predictions



# Load the dataset
file_path = 'Dataset .csv'
if not os.path.exists(file_path):
    # Raise instead of print + exit(): the failure is reported as a normal
    # traceback and the interpreter/kernel is not asked to shut down.
    raise FileNotFoundError(
        f"Error: '{file_path}' not found. Please ensure the dataset file is in the same directory."
    )
df = pd.read_csv(file_path)

# Handle Missing Values: rows without the 'Cuisines' target are removed.
df.dropna(subset=['Cuisines'], inplace=True)

# Convert binary 'Yes'/'No' columns to 1/0 with a vectorized comparison;
# anything other than the exact string 'Yes' (including NaN) maps to 0.
original_binary_cols_cuisine = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
for col in original_binary_cols_cuisine:
    df[col] = (df[col] == 'Yes').astype(int)

# Drop high cardinality and redundant columns
columns_to_drop_for_classification = [
    'Restaurant Name', 'Address', 'Locality', 'Locality Verbose',
    'Switch to order menu', 'Restaurant ID', 'Aggregate rating'
]
df.drop(columns=columns_to_drop_for_classification, inplace=True)

# One-Hot Encode other nominal categorical features
original_categorical_cols_cuisine = ['Country Code', 'City', 'Currency', 'Rating color', 'Rating text']
df = pd.get_dummies(df, columns=original_categorical_cols_cuisine, drop_first=True)

# Prepare Target Variable (Cuisines) for Multi-label Classification:
# split each comma-separated string into stripped, non-empty cuisine names,
# then binarize into one 0/1 column per distinct name.
df['Cuisines_List'] = df['Cuisines'].apply(lambda x: [c.strip() for c in x.split(',') if c.strip()])
mlb_cuisines = MultiLabelBinarizer()
y = mlb_cuisines.fit_transform(df['Cuisines_List'])

# Separate Features (X)
X = df.drop(columns=['Cuisines', 'Cuisines_List'])

# Splitting the Data.
# Done BEFORE scaling so the scaler is fitted on training rows only; fitting
# it on the full dataset leaks test-set statistics into the training features.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Explicit copies so the column assignments below act on independent frames.
X_train = X_train.copy()
X_test = X_test.copy()
X_cuisine_train_cols = X_train.columns

# Feature Scaling: keep only candidate columns that exist and are numeric,
# fit on train, then apply the same fitted transform to test.
numerical_cols_to_scale = ['Longitude', 'Latitude', 'Average Cost for two', 'Price range', 'Votes']
numerical_cols_to_scale = [col for col in numerical_cols_to_scale if col in X.columns and pd.api.types.is_numeric_dtype(X[col])]
scaler_cuisine_classifier = StandardScaler()
X_train[numerical_cols_to_scale] = scaler_cuisine_classifier.fit_transform(X_train[numerical_cols_to_scale])
X_test[numerical_cols_to_scale] = scaler_cuisine_classifier.transform(X_test[numerical_cols_to_scale])

# Model Training: one random forest is fitted per label column of y_train.
base_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
cuisine_classifier_model = OneVsRestClassifier(base_classifier)
cuisine_classifier_model.fit(X_train, y_train)

print("--- Preprocessing and Model Training complete. Ready for evaluation. ---")

# --- Evaluate Model Performance ---
print("\n--- CELL 3: Evaluate Model Performance (Cuisine Classification) ---")

# Probability scores for the held-out rows.
y_pred_proba = cuisine_classifier_model.predict_proba(X_test)

# A label is predicted (1) when its score reaches the threshold, else 0.
# 0.5 is a conventional starting point and can be tuned.
threshold = 0.5
y_pred = np.where(y_pred_proba >= threshold, 1, 0)

print(f"\n--- Evaluation Metrics (Threshold = {threshold}) ---")

# NOTE: for multi-label targets accuracy_score is exact-match (subset)
# accuracy, i.e. a row counts as correct only if every label matches.
accuracy = accuracy_score(y_test, y_pred)
# Micro averaging pools the decisions of all labels before computing each score.
precision_micro, recall_micro, f1_micro = (
    metric(y_test, y_pred, average='micro')
    for metric in (precision_score, recall_score, f1_score)
)

for label, value in (
    ("Overall Accuracy", accuracy),
    ("Micro-averaged Precision", precision_micro),
    ("Micro-averaged Recall", recall_micro),
    ("Micro-averaged F1-score", f1_micro),
):
    print(f"{label}: {value:.4f}")

# Per-cuisine breakdown; the binarizer's class names label the rows and
# zero_division=0 reports 0 for metrics whose denominator is zero.
print("\n--- Detailed Classification Report (Per Cuisine) ---")
per_cuisine_report = classification_report(
    y_test,
    y_pred,
    target_names=mlb_cuisines.classes_,
    zero_division=0,
)
print(per_cuisine_report)

print("\nModel evaluation complete.")




--- Preprocessing and Model Training complete. Ready for evaluation. ---

--- CELL 3: Evaluate Model Performance (Cuisine Classification) ---

--- Evaluation Metrics (Threshold = 0.5) ---
Overall Accuracy: 0.0681
Micro-averaged Precision: 0.4975
Micro-averaged Recall: 0.2188
Micro-averaged F1-score: 0.3040

--- Detailed Classification Report (Per Cuisine) ---
                   precision    recall  f1-score   support

          Afghani       0.00      0.00      0.00         4
          African       0.00      0.00      0.00         3
         American       0.32      0.08      0.13        73
           Andhra       0.00      0.00      0.00         2
          Arabian       0.00      0.00      0.00         2
        Argentine       0.00      0.00      0.00         1
         Armenian       0.00      0.00      0.00         0
            Asian       0.33      0.02      0.04        50
     Asian Fusion       0.00      0.00      0.00         1
         Assamese       0.00      0.00      0.0