In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# --- 1. Load the Dataset ---
# Reads 'Dataset .csv' (note the space before the extension in the file name)
# into the module-level DataFrame `df` that every later step works on.
print("--- Step 1: Loading the Dataset ---")
try:
    df = pd.read_csv('Dataset .csv')
except FileNotFoundError:
    print("Error: 'Dataset .csv' not found. Please ensure the file is in the correct directory.")
    # Re-raise rather than calling exit(): exit() is an interactive-shell helper,
    # it would report success (status 0) and discard the traceback. Re-raising
    # stops this cell/script with a proper error and leaves nothing half-defined.
    raise
else:
    print("Dataset loaded successfully!")

print(f"Initial dataset shape: {df.shape}")
print("Initial 5 rows:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))


# --- 2. Handle Missing Values ---
# Report the columns that contain nulls, drop the rows without a 'Cuisines'
# value, then report again.
print("\n--- Step 2: Handling Missing Values ---")
print("Missing values before handling:")
null_counts_before = df.isnull().sum()
print(null_counts_before[null_counts_before > 0].to_markdown(numalign="left", stralign="left"))

# Rows with no 'Cuisines' entry are removed instead of imputed
# (original author's note: only 9 such rows, a small percentage).
df.dropna(subset=['Cuisines'], inplace=True)
print(f"Dataset shape after dropping rows with missing 'Cuisines': {df.shape}")
print("Missing values after handling:")
null_counts_after = df.isnull().sum()
print(null_counts_after[null_counts_after > 0].to_markdown(numalign="left", stralign="left"))


# --- 3. Encode Categorical Variables ---
print("\n--- Step 3: Encoding Categorical Variables ---")

# Turn the 'Yes'/'No' flag columns into 1/0. Only an exact 'Yes' maps to 1;
# every other value (including missing ones) maps to 0.
binary_cols = ['Has Table booking', 'Has Online delivery', 'Is delivering now', 'Switch to order menu']
for flag_col in binary_cols:
    df[flag_col] = (df[flag_col] == 'Yes').astype('int64')
print("Binary 'Yes'/'No' columns converted to 1/0.")
print(df[binary_cols].head().to_markdown(index=False, numalign="left", stralign="left"))


# Preprocessing metadata reused by the prediction helper so that a new record
# is encoded the same way as the training data. These are plain module-level
# names; no `global` statement is needed at this scope.

# Nominal columns that get one-hot encoded.
original_categorical_cols_for_ohe = ['Country Code', 'City', 'Currency']

# 'Yes'/'No' columns expected on a new record ('Switch to order menu' is left
# out because it is dropped before training).
original_binary_cols_for_new_data = ['Has Table booking', 'Has Online delivery', 'Is delivering now']

# Every distinct cuisine label found in the data, sorted, so all possible
# cuisine indicator columns can be recreated later.
original_all_cuisines = sorted(set(df['Cuisines'].str.split(', ').explode()))


# Columns excluded from training: free-text / high-cardinality fields, the
# 'Switch to order menu' flag, and the two rating-derived columns.
columns_to_drop_for_training = [
    'Restaurant Name',
    'Address',
    'Locality',
    'Locality Verbose',
    'Switch to order menu',  # original note: holds only 0 after the Yes/No conversion
    'Rating color',          # deliberately excluded feature
    'Rating text',           # deliberately excluded feature
]
df.drop(columns=columns_to_drop_for_training, inplace=True)
print(f"Dropped high cardinality/redundant columns, including Rating Color/Text. Current shape: {df.shape}")


# One-hot encode the nominal columns; drop_first removes one indicator per
# column, which becomes the implicit baseline category.
df = pd.get_dummies(df, columns=original_categorical_cols_for_ohe, drop_first=True)
print(f"Nominal categorical columns one-hot encoded. Current shape: {df.shape}")


# 'Cuisines' is a ', '-separated multi-label field: build one indicator column
# per label and swap the raw text column for those indicators.
cuisine_dummies = df['Cuisines'].str.get_dummies(sep=', ')
df = pd.concat([df.drop(columns=['Cuisines']), cuisine_dummies], axis=1)
print(f"'Cuisines' column multi-label encoded. Current shape: {df.shape}")


# --- 4. Separate Features (X) and Target (y) ---
print("\n--- Step 4: Separating Features (X) and Target (y) ---")

# The target is always removed from the features; 'Restaurant ID' is removed
# as well whenever the column is present.
non_feature_cols = ['Aggregate rating']
if 'Restaurant ID' in df.columns:
    non_feature_cols.append('Restaurant ID')
X = df.drop(columns=non_feature_cols)
y = df['Aggregate rating']

print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")
print("First 5 rows of X (before scaling numerical features):")
print(X.head().to_markdown(index=False, numalign="left", stralign="left"))
print("First 5 rows of y:")
print(y.head().to_markdown(numalign="left", stralign="left"))


# --- 5. Splitting the Data into Training and Testing Sets ---
# The split is done BEFORE scaling so the scaler's statistics are learned from
# the training rows only. Fitting the scaler on all of X first would let
# information from the test rows leak into preprocessing.
print("\n--- Step 5: Splitting Data into Training and Testing Sets ---")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Work on independent copies so the column assignments below never write
# through to X or raise chained-assignment warnings.
X_train = X_train.copy()
X_test = X_test.copy()

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


# --- 6. Feature Scaling ---
print("\n--- Step 6: Feature Scaling ---")
# Candidate numeric columns, restricted to those actually present in X.
numerical_cols_to_scale = ['Longitude', 'Latitude', 'Average Cost for two', 'Price range', 'Votes']
numerical_cols_to_scale = [col for col in numerical_cols_to_scale if col in X.columns]

# `scaler` stays a module-level name because the prediction helper reuses it.
scaler = StandardScaler()
X_train[numerical_cols_to_scale] = scaler.fit_transform(X_train[numerical_cols_to_scale])
X_test[numerical_cols_to_scale] = scaler.transform(X_test[numerical_cols_to_scale])
# X is transformed too (with the training-derived statistics) so that any later
# cell reading X still sees scaled values, as it did before this change.
X[numerical_cols_to_scale] = scaler.transform(X[numerical_cols_to_scale])
print("Numerical features scaled using StandardScaler (fitted on the training split only).")
print("First 5 rows of X (after scaling numerical features):")
print(X.head().to_markdown(index=False, numalign="left", stralign="left"))

print("\nPreprocessing complete! The data is now ready for model training.")


--- Step 1: Loading the Dataset ---
Dataset loaded successfully!
Initial dataset shape: (9551, 21)
Initial 5 rows:
| Restaurant ID   | Restaurant Name        | Country Code   | City             | Address                                                                 | Locality                                   | Locality Verbose                                             | Longitude   | Latitude   | Cuisines                         | Average Cost for two   | Currency         | Has Table booking   | Has Online delivery   | Is delivering now   | Switch to order menu   | Price range   | Aggregate rating   | Rating color   | Rating text   | Votes   |
|:----------------|:-----------------------|:---------------|:-----------------|:------------------------------------------------------------------------|:-------------------------------------------|:-------------------------------------------------------------|:------------|:-----------|:---------------------------------|:------------------

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# --- Model Selection and Training ---
print("\n--- Step: Selecting and Training Regression Algorithms ---")

# Both estimators are kept as module-level names because later cells refer to
# them directly (evaluation, feature importance, prediction demo).
linear_reg_model = LinearRegression()
random_forest_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Registry of the models by display name, kept for later use.
trained_models = {
    "Linear Regression": linear_reg_model,
    "Random Forest Regressor": random_forest_model,
}

# Fit each model on the training split, in registry order.
for model_label, estimator in trained_models.items():
    print(f"\nTraining {model_label} Model...")
    estimator.fit(X_train, y_train)
    print(f"{model_label} Model trained successfully!")

print("\nBoth regression models are now trained and ready for evaluation.")



--- Step: Selecting and Training Regression Algorithms ---

Training Linear Regression Model...
Linear Regression Model trained successfully!

Training Random Forest Regressor Model...
Random Forest Regressor Model trained successfully!

Both regression models are now trained and ready for evaluation.


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

print("\n--- Step: Evaluating Model Performance ---")

# 1. Linear Regression: predict on the held-out split and score it.
# The metric names stay at module level; the summary cell reads them.
print("\nEvaluating Linear Regression Model:")
y_pred_lr = linear_reg_model.predict(X_test)
mse_lr, r2_lr = mean_squared_error(y_test, y_pred_lr), r2_score(y_test, y_pred_lr)
print(f"  Mean Squared Error (MSE): {mse_lr:.4f}\n  R-squared (R2): {r2_lr:.4f}")

# 2. Random Forest Regressor: same procedure.
print("\nEvaluating Random Forest Regressor Model:")
y_pred_rf = random_forest_model.predict(X_test)
mse_rf, r2_rf = mean_squared_error(y_test, y_pred_rf), r2_score(y_test, y_pred_rf)
print(f"  Mean Squared Error (MSE): {mse_rf:.4f}\n  R-squared (R2): {r2_rf:.4f}")

print("\nModel evaluation complete.")



--- Step: Evaluating Model Performance ---

Evaluating Linear Regression Model:
  Mean Squared Error (MSE): 1.3688
  R-squared (R2): 0.4023

Evaluating Random Forest Regressor Model:
  Mean Squared Error (MSE): 0.0881
  R-squared (R2): 0.9615

Model evaluation complete.


In [None]:
import pandas as pd
import numpy as np

print("\n--- Step: Interpret Model Results and Analyze Influential Features ---")

# Recap of the metrics computed in the evaluation cell (mse_*/r2_* names).
print("\n### 1. Model Performance Summary (Without Rating Color/Text) ###")
print("Linear Regression:")
print(f"  Mean Squared Error (MSE): {mse_lr:.4f}")
print(f"  R-squared (R2): {r2_lr:.4f}")
print("\nRandom Forest Regressor:")
print(f"  Mean Squared Error (MSE): {mse_rf:.4f}")
print(f"  R-squared (R2): {r2_rf:.4f}")

# Narrative interpretation; the two R2 values are interpolated into the text.
print("\n--- Interpretation ---")
print("With 'Rating color' and 'Rating text' removed, the R-squared values are now more realistic for predicting ratings based on independent restaurant attributes.")
print(f"The Random Forest Regressor (R2: {r2_rf:.4f}) still significantly outperforms the Linear Regression model (R2: {r2_lr:.4f}). This reinforces that non-linear relationships are crucial for accurately predicting restaurant ratings from these features.")
print("The lower MSE for Random Forest also confirms its superior predictive accuracy on unseen data in this more challenging scenario.")


print("\n### 2. Feature Importance Analysis (Without Rating Color/Text) ###")

# --- Linear Regression Feature Importance (Coefficients) ---
# Pair each feature column with its fitted coefficient and order the rows by
# absolute coefficient, largest first. The helper column used for sorting is
# removed again before printing.
print("\n--- Linear Regression: Most Influential Features (Coefficients) ---")
lr_coefficients = (
    pd.DataFrame({'Feature': X.columns, 'Coefficient': linear_reg_model.coef_})
    .assign(Abs_Coefficient=lambda frame: np.abs(frame['Coefficient']))
    .sort_values(by='Abs_Coefficient', ascending=False)
    .drop(columns='Abs_Coefficient')
)

print("Top 10 Most Influential Features for Linear Regression:")
print(lr_coefficients.head(10).to_markdown(index=False, numalign="left", stralign="left"))
print("\nInterpretation: For Linear Regression, a positive coefficient means an increase in that feature leads to an increase in rating, and vice-versa for negative coefficients. The magnitude indicates strength of influence.")


# --- Random Forest Regressor Feature Importance ---
# Importances as reported by the fitted forest, labelled with the feature
# names and sorted from most to least important.
print("\n--- Random Forest Regressor: Most Influential Features ---")
rf_importances = random_forest_model.feature_importances_
feature_importances_rf = pd.Series(rf_importances, index=X.columns).sort_values(ascending=False)

print("Top 10 Most Influential Features for Random Forest Regressor:")
print(feature_importances_rf.head(10).to_markdown(numalign="left", stralign="left"))
print("\nInterpretation: For Random Forest, feature importance indicates how much each feature contributed to reducing impurity (e.g., variance) across all trees. Higher values mean greater influence.")

print("\n--- Overall Conclusion ---")
print("Now that 'Rating color' and 'Rating text' have been excluded, the feature importance analysis reveals the true independent drivers of restaurant ratings.")
print("The Random Forest Regressor remains the better model, and we can now see which restaurant attributes (beyond the rating itself) are most predictive of the aggregate rating.")


# --- 3. Demonstrate Prediction for a New Restaurant ---
print("\n--- Step: Demonstrating Prediction for a New Restaurant ---")

def preprocess_and_predict(new_restaurant_data_dict, model, scaler_obj, X_train_cols, num_cols_to_scale, ohe_cat_cols, bin_cols, all_cuisines_list):
    """
    Preprocess one restaurant record like the training data and predict its rating.

    The record is turned into a one-row DataFrame, encoded (binary flags,
    cuisine indicators, one-hot categoricals), aligned to the training
    columns, scaled, and passed to ``model.predict``.

    Args:
        new_restaurant_data_dict (dict): Raw feature values of the new restaurant,
            keyed by the original dataset column names.
        model: Fitted regressor exposing ``predict``.
        scaler_obj: Fitted scaler exposing ``transform`` (the one used for training).
        X_train_cols (pd.Index or list): Feature columns, in order, that the
            model was trained on.
        num_cols_to_scale (list): Numerical columns that were scaled for training.
        ohe_cat_cols (list): Original categorical columns that were one-hot encoded.
        bin_cols (list): Original 'Yes'/'No' columns that were converted to 1/0.
        all_cuisines_list (list): All cuisine labels seen during training.

    Returns:
        float: The predicted aggregate rating (first element of ``model.predict``).
    """
    # One-row frame built from the raw record.
    new_df = pd.DataFrame([new_restaurant_data_dict])

    # 1. 'Yes'/'No' flags -> 1/0. A flag that is absent from the record is
    #    treated as 'No'.
    for col in bin_cols:
        if col in new_df.columns:
            new_df[col] = (new_df[col] == 'Yes').astype(int)
        else:
            new_df[col] = 0

    # 2. Remove the columns that were never used as training features.
    local_columns_to_drop = [
        'Restaurant Name', 'Address', 'Locality', 'Locality Verbose',
        'Switch to order menu', 'Restaurant ID', 'Rating color', 'Rating text'
    ]
    new_df = new_df.drop(columns=local_columns_to_drop, errors='ignore')

    # 3. Multi-label encoding of 'Cuisines': start with every known cuisine at
    #    0 and switch on the ones listed for this restaurant. Labels unknown to
    #    training are ignored; a missing or non-string value leaves all at 0.
    cuisine_data = dict.fromkeys(all_cuisines_list, 0)
    raw_cuisines = new_df['Cuisines'].iloc[0] if 'Cuisines' in new_df.columns else None
    if isinstance(raw_cuisines, str):
        for cuisine in (part.strip() for part in raw_cuisines.split(',')):
            if cuisine and cuisine in cuisine_data:
                cuisine_data[cuisine] = 1
    new_cuisine_df = pd.DataFrame([cuisine_data])
    new_df = new_df.drop(columns=['Cuisines'], errors='ignore')

    # 4. One-hot encode the nominal columns that are present.
    #    drop_first must be False here: a single row has exactly one category
    #    per column, so drop_first=True would discard that only indicator and
    #    every record would silently be encoded as the baseline category.
    #    The reindex below removes the baseline indicator (it is not among the
    #    training columns) and keeps the ones the model knows.
    present_cat_cols = [col for col in ohe_cat_cols if col in new_df.columns]
    if present_cat_cols:
        new_df = pd.get_dummies(new_df, columns=present_cat_cols, drop_first=False)

    new_df_final = pd.concat([new_df, new_cuisine_df], axis=1)

    # Align to the training layout in one step: columns missing from the record
    # are added with 0, columns the model never saw are dropped, and the order
    # matches X_train_cols.
    new_processed_data = new_df_final.reindex(columns=X_train_cols, fill_value=0)

    # 5. Apply the training-time scaling to the numerical columns.
    if len(num_cols_to_scale) > 0:
        new_processed_data[num_cols_to_scale] = scaler_obj.transform(new_processed_data[num_cols_to_scale])

    predicted_rating = model.predict(new_processed_data)

    return predicted_rating[0]

# Example new restaurant data
example_restaurant_data = {
    'Restaurant Name': 'The Cozy Corner Cafe',
    'Country Code': 1, # India
    'City': 'New Delhi',
    'Longitude': 77.22,
    'Latitude': 28.63,
    'Cuisines': 'Cafe, Desserts, Italian',
    'Average Cost for two': 700,
    'Currency': 'Indian Rupees(Rs.)',
    'Has Table booking': 'Yes',
    'Has Online delivery': 'No',
    'Is delivering now': 'No',
    'Price range': 2,
    'Votes': 150,
}

predicted_rating_example = preprocess_and_predict(
    example_restaurant_data,
    random_forest_model,
    scaler,
    X_train.columns,
    numerical_cols_to_scale,
    original_categorical_cols_for_ohe,
    original_binary_cols_for_new_data,
    original_all_cuisines
)

print(f"\nPredicted Aggregate Rating for 'The Cozy Corner Cafe': {predicted_rating_example:.2f}")



--- Step: Interpret Model Results and Analyze Influential Features ---

### 1. Model Performance Summary (Without Rating Color/Text) ###
Linear Regression:
  Mean Squared Error (MSE): 1.3688
  R-squared (R2): 0.4023

Random Forest Regressor:
  Mean Squared Error (MSE): 0.0881
  R-squared (R2): 0.9615

--- Interpretation ---
With 'Rating color' and 'Rating text' removed, the R-squared values are now more realistic for predicting ratings based on independent restaurant attributes.
The Random Forest Regressor (R2: 0.9615) still significantly outperforms the Linear Regression model (R2: 0.4023). This reinforces that non-linear relationships are crucial for accurately predicting restaurant ratings from these features.
The lower MSE for Random Forest also confirms its superior predictive accuracy on unseen data in this more challenging scenario.

### 2. Feature Importance Analysis (Without Rating Color/Text) ###

--- Linear Regression: Most Influential Features (Coefficients) ---
Top 10 Mos