In [None]:
pip install selenium beautifulsoup4 pyautogui

In [4]:
# Install dependencies in Kaggle
!apt-get update
!apt-get install -y chromium-browser chromium-chromedriver
!pip install selenium beautifulsoup4

# ==============================
# Imports
# ==============================
import sys
import time
import csv
import math
import os
from math import radians, sin, cos, sqrt, atan2

sys.path.insert(0, '/usr/lib/chromium-browser/')

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup


# ==============================
# Functions
# ==============================
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371.0  # km
    dlat = radians(lat2 - lat1)
    dlon = radians(lon2 - lon1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c


def scrape_rectangle(driver, lat1, lon1, lat2, lon2):
    """
    Example scraping function — here you can replace with your real scraping logic
    """
    # Search in Google Maps
    search_box = driver.find_element(By.ID, "searchboxinput")
    search_box.clear()
    search_box.send_keys(f"{lat1}, {lon1}")
    search_box.send_keys("\n")
    time.sleep(3)

    soup = BeautifulSoup(driver.page_source, "html.parser")
    print(f"Scraped area: ({lat1}, {lon1}) - ({lat2}, {lon2})")
    print("Page title:", driver.title)


def move_rectangle(driver, start_lat, start_lon, height, width, step_lat, step_lon):
    lat = start_lat
    while lat < start_lat + height:
        lon = start_lon
        while lon < start_lon + width:
            scrape_rectangle(driver, lat, lon, lat + step_lat, lon + step_lon)
            lon += step_lon
        lat += step_lat


# ==============================
# Main function
# ==============================
def main():
    # Chrome options for Kaggle
    chrome_options = Options()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--window-size=1920x1080")

    # Use Kaggle's built-in chromedriver
    service = Service("/usr/bin/chromedriver")
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Open Google Maps
    driver.get('https://www.google.com/maps')
    time.sleep(5)  # wait for maps to load

    # Example: move over a 2x2 degree area
    move_rectangle(driver,
                   start_lat=38.8363592557036,
                   start_lon=-77.04835828729044,
                   height=0.1, width=0.1,
                   step_lat=0.05, step_lon=0.05)

    driver.quit()


if __name__ == "__main__":
    main()


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease               
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease                     
Hit:5 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease   
Hit:6 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
B

WebDriverException: Message: Service /usr/bin/chromedriver unexpectedly exited. Status code was: 1


In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

# -----------------------
# Step 1: Load dataset
# -----------------------
df = pd.read_csv("/kaggle/input/data-all")

# Combine Category & Sub-Category
df["Category-Subcategory"] = df["Category"].str.strip() + "-" + df["Sub-Category"].str.strip()

# -----------------------
# Step 2: Calculate weights
# -----------------------
frequency = df["Category-Subcategory"].value_counts()
df["weights"] = df["Category-Subcategory"].apply(lambda x: 1 / frequency[x])

# -----------------------
# Step 3: Calculate FinalRating (Rating by Audience)
# Formula: Weighted average that values both Rating and Review count
# -----------------------
df["FinalRating"] = (df["Rating"] * df["NumberofReviews"]) / (df["NumberofReviews"] + 50)

# -----------------------
# Step 4: Custom distance function
# -----------------------
def custom_distance(point1, point2, w=0.5, r=0.5):
    # point = [Latitude, Longitude, weight, rating]
    distance = np.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)
    weight_factor = (point1[2] + point2[2]) / 2
    rating_factor = (point1[3] + point2[3]) / 2
    return ((w * r) + (weight_factor * distance * (1 / rating_factor)) /
            (r + (weight_factor * distance)))

# -----------------------
# Step 5: Train KNN
# -----------------------
data = df[["Latitude", "Longitude", "weights", "FinalRating"]].values
knn = NearestNeighbors(metric=custom_distance)
knn.fit(data)

# -----------------------
# Step 6: Find k distinct features
# -----------------------
def find_k_distinct_features(df, knn, input_feature, k=5):
    query_indices = df.index[df["Category-Subcategory"] == input_feature].tolist()
    if not query_indices:
        return None

    query_point = data[query_indices[0]]
    distances, indices = knn.kneighbors([query_point], n_neighbors=len(df))

    nearest_features = df.iloc[indices[0]]["Category-Subcategory"].values
    nearest_distances = distances[0]

    distinct_features = {}
    seen_features = set()
    for feature, distance in zip(nearest_features, nearest_distances):
        if feature not in seen_features:
            distinct_features[feature] = 1 - distance  # Higher score = more similar
            seen_features.add(feature)
        if len(distinct_features) == k:
            break

    return distinct_features

# Example: Find similar to Fast Food
input_feature = "Restaurant-Fast Food"
k = 5
similar_features = find_k_distinct_features(df, knn, input_feature, k)

print(f"Top {k} distinct features near '{input_feature}':")
print(similar_features)

# -----------------------
# Step 7: Train RandomForest to find important categories
# -----------------------
# Prepare data
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df[["Category-Subcategory"]])
encoded_feature_names = encoder.get_feature_names_out(["Category-Subcategory"])
df_encoded = pd.DataFrame(encoded_features, columns=encoded_feature_names)

df_combined = pd.concat([df.drop(columns=["Category-Subcategory"]), df_encoded], axis=1)

X = df_combined.drop(columns=["Rating", "FinalRating", "Name", "Sub-Category", "Category"])
y = df_combined["FinalRating"]

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Feature importances
feature_importances = model.feature_importances_
importance_df = pd.DataFrame({"feature": X.columns, "importance": feature_importances})
top_features = importance_df[importance_df["feature"].str.startswith("Category-Subcategory_")]
top_features["Category-Subcategory"] = top_features["feature"].str.replace("Category-Subcategory_", "")
top_features = top_features.sort_values(by="importance", ascending=False)

print("\nTop important Category-Subcategories:")
print(top_features[["Category-Subcategory", "importance"]])


IsADirectoryError: [Errno 21] Is a directory: '/kaggle/input/data-all'

In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

# --- Phase 1: Data Loading and Preprocessing ---

print("--- Phase 1: Loading and Preprocessing Data ---")

# Load the dataset from the specified Kaggle path
try:
    # Use the path you provided
    df = pd.read_csv('/kaggle/input/data-all/your_data_filename.csv') #<-- IMPORTANT: Replace with your actual filename
    print("Successfully loaded data from Kaggle input.")
except FileNotFoundError:
    print("File not found at '/kaggle/input/data-all'.")
    print("Creating a sample DataFrame for demonstration purposes.")
    # Create a sample dataframe that matches your format if the file isn't found
    data = {
        'Name': ['IndianOil', 'State Bank', 'City School', 'Pizza Place', 'Apollo Clinic', 'GymHouse', 'Public Park', 'Grocery Store'],
        'Rating': [4.1, 4.5, 4.8, 4.2, 4.9, 4.6, 4.4, 4.7],
        'NumberofReviews': [2027, 500, 350, 1500, 800, 400, 600, 1200],
        'Latitude': [30.69, 30.70, 30.71, 30.69, 30.72, 30.70, 30.71, 30.68],
        'Longitude': [76.75, 76.76, 76.75, 76.74, 76.76, 76.77, 76.78, 76.75],
        'Category': ['FuelStation', 'Bank', 'EducationalInstitute', 'Restaurant', 'Clinic', 'Fitness&Wellness', 'Park', 'Shopping'],
        'Sub-Category': ['PetrolPump', 'NationalBank', 'School', 'FastFood', 'General', 'Gym', 'Public', 'Supermarket'],
        'Row': [3, 4, 5, 3, 6, 4, 5, 2],
        'Column': [19, 20, 19, 18, 20, 21, 22, 19],
        'Population': [15000, 15000, 15000, 15000, 15000, 15000, 15000, 15000]
    }
    df = pd.DataFrame(data)

# 1. Create the 'Category-Subcategory' combined feature
df['Category-Subcategory'] = df['Category'] + '-' + df['Sub-Category']

# 2. Use 'Rating' for scoring, handling potential missing values
# We'll call it 'FinalRating' for consistency with your abstract
df['FinalRating'] = df['Rating'].fillna(df['Rating'].mean())

# 3. Calculate frequency-based weights (as you specified)
print("Calculating frequency-based weights...")
frequency = df['Category-Subcategory'].value_counts()
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency[x])

print("Data preprocessing complete.")
print(df[['Category-Subcategory', 'FinalRating', 'weights']].head())


# --- Phase 2: Neighborhood Analysis with KNN ---

print("\n--- Phase 2: K-Nearest Neighbors Analysis ---")

# Define the custom distance metric
def custom_distance(point1, point2):
    # point1/point2 = [Latitude, Longitude, weight, FinalRating]
    geo_distance = np.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)
    rating_factor = (point1[3] + point2[3]) / 2
    if rating_factor == 0: rating_factor = 1e-6
    
    # This metric prioritizes neighbors that are geographically close and have high ratings
    # The weight factor for rarity is incorporated in the data itself
    return geo_distance / rating_factor

# Prepare the data matrix for the KNN model
knn_data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values

# Fit the KNN model
print("Fitting KNN model with custom distance metric...")
knn = NearestNeighbors(metric=custom_distance, algorithm='ball_tree')
knn.fit(knn_data)
print("KNN model fitting complete.")

# Function to find k distinct nearby features for every point of interest
def get_distinct_neighbors(index, k=5):
    query_point = knn_data[index]
    
    distances, indices = knn.kneighbors([query_point], n_neighbors=k+1) # +1 to include self
    
    # Normalize distances to create a score (0 to 1)
    max_dist = np.max(distances[0])
    if max_dist == 0: max_dist = 1 # Avoid division by zero
    
    results = []
    for i in range(1, len(indices[0])): # Start from 1 to skip the point itself
        neighbor_idx = indices[0][i]
        dist = distances[0][i]
        
        results.append({
            'Category-Subcategory': df.iloc[neighbor_idx]['Category-Subcategory'],
            'score': 1 - (dist / max_dist), # Higher score for closer neighbors
            'Population': df.iloc[index]['Population'] # Population of the query point's area
        })
    return results

# Generate the results by finding neighbors for every POI in the dataset
print("Generating neighbor scores for all POIs...")
all_neighbor_results = []
for idx in range(len(df)):
    neighbors = get_distinct_neighbors(idx, k=5)
    all_neighbor_results.extend(neighbors)

# Create a DataFrame and save it
results_df = pd.DataFrame(all_neighbor_results)
results_df.to_csv('final_results.csv', index=False)
print("Neighbor analysis complete. Results saved to 'final_results.csv'.")


# --- Phase 3: Feature Importance Training with Random Forest ---

print("\n--- Phase 3: Training Random Forest for Feature Importance ---")

# One-hot encode the categorical feature 'Category-Subcategory'
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(results_df[['Category-Subcategory']])
encoded_feature_names = encoder.get_feature_names_out(['Category-Subcategory'])
df_encoded = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Combine encoded features with the 'Population' data
df_combined = pd.concat([results_df.reset_index(drop=True)[['Population', 'score']], df_encoded], axis=1)

# Separate features (X) and the target variable (y)
X = df_combined.drop(columns=["score"])
y = df_combined["score"]

# Train the Random Forest Regressor model
print("Training Random Forest model...")
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X, y)
print("Model training complete.")

# Extract and display feature importances
feature_importances = model.feature_importances_
features = X.columns

importance_df = pd.DataFrame({
    "feature": features,
    "importance": feature_importances
}).sort_values(by="importance", ascending=False)

# Filter to get the top features from 'Category-Subcategory'
top_features = importance_df[importance_df['feature'].str.startswith('Category-Subcategory')].head(10)
top_features['Category-Subcategory'] = top_features['feature'].str.replace('Category-Subcategory_', '')
top_features = top_features[['Category-Subcategory', 'importance']]

print("\n--- Top 10 Most Influential Service Categories (Feature Importances) ---")
print(top_features.to_string(index=False))

--- Phase 1: Loading and Preprocessing Data ---
File not found at '/kaggle/input/data-all'.
Creating a sample DataFrame for demonstration purposes.
Calculating frequency-based weights...
Data preprocessing complete.
          Category-Subcategory  FinalRating  weights
0       FuelStation-PetrolPump          4.1      1.0
1            Bank-NationalBank          4.5      1.0
2  EducationalInstitute-School          4.8      1.0
3          Restaurant-FastFood          4.2      1.0
4               Clinic-General          4.9      1.0

--- Phase 2: K-Nearest Neighbors Analysis ---
Fitting KNN model with custom distance metric...
KNN model fitting complete.
Generating neighbor scores for all POIs...
Neighbor analysis complete. Results saved to 'final_results.csv'.

--- Phase 3: Training Random Forest for Feature Importance ---
Training Random Forest model...
Model training complete.

--- Top 10 Most Influential Service Categories (Feature Importances) ---
       Category-Subcategory  importanc

In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import os
import warnings

# Suppress warnings for a cleaner output
warnings.filterwarnings('ignore')

# --- Phase 1: Data Loading from Kaggle Directory ---

print("--- Phase 1: Loading Data ---")

# Directory where Kaggle stores input data
data_directory = '/kaggle/input/data-all/'
# IMPORTANT: Replace this with the actual name of your CSV file.
your_filename = 'output.csv' 
full_path = os.path.join(data_directory, your_filename)

try:
    # Attempt to load the user's specific CSV file
    df = pd.read_csv(full_path)
    print(f"Successfully loaded data from: {full_path}")

except FileNotFoundError:
    print(f"--- WARNING: File not found at '{full_path}' ---")
    print("Creating a sample DataFrame for demonstration purposes.")
    print("Please update 'your_filename' with your actual file's name.")
    
    # Create a diverse sample dataframe if the actual file isn't found
    data = {
        'Name': ['Pizza Place', 'Burger Joint', 'City Cinema', 'Main St. Cafe', 'Downtown Gym', 'Public School', 'Corner Gas', 'Express Mart', 'Local Bank'],
        'Rating': [4.2, 4.0, 4.5, 4.6, 4.8, 4.1, 3.9, 4.3, 4.7],
        'Latitude': [30.701, 30.705, 30.702, 30.710, 30.715, 30.712, 30.700, 30.703, 30.704],
        'Longitude': [76.751, 76.755, 76.756, 76.760, 76.765, 76.759, 76.750, 76.752, 76.758],
        'Category': ['Restaurant', 'Restaurant', 'Entertainment', 'Restaurant', 'Fitness&Wellness', 'EducationalInstitute', 'FuelStation', 'Shopping', 'Bank'],
        'Sub-Category': ['FastFood', 'FastFood', 'MovieTheater', 'Cafe', 'Gym', 'School', 'PetrolPump', 'ConvenienceStore', 'NationalBank'],
        'Population': [25000] * 9
    }
    df = pd.DataFrame(data)

# --- Phase 2: Data Preprocessing ---

print("\n--- Phase 2: Preprocessing Data ---")

# 1. Create the 'Category-Subcategory' combined feature
df['Category-Subcategory'] = df['Category'] + '-' + df['Sub-Category']

# 2. Use 'Rating' for scoring, handling potential missing values
df['FinalRating'] = df['Rating'].fillna(df['Rating'].mean())

# 3. Calculate frequency-based weights (rarer categories get a higher value)
frequency = df['Category-Subcategory'].value_counts()
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency.get(x, 1))

print("Preprocessing complete. 'Category-Subcategory', 'FinalRating', and 'weights' columns are ready.")


# --- Phase 3: K-Nearest Neighbors Model Setup ---

print("\n--- Phase 3: Fitting KNN Model ---")

# Define the custom distance metric
def custom_distance(point1, point2):
    # point1/point2 = [Latitude, Longitude, weight, FinalRating]
    geo_distance = np.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)
    # Average rating of the two points
    rating_factor = (point1[3] + point2[3]) / 2
    
    # Avoid division by zero for points with no rating
    if rating_factor == 0:
        rating_factor = 1e-6 # A small number to prevent errors
        
    # The metric returns a smaller "effective distance" for pairs with higher average ratings
    return geo_distance / rating_factor

# Prepare the data matrix for the KNN model
knn_data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values

# Fit the KNN model using the custom distance metric
knn = NearestNeighbors(metric=custom_distance, algorithm='ball_tree')
knn.fit(knn_data)

print("KNN model fitted successfully.")


# --- Phase 4: Function to Find Similar Features ---

def find_k_distinct_features(df, knn, input_feature, k=5):
    """
    Finds k distinct features nearest to a given input feature based on the fitted KNN model.
    """
    # Get the row index for the input feature from the DataFrame
    query_indices = df.index[df['Category-Subcategory'] == input_feature].tolist()
    
    # Handle the case where the input feature doesn't exist
    if not query_indices:
        return f"Error: Input feature '{input_feature}' not found in the dataset."
        
    # Use the data from the first instance of the feature as the query point
    query_point = knn_data[query_indices[0]]
    
    # Find all neighbors in the dataset
    distances, indices = knn.kneighbors([query_point], n_neighbors=len(df))
    
    # Normalize distances to create a similarity score (from 0 to 1)
    # The maximum possible distance in the results
    max_dist = np.max(distances[0])
    if max_dist == 0: max_dist = 1 # Avoid division by zero if all points are identical
    
    # Collect the top k distinct features and their scores
    distinct_features = {}
    seen_features = set()
    
    # Iterate through the neighbors, from closest to farthest
    for i in range(len(indices[0])):
        neighbor_idx = indices[0][i]
        feature = df.iloc[neighbor_idx]['Category-Subcategory']
        
        # Add the feature to our results if we haven't seen it before
        if feature not in seen_features:
            distance = distances[0][i]
            # The score is 1 minus the normalized distance. A closer point (smaller distance) gets a higher score.
            score = 1 - (distance / max_dist)
            distinct_features[feature] = score
            seen_features.add(feature)
            
            # Stop once we have found the desired number of distinct features
            if len(distinct_features) == k:
                break
                
    return distinct_features


# --- Phase 5: Example Usage ---

print("\n--- Phase 5: Finding Similar Features ---")

# Example: Find features similar to "Restaurant-FastFood"
input_feature = "FuelStation-PetrolPump"
k = 5

# Call the function to get the results
similar_features = find_k_distinct_features(df, knn, input_feature, k)

# Print the results in a user-friendly format
print(f"\nTop {k} distinct features most similar to '{input_feature}':")
if isinstance(similar_features, dict):
    for feature, score in similar_features.items():
        # The input feature itself will always have a perfect score of 1.0
        if feature == input_feature:
            print(f"- {feature:<30} | Score: {score:.4f} (Itself)")
        else:
            print(f"- {feature:<30} | Score: {score:.4f}")
else:
    # This will print the error message if the feature wasn't found
    print(similar_features)

--- Phase 1: Loading Data ---
Successfully loaded data from: /kaggle/input/data-all/output.csv

--- Phase 2: Preprocessing Data ---
Preprocessing complete. 'Category-Subcategory', 'FinalRating', and 'weights' columns are ready.

--- Phase 3: Fitting KNN Model ---
KNN model fitted successfully.

--- Phase 5: Finding Similar Features ---

Top 5 distinct features most similar to 'FuelStation-PetrolPump':
- FuelStation-PetrolPump         | Score: 1.0000 (Itself)
- FuelStation-LPGStation         | Score: 1.0000
- Office-InsuranceAgency         | Score: 1.0000
- Office-Co-workingSpace         | Score: 0.9999
- EducationalInstitute-DanceSchool | Score: 0.9999


In [5]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Phase 1: Data Loading and Preparation ---

print("--- Phase 1: Loading and Preparing Data ---")
try:
    # Load the initial dataset from a single CSV file
    df = pd.read_csv('/kaggle/input/data-all/output.csv')
    print("Successfully loaded 'dataset.csv'.")
except FileNotFoundError:
    print("--- WARNING: 'dataset.csv' not found. ---")
    print("Creating a sample DataFrame for demonstration.")
    # Create a sample dataframe if the primary file isn't found
    data = {
        'Name': ['Yoga Studio', 'Fire Station', 'Main School', 'Music School', 'Pizza Place', 'Burger Joint', 'Downtown Gym'],
        'Rating': [4.9, 4.5, 4.1, 4.8, 4.2, 4.0, 4.8],
        'Latitude': [30.715, 30.716, 30.720, 30.718, 30.701, 30.705, 30.714],
        'Longitude': [76.755, 76.756, 76.750, 76.752, 76.761, 76.765, 76.765],
        'Category': ['Fitness&Wellness', 'GovernmentBuilding', 'EducationalInstitute', 'EducationalInstitute', 'Restaurant', 'Restaurant', 'Fitness&Wellness'],
        'Sub-Category': ['YogaStudio', 'FireStation', 'School', 'MusicSchool', 'FastFood', 'FastFood', 'Gym'],
        'Population': [12000] * 7
    }
    df = pd.DataFrame(data)

# 1. Combine 'Category' and 'Sub-Category' into a single feature
df['Category-Subcategory'] = df['Category'] + '-' + df['Sub-Category']

# 2. Create a standardized 'FinalRating' column, handling potential missing values
df['FinalRating'] = df['Rating'].fillna(df['Rating'].mean())

# 3. Calculate frequency-based 'weights' (rarer categories have a higher value)
frequency = df['Category-Subcategory'].value_counts()
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency.get(x, 1))

print("Data preparation complete.")


# --- Phase 2: K-Nearest Neighbors (KNN) Analysis ---

print("\n--- Phase 2: Analyzing Feature Neighborhoods with KNN ---")

# Define the custom distance metric you provided (with syntax correction)
def custom_distance(point1, point2, w=0.5, r=0.5):
    """
    Calculates a custom distance combining geographic distance, rarity (weight), and quality (rating).
    """
    # Unpack points: [Latitude, Longitude, weight, FinalRating]
    lat1, lon1, weight1, rating1 = point1
    lat2, lon2, weight2, rating2 = point2
    
    geo_distance = np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)
    weight_factor = (weight1 + weight2) / 2
    rating_factor = (rating1 + rating2) / 2
    
    # Robustness: Prevent division by zero
    if rating_factor == 0: rating_factor = 1e-6
    if r == 0: r = 1e-6

    # **Corrected Formula**: The original formula was completed by logically closing the parenthesis.
    return (w * r) + (weight_factor * geo_distance / rating_factor) / r

# Prepare data and fit the KNN model
knn_data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values
knn = NearestNeighbors(metric=custom_distance, algorithm='ball_tree').fit(knn_data)

# Helper function to find distinct neighbors for a given category
def find_k_distinct_features(df_ref, knn_model, input_feature, k=5):
    query_indices = df_ref.index[df_ref['Category-Subcategory'] == input_feature].tolist()
    if not query_indices: return None
    
    query_point = knn_data[query_indices[0]]
    distances, indices = knn_model.kneighbors([query_point], n_neighbors=len(df_ref))
    
    distinct_features = {}
    seen_features = set()
    for feature, dist in zip(df_ref.iloc[indices[0]]['Category-Subcategory'], distances[0]):
        if feature not in seen_features:
            # Score is 1 - distance (assuming smaller distance is better)
            distinct_features[feature] = 1 - dist
            seen_features.add(feature)
            if len(distinct_features) >= k: break
    return distinct_features

# Generate KNN results for all unique features to be used in the next step
print("Generating neighbor scores for all categories...")
all_results = []
for feature in df['Category-Subcategory'].unique():
    neighbors = find_k_distinct_features(df, knn, feature, k=5)
    if neighbors:
        for neighbor_feature, score in neighbors.items():
            if neighbor_feature != feature: # Exclude self-correlation
                all_results.append({'Category-Subcategory': neighbor_feature, 'score': score})
results_df = pd.DataFrame(all_results)
print("KNN analysis complete.")


# --- Phase 3: Feature Importance Training with Random Forest ---

print("\n--- Phase 3: Training Random Forest to Find Key Features ---")

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_features = encoder.fit_transform(results_df[['Category-Subcategory']])
df_encoded = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(['Category-Subcategory']))

# We need a feature set X and a target y. 'Population' can be added if it's meaningful.
# For simplicity, we use the encoded categories to predict the neighborhood score.
df_combined = pd.concat([results_df.reset_index(drop=True), df_encoded], axis=1)

X = df_combined.drop(columns=["score", "Category-Subcategory"])
y = df_combined["score"]

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1).fit(X, y)

# Extract and process feature importances
importances = pd.DataFrame({
    "feature": X.columns,
    "importance": model.feature_importances_
}).sort_values(by="importance", ascending=False)

# Filter for the top 'Category-Subcategory' features
top_features = importances[importances['feature'].str.startswith('Category-Subcategory_')].head(4)
top_features['Category-Subcategory'] = top_features['feature'].str.replace('Category-Subcategory_', '')
top_features = top_features[['Category-Subcategory', 'importance']]
print("Top 4 most influential features identified:")
print(top_features)


# --- Phase 4: Site Scoring and Final Selection ---

print("\n--- Phase 4: Scoring Locations to Find the Optimal Site ---")

# Merge top feature importances back into the original dataframe
# This creates the 'model.csv' data in-memory
model_df = df.merge(top_features, on='Category-Subcategory', how='inner')

# Calculate 'WeightedImportance' for each POI (Rating * Feature Importance)
model_df['WeightedImportance'] = model_df['FinalRating'] * model_df['importance']

# Group by unique coordinates and sum the scores of all important services at that location
location_scores = model_df.groupby(['Latitude', 'Longitude']).agg(
    TotalWeightedImportance=('WeightedImportance', 'sum')
).reset_index()

# Find the location with the highest aggregated score
if not location_scores.empty:
    best_location = location_scores.loc[location_scores['TotalWeightedImportance'].idxmax()]
    
    print("\n--- OPTIMAL SITE LOCATION FOUND ---")
    print(f"Best Latitude:  {best_location['Latitude']}")
    print(f"Best Longitude: {best_location['Longitude']}")
    print(f"Maximum Aggregated Score: {best_location['TotalWeightedImportance']:.4f}")
else:
    print("Could not determine a best location. This might happen if no POIs matched the top features.")

--- Phase 1: Loading and Preparing Data ---
Successfully loaded 'dataset.csv'.
Data preparation complete.

--- Phase 2: Analyzing Feature Neighborhoods with KNN ---
Generating neighbor scores for all categories...
KNN analysis complete.

--- Phase 3: Training Random Forest to Find Key Features ---
Top 4 most influential features identified:
                   Category-Subcategory  importance
20          Fitness&Wellness-YogaStudio    0.206469
0        EducationalInstitute-ArtSchool    0.137932
18  FinancialInstitution-InvestmentFirm    0.093211
23           GovernmentBuilding-Library    0.069422

--- Phase 4: Scoring Locations to Find the Optimal Site ---

--- OPTIMAL SITE LOCATION FOUND ---
Best Latitude:  30.713844
Best Longitude: 76.753389
Maximum Aggregated Score: 18.5823


In [4]:
!pip install folium



In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Phase 1: Data Loading and Preparation ---

print("--- Phase 1: Loading and Preparing All Data ---")
try:
    # Load the initial dataset from a single CSV file
    df = pd.read_csv('/kaggle/input/data-all/output.csv') #<-- Make sure this is your main, combined dataset
    print("Successfully loaded dataset.")
except FileNotFoundError:
    print("--- WARNING: Dataset not found. ---")
    print("Creating a sample DataFrame for demonstration.")
    data = {
        'Name': ['Pizza Place', 'Burger Joint', 'Main St. Cafe', 'Downtown Gym', 'Local Bank', 'Corner Gas', 'Express Mart', 'Public School', 'City Cinema'],
        'Rating': [4.2, 4.0, 4.6, 4.8, 4.7, 3.9, 4.3, 4.1, 4.5],
        'Latitude': [30.701, 30.705, 30.710, 30.714, 30.704, 30.700, 30.703, 30.712, 30.702],
        'Longitude': [76.761, 76.765, 76.770, 76.765, 76.758, 76.750, 76.752, 76.759, 76.756],
        'Category': ['Restaurant', 'Restaurant', 'Restaurant', 'Fitness&Wellness', 'Bank', 'FuelStation', 'Shopping', 'EducationalInstitute', 'Entertainment'],
        'Sub-Category': ['FastFood', 'FastFood', 'Cafe', 'Gym', 'NationalBank', 'PetrolPump', 'ConvenienceStore', 'School', 'MovieTheater'],
    }
    df = pd.DataFrame(data)

df['Category-Subcategory'] = df['Category'] + '-' + df['Sub-Category']
df['FinalRating'] = df['Rating'].fillna(df['Rating'].mean())
frequency = df['Category-Subcategory'].value_counts()
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency.get(x, 1))
print("Data preparation complete.")


# --- Phase 2: Fit Global KNN Model to Understand Feature Similarity ---

print("\n--- Phase 2: Fitting Global KNN Model ---")
def custom_distance(p1, p2):
    geo_dist = np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)
    rating_factor = (p1[3] + p2[3]) / 2
    return geo_dist / (rating_factor if rating_factor != 0 else 1e-6)

knn_data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values
knn = NearestNeighbors(metric=custom_distance, algorithm='ball_tree').fit(knn_data)
print("Global KNN model fitted.")


# --- Phase 3: Discover the Ideal Business Ecosystem based on a Target ---

def find_k_similar_features(df_ref, knn_model, data_ref, target, k):
    """Finds the k most similar features to a target and returns them as a list."""
    query_indices = df_ref.index[df_ref['Category-Subcategory'] == target].tolist()
    if not query_indices:
        print(f"Error: Target feature '{target}' not found in the dataset.")
        return []
    
    query_point = data_ref[query_indices[0]]
    _, indices = knn_model.kneighbors([query_point], n_neighbors=len(df_ref))
    
    similar_features, seen_features = [], set()
    for feature in df_ref.iloc[indices[0]]['Category-Subcategory']:
        if feature not in seen_features:
            similar_features.append(feature)
            seen_features.add(feature)
            if len(similar_features) >= k:
                break
    return similar_features

# --- USER INPUT ---
# 1. Define the business you want to find a location for.
target_feature = 'Shop-Electronics'

# 2. Define how many similar businesses should define the "ideal ecosystem".
k = 4 

# 3. Define how many top site recommendations you want.
L = 5 
# --------------------

print(f"\n--- Phase 3: Finding the {k} features most similar to '{target_feature}' ---")
ideal_ecosystem = find_k_similar_features(df, knn, knn_data, target_feature, k)

if not ideal_ecosystem:
    exit() # Stop the script if the target feature wasn't found

print("Ideal Business Ecosystem Defined By:")
for feature in ideal_ecosystem:
    print(f"- {feature}")


# --- Phase 4: Site Selection - Find the Top L Locations for the Ecosystem ---

print(f"\n--- Phase 4: Searching for the Top {L} Locations ---")

# 1. Filter the entire dataset to only include businesses from our ideal ecosystem
ecosystem_df = df[df['Category-Subcategory'].isin(ideal_ecosystem)].copy()

if ecosystem_df.empty:
    print("No locations found containing the ideal ecosystem.")
else:
    # 2. Calculate a score for each location based on the sum of ratings of ecosystem businesses
    # A location with multiple high-rated ecosystem businesses gets a high score.
    location_scores = ecosystem_df.groupby(['Latitude', 'Longitude']).agg(
        TotalEcosystemScore=('FinalRating', 'sum'),
        BusinessCount=('Name', 'count')
    ).reset_index()

    # 3. Find the top L locations with the highest ecosystem scores
    top_L_sites = location_scores.nlargest(L, 'TotalEcosystemScore')

    print(f"\n--- TOP {L} RECOMMENDED SITE LOCATIONS FOUND ---")
    print(f"Based on the concentration of high-quality '{target_feature}' ecosystems.")
    print(top_L_sites.to_string(index=False))

--- Phase 1: Loading and Preparing All Data ---
Successfully loaded dataset.
Data preparation complete.

--- Phase 2: Fitting Global KNN Model ---
Global KNN model fitted.

--- Phase 3: Finding the 4 features most similar to 'Shop-Electronics' ---
Ideal Business Ecosystem Defined By:
- Shop-Electronics
- Shop-Stationery
- Office-AccountingFirm
- Shop-ConvenienceStore

--- Phase 4: Searching for the Top 5 Locations ---

--- TOP 5 RECOMMENDED SITE LOCATIONS FOUND ---
Based on the concentration of high-quality 'Shop-Electronics' ecosystems.
 Latitude  Longitude  TotalEcosystemScore  BusinessCount
30.724680  76.765090                 50.0             10
30.725403  76.748174                 49.0             10
30.697192  76.762405                 45.3             12
30.734566  76.752291                 45.0              9
30.718270  76.766102                 40.5              9


In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import folium # Import the folium library for mapping
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Phase 1: Data Loading and Preparation ---

print("--- Phase 1: Loading and Preparing All Data ---")
try:
    df = pd.read_csv('/kaggle/input/data-all/output.csv')
    print("Successfully loaded dataset.")
except FileNotFoundError:
    print("--- WARNING: Dataset not found. Creating a sample DataFrame. ---")
    data = {
        'Name': ['Best Electronics', 'City Stationery', 'A1 Accounting', 'Corner Mart', 'E-Zone', 'Paper&Pens', 'Competitor Elec.'],
        'Rating': [5.0, 4.8, 4.2, 4.0, 4.9, 4.1, 4.5],
        'Latitude': [30.724680, 30.724680, 30.724680, 30.725403, 30.725403, 30.725403, 30.697192],
        'Longitude': [76.765090, 76.765090, 76.765090, 76.748174, 76.748174, 76.748174, 76.762405],
        'Category': ['Shop', 'Shop', 'Office', 'Shop', 'Shop', 'Shop', 'Shop'],
        'Sub-Category': ['Electronics', 'Stationery', 'AccountingFirm', 'ConvenienceStore', 'Electronics', 'Stationery', 'Electronics'],
    }
    df = pd.DataFrame(data)

df['Category-Subcategory'] = df['Category'] + '-' + df['Sub-Category']
df['FinalRating'] = df['Rating'].fillna(df['Rating'].mean())
frequency = df['Category-Subcategory'].value_counts()
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency.get(x, 1))
print("Data preparation complete.")


# --- Phase 2: Fit Global KNN Model ---

print("\n--- Phase 2: Fitting Global KNN Model ---")
def custom_distance(p1, p2):
    geo_dist = np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)
    rating_factor = (p1[3] + p2[3]) / 2
    return geo_dist / (rating_factor if rating_factor != 0 else 1e-6)

knn_data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values
knn = NearestNeighbors(metric=custom_distance, algorithm='ball_tree').fit(knn_data)
print("Global KNN model fitted.")


# --- Phase 3: Discover the Ideal Business Ecosystem ---

def find_k_similar_features(df_ref, knn_model, data_ref, target, k):
    query_indices = df_ref.index[df_ref['Category-Subcategory'] == target].tolist()
    if not query_indices:
        print(f"Error: Target feature '{target}' not found.")
        return []
    
    query_point = data_ref[query_indices[0]]
    _, indices = knn_model.kneighbors([query_point], n_neighbors=len(df_ref))
    
    similar_features, seen_features = [], set()
    for feature in df_ref.iloc[indices[0]]['Category-Subcategory']:
        if feature not in seen_features:
            similar_features.append(feature)
            seen_features.add(feature)
            if len(similar_features) >= k: break
    return similar_features

# --- USER INPUT ---
target_feature = 'Shop-Electronics'
k = 4 
L = 5 
# --------------------

print(f"\n--- Phase 3: Finding the {k} features most similar to '{target_feature}' ---")
ideal_ecosystem = find_k_similar_features(df, knn, knn_data, target_feature, k)

if not ideal_ecosystem:
    exit()

print("Ideal Business Ecosystem Defined By:", ideal_ecosystem)


# --- Phase 4: Site Selection - Find and Rank the Top L Locations ---

print(f"\n--- Phase 4: Searching for the Top {L} Locations ---")
ecosystem_df = df[df['Category-Subcategory'].isin(ideal_ecosystem)].copy()

if ecosystem_df.empty:
    print("No locations found containing the ideal ecosystem.")
    exit()

location_scores = ecosystem_df.groupby(['Latitude', 'Longitude']).agg(
    TotalEcosystemScore=('FinalRating', 'sum'),
    BusinessCount=('Name', 'count')
).reset_index()

top_L_sites = location_scores.nlargest(L, 'TotalEcosystemScore')

print(f"\n--- TOP {L} RECOMMENDED SITE LOCATIONS FOUND ---")
print(top_L_sites.to_string(index=False))


# --- Phase 5: Visualize Results on an Interactive Map ---

print("\n--- Phase 5: Generating Interactive Map ---")

# Center the map on the average coordinates of our ecosystem
map_center = [ecosystem_df['Latitude'].mean(), ecosystem_df['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=14)

# Create feature groups for better layer control
ecosystem_group = folium.FeatureGroup(name='Ecosystem Businesses').add_to(m)
recommended_group = folium.FeatureGroup(name='Top Recommended Sites').add_to(m)

# 1. Plot all the businesses from the ecosystem as small circles
for _, row in ecosystem_df.iterrows():
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=5,
        color='#d9534f', # Red color
        fill=True,
        fill_color='#d9534f',
        popup=f"<b>{row['Name']}</b><br>Category: {row['Category-Subcategory']}<br>Rating: {row['FinalRating']}"
    ).add_to(ecosystem_group)

# 2. Plot the top L recommended sites with a distinct star icon
for rank, (_, row) in enumerate(top_L_sites.iterrows(), 1):
    folium.Marker(
        location=[row['Latitude'], row['Longitude']],
        popup=f"<b>Rank #{rank} Recommended Site</b><br>Total Score: {row['TotalEcosystemScore']:.2f}<br>Businesses Here: {int(row['BusinessCount'])}",
        icon=folium.Icon(color='green', icon='star', prefix='fa')
    ).add_to(recommended_group)

# Add a layer control to toggle views
folium.LayerControl().add_to(m)

# To display the map in a Jupyter/Kaggle notebook, simply have 'm' as the last line
print("\nMap generated. It will be displayed in the output cell below.")
m

--- Phase 1: Loading and Preparing All Data ---
Successfully loaded dataset.
Data preparation complete.

--- Phase 2: Fitting Global KNN Model ---
Global KNN model fitted.

--- Phase 3: Finding the 4 features most similar to 'Shop-Electronics' ---
Ideal Business Ecosystem Defined By: ['Shop-Electronics', 'Shop-Stationery', 'Office-AccountingFirm', 'Shop-ConvenienceStore']

--- Phase 4: Searching for the Top 5 Locations ---

--- TOP 5 RECOMMENDED SITE LOCATIONS FOUND ---
 Latitude  Longitude  TotalEcosystemScore  BusinessCount
30.724680  76.765090                 50.0             10
30.725403  76.748174                 49.0             10
30.697192  76.762405                 45.3             12
30.734566  76.752291                 45.0              9
30.718270  76.766102                 40.5              9

--- Phase 5: Generating Interactive Map ---

Map generated. It will be displayed in the output cell below.


In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
import folium
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- Phase 1: Data Loading and Preparation ---

print("--- Phase 1: Loading and Preparing All Data ---")
try:
    df = pd.read_csv('/kaggle/input/data-all/output.csv')
    print("Successfully loaded dataset.")
except FileNotFoundError:
    print("--- WARNING: Dataset not found. Creating a sample DataFrame. ---")
    data = {
        'Name': ['Best Electronics', 'City Stationery', 'A1 Accounting', 'Corner Mart', 'E-Zone', 'Paper&Pens', 'Competitor Elec.', 'Downtown Cafe', 'Main St Bank', 'Gadget Hub', 'Office Supplies Plus'],
        'Rating': [5.0, 4.8, 4.2, 4.0, 4.9, 4.1, 4.5, 4.6, 4.7, 4.8, 4.5],
        'Latitude': [30.724680, 30.724800, 30.724500, 30.725403, 30.725500, 30.725300, 30.697192, 30.725000, 30.725100, 30.724580, 30.724900],
        'Longitude': [76.765090, 76.765200, 76.764900, 76.748174, 76.748300, 76.748000, 76.762405, 76.765500, 76.765600, 76.765150, 76.765300],
        'Category': ['Shop', 'Shop', 'Office', 'Shop', 'Shop', 'Shop', 'Shop', 'Restaurant', 'Bank', 'Shop', 'Shop'],
        'Sub-Category': ['Electronics', 'Stationery', 'AccountingFirm', 'ConvenienceStore', 'Electronics', 'Stationery', 'Electronics', 'Cafe', 'NationalBank', 'Electronics', 'Stationery'],
    }
    df = pd.DataFrame(data)

df['Category-Subcategory'] = df['Category'] + '-' + df['Sub-Category']
df['FinalRating'] = df['Rating'].fillna(df['Rating'].mean())
frequency = df['Category-Subcategory'].value_counts()
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency.get(x, 1))
print("Data preparation complete.")


# --- Phase 2: Fit Global KNN Model ---

print("\n--- Phase 2: Fitting Global KNN Model ---")
def custom_distance(p1, p2):
    geo_dist = np.sqrt((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)
    rating_factor = (p1[3] + p2[3]) / 2
    return geo_dist / (rating_factor if rating_factor != 0 else 1e-6)

knn_data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values
knn = NearestNeighbors(metric=custom_distance, algorithm='ball_tree').fit(knn_data)
print("Global KNN model fitted.")


# --- Phase 3: Discover the Ideal Business Ecosystem ---

def find_k_similar_features(df_ref, knn_model, data_ref, target, k):
    query_indices = df_ref.index[df_ref['Category-Subcategory'] == target].tolist()
    if not query_indices: return []
    query_point = data_ref[query_indices[0]]
    _, indices = knn_model.kneighbors([query_point], n_neighbors=len(df_ref))
    similar_features, seen_features = [], set()
    for feature in df_ref.iloc[indices[0]]['Category-Subcategory']:
        if feature not in seen_features:
            similar_features.append(feature)
            seen_features.add(feature)
            if len(similar_features) >= k: break
    return similar_features

# --- USER INPUT ---
target_feature = 'Shop-Electronics'
k = 4 
L = 3
# --------------------

print(f"\n--- Phase 3: Finding the {k} features most similar to '{target_feature}' ---")
ideal_ecosystem = find_k_similar_features(df, knn, knn_data, target_feature, k)
if not ideal_ecosystem: exit()
print("Ideal Business Ecosystem Defined By:", ideal_ecosystem)


# --- Phase 4: Site Selection via Clustering and Weighted Centroids ---

print(f"\n--- Phase 4: Searching for the Top {L} Unique Locations via Clustering ---")
ecosystem_df = df[df['Category-Subcategory'].isin(ideal_ecosystem)].copy()

if ecosystem_df.empty:
    print("No locations found containing the ideal ecosystem.")
    exit()

coords = ecosystem_df[['Latitude', 'Longitude']].values
db = DBSCAN(eps=0.002, min_samples=2, algorithm='ball_tree').fit(coords)
ecosystem_df['cluster'] = db.labels_

clustered_df = ecosystem_df[ecosystem_df['cluster'] != -1].copy()

if clustered_df.empty:
    print("No dense clusters of ecosystem businesses found. Try increasing the 'eps' value in DBSCAN.")
    exit()

clustered_df['WeightedLat'] = clustered_df['Latitude'] * clustered_df['FinalRating']
clustered_df['WeightedLon'] = clustered_df['Longitude'] * clustered_df['FinalRating']

cluster_analysis = clustered_df.groupby('cluster').agg(
    TotalEcosystemScore=('FinalRating', 'sum'),
    BusinessCount=('Name', 'count'),
    SumWeightedLat=('WeightedLat', 'sum'),
    SumWeightedLon=('WeightedLon', 'sum')
).reset_index()

cluster_analysis['New_Latitude'] = cluster_analysis['SumWeightedLat'] / cluster_analysis['TotalEcosystemScore']
cluster_analysis['New_Longitude'] = cluster_analysis['SumWeightedLon'] / cluster_analysis['TotalEcosystemScore']

top_L_clusters = cluster_analysis.nlargest(L, 'TotalEcosystemScore')

print(f"\n--- TOP {L} RECOMMENDED NEW SITE LOCATIONS FOUND ---")
print(top_L_clusters[['New_Latitude', 'New_Longitude', 'TotalEcosystemScore', 'BusinessCount']].to_string(index=False))


# --- Phase 5: FOCUSED Visualization - Only Showing Top Sites and Their Influencers ---

print("\n--- Phase 5: Generating Focused Interactive Map ---")

map_center = [top_L_clusters['New_Latitude'].iloc[0], top_L_clusters['New_Longitude'].iloc[0]]
m = folium.Map(location=map_center, zoom_start=16, tiles='CartoDB positron')

colors = ['blue', 'orange', 'purple', 'darkgreen', 'cadetblue', 'darkred']
color_map = {category: colors[i % len(colors)] for i, category in enumerate(ideal_ecosystem)}

# 1. Identify which businesses are the "influencers" (i.e., part of the top L clusters)
top_cluster_ids = top_L_clusters['cluster'].tolist()
influencers_df = clustered_df[clustered_df['cluster'].isin(top_cluster_ids)]

# Create a layer group for each ranked site for better control
for rank, (_, site_row) in enumerate(top_L_clusters.iterrows(), 1):
    site_group = folium.FeatureGroup(name=f'Rank #{rank} Site (Score: {site_row["TotalEcosystemScore"]:.2f})').add_to(m)
    
    # Get the businesses that belong to this specific cluster
    cluster_id = site_row['cluster']
    cluster_businesses = influencers_df[influencers_df['cluster'] == cluster_id]
    
    # A. Plot the businesses in this cluster (the influencers)
    for _, business_row in cluster_businesses.iterrows():
        cat_color = color_map.get(business_row['Category-Subcategory'], 'gray')
        folium.CircleMarker(
            location=[business_row['Latitude'], business_row['Longitude']],
            radius=6,
            color=cat_color,
            fill=True,
            fill_color=cat_color,
            fill_opacity=0.8,
            popup=f"<b>Influencer for Rank #{rank}</b><br>{business_row['Name']}<br>Category: {business_row['Category-Subcategory']}<br>Rating: {business_row['FinalRating']}"
        ).add_to(site_group)

    # B. Plot the NEW recommended site location (the green star)
    folium.Marker(
        location=[site_row['New_Latitude'], site_row['New_Longitude']],
        popup=f"<b>Rank #{rank} NEW Site</b><br>Cluster Score: {site_row['TotalEcosystemScore']:.2f}<br>Influenced by: {int(site_row['BusinessCount'])} businesses",
        icon=folium.Icon(color='green', icon='star', prefix='fa')
    ).add_to(site_group)
    
    # C. (Optional) Draw lines from influencers to their calculated center
    for _, business_row in cluster_businesses.iterrows():
        folium.PolyLine(
            locations=[(business_row['Latitude'], business_row['Longitude']), (site_row['New_Latitude'], site_row['New_Longitude'])],
            color='gray',
            weight=1,
            opacity=0.6,
            dash_array='5, 5'
        ).add_to(site_group)

folium.LayerControl().add_to(m)
print("\nMap generated. It will be displayed in the output cell below.")



--- Phase 1: Loading and Preparing All Data ---
Successfully loaded dataset.
Data preparation complete.

--- Phase 2: Fitting Global KNN Model ---
Global KNN model fitted.

--- Phase 3: Finding the 4 features most similar to 'Shop-Electronics' ---
Ideal Business Ecosystem Defined By: ['Shop-Electronics', 'Shop-Stationery', 'Office-AccountingFirm', 'Shop-ConvenienceStore']

--- Phase 4: Searching for the Top 3 Unique Locations via Clustering ---

--- TOP 3 RECOMMENDED NEW SITE LOCATIONS FOUND ---
 New_Latitude  New_Longitude  TotalEcosystemScore  BusinessCount
    30.724164      76.765025                146.8             31
    30.733355      76.737414                125.0             28
    30.705086      76.758565                112.4             24

--- Phase 5: Generating Focused Interactive Map ---

Map generated. It will be displayed in the output cell below.


In [None]:
g