In [1]:
# ==========================================
# SECTION 1: IMPORTS AND SETUP
# ==========================================

import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Utilities
import math
import json
from datetime import datetime
from typing import List, Dict, Tuple, Optional

print("✅ All libraries imported successfully!")


✅ All libraries imported successfully!


In [2]:
import pandas as pd
import numpy as np
import random

# Your original complaint templates
complaint_templates = {
    # ... (your templates remain the same) ...
    "Water": [
        "No water supply since morning in my area",
        "Drinking water pipeline is leaking badly",
        "Water supply is irregular for the last 3 days",
        "The tap water is muddy and not clean",
        "Water tank in my street is overflowing",
        "Broken water line on main street",
        "Hydrant is leaking badly"
    ],
    "Electricity": [
        "Power cut in my area for more than 2 hours",
        "Street lights are not working for 3 days",
        "Voltage fluctuation is damaging appliances",
        "Complete blackout in our building since last night",
        "Transformer making loud noise near my house",
        "Wires are sparking near the school",
        "A power pole is leaning dangerously"
    ],
    "Garbage": [
        "Garbage has not been collected for a week",
        "Overflowing dustbins causing foul smell",
        "Stray dogs scattering garbage everywhere",
        "Waste burning near my street, creating smoke",
        "Garbage pile blocking the footpath"
    ],
    "Road": [
        "Large pothole near my house, dangerous for bikers",
        "Broken road causing traffic jam daily",
        "Speed breakers are too high and damaging vehicles",
        "Construction material lying on the road",
        "Rainwater logged on road making it slippery",
        "Road has large cracks and needs repair",
        "Fallen tree blocking the street"
    ],
    "Parking": [
        "Cars are being parked illegally blocking my gate",
        "Too many vehicles parked on footpath",
        "No space left in residential parking area",
        "Trucks parked on narrow road blocking way",
        "People are parking on both sides of the road"
    ],
    "Drainage": [
        "Drainage water overflowing on the street",
        "Manhole cover is missing, dangerous for children",
        "Blocked drainage causing bad smell",
        "Sewage water mixing with drinking water",
        "Drainage water collected in front of my house"
    ],
    "Fire": [
        "Fire broke out in a shop nearby, need urgent help",
        "Smoke coming from an apartment, possible fire",
        "Small fire in garbage dump spreading fast",
        "Transformer caught fire near main road",
        "Short circuit caused fire in building basement"
    ],
    "Other": [
        "Too many stray dogs chasing people",
        "Loud construction work disturbing at night",
        "Street flooded after yesterday's rain",
        "Tree fallen on road blocking traffic",
        "Unauthorized construction blocking pathway"
    ]
}

# A wider, more realistic set of hotspots for a larger city area
category_hotspots = {
    "Water": [{'lat': 29.390, 'lon': 79.460}, {'lat': 29.355, 'lon': 79.482}, {'lat': 29.399, 'lon': 79.421}],
    "Electricity": [{'lat': 29.385, 'lon': 79.450}, {'lat': 29.361, 'lon': 79.495}, {'lat': 29.390, 'lon': 79.435}],
    "Garbage": [{'lat': 29.395, 'lon': 79.455}, {'lat': 29.378, 'lon': 79.470}, {'lat': 29.385, 'lon': 79.442}],
    "Road": [{'lat': 29.401, 'lon': 79.452}, {'lat': 29.370, 'lon': 79.465}, {'lat': 29.389, 'lon': 79.480}],
    "Parking": [{'lat': 29.388, 'lon': 79.458}, {'lat': 29.350, 'lon': 79.440}, {'lat': 29.405, 'lon': 79.455}],
    "Drainage": [{'lat': 29.392, 'lon': 79.463}, {'lat': 29.365, 'lon': 79.445}, {'lat': 29.410, 'lon': 79.470}],
    "Fire": [{'lat': 29.380, 'lon': 79.457}, {'lat': 29.393, 'lon': 79.459}, {'lat': 29.372, 'lon': 79.488}],
    "Other": [{'lat': 29.397, 'lon': 79.451}, {'lat': 29.382, 'lon': 79.468}, {'lat': 29.369, 'lon': 79.475}]
}

# New function to check if a point is within the Nainital district bounds
def is_within_bounds(lat, lon):
    min_lat, max_lat = 28.9755, 29.6126
    min_lon, max_lon = 78.8531, 79.9731
    return min_lat <= lat <= max_lat and min_lon <= lon <= max_lon

def generate_complaint(category, hotspots, radius_meters):
    hotspot = random.choice(hotspots[category])
    hotspot_lat = hotspot['lat']
    hotspot_lon = hotspot['lon']
    complaint_text = random.choice(complaint_templates[category])
    
    deg_lat_per_meter = 1 / 111111 
    deg_lon_per_meter = 1 / (111111 * np.cos(np.radians(hotspot_lat)))
    
    random_dist_meters = random.uniform(0, radius_meters)
    random_angle = random.uniform(0, 2 * np.pi)
    
    delta_lat = random_dist_meters * np.cos(random_angle) * deg_lat_per_meter
    delta_lon = random_dist_meters * np.sin(random_angle) * deg_lon_per_meter
    
    new_lat = hotspot_lat + delta_lat
    new_lon = hotspot_lon + delta_lon
    
    # Check if the new location is within bounds, if not, try again
    if is_within_bounds(new_lat, new_lon):
        return [new_lat, new_lon, complaint_text, category]
    else:
        # Recursively call the function until a valid point is generated
        return generate_complaint(category, hotspots, radius_for_clustering)

# Generate dataset
data = []
complaint_id = 1
distribution = {
    "Water": 1000, "Electricity": 1000, "Garbage": 1000,
    "Road": 1000, "Parking": 1000, "Drainage": 1000,
    "Fire": 1000, "Other": 1000
}
radius_for_clustering = 30

for category, count in distribution.items():
    for _ in range(count):
        lat, lon, complaint_text, cat = generate_complaint(
            category, category_hotspots, radius_for_clustering
        )
        data.append([complaint_id, complaint_text, lat, lon, cat])
        complaint_id += 1

# Create DataFrame
df = pd.DataFrame(data, columns=["complaint_id", "complaint", "latitude", "longitude", "category"])
print(f"✅ Dataset created with {len(df)} complaints")
print(df.head())

✅ Dataset created with 8000 complaints
   complaint_id                                 complaint   latitude  \
0             1    Water tank in my street is overflowing  29.354970   
1             2                  Hydrant is leaking badly  29.355020   
2             3  Drinking water pipeline is leaking badly  29.399063   
3             4                  Hydrant is leaking badly  29.355014   
4             5      The tap water is muddy and not clean  29.390032   

   longitude category  
0  79.481963    Water  
1  79.482123    Water  
2  79.421095    Water  
3  79.482024    Water  
4  79.459929    Water  


In [5]:
import pandas as pd
import numpy as np
import pickle
import warnings
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from datetime import datetime

warnings.filterwarnings('ignore')

class ComplaintDBSCANClustering:
    def __init__(self, eps_distance=0.05, min_samples=3, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """
        Initializes the DBSCAN clustering system.

        Args:
            eps_distance (float): The maximum distance between two samples for them to be considered
                                 as in the same neighborhood. This is a combined metric of text and GPS distance.
            min_samples (int): The number of samples in a neighborhood for a point to be considered a core point.
        """
        self.eps_distance = eps_distance
        self.min_samples = min_samples
        self.model_name = model_name
        
        print("🚀 Loading Sentence-BERT model...")
        self.sentence_model = SentenceTransformer(model_name)
        print("✅ Model loaded successfully!")
        
        self.clusters = None
        self.group_id_counter = 1

    def create_features(self, df):
        """
        Creates a combined feature vector from text embeddings and scaled GPS data.
        """
        print("📊 Creating text embeddings...")
        text_embeddings = self.sentence_model.encode(df['complaint'].tolist(), show_progress_bar=True)
        print("✅ Text embeddings created!")
        
        # Scale the GPS coordinates to a similar range as the text embeddings
        print("📍 Scaling GPS coordinates...")
        gps_coords = df[['latitude', 'longitude']].values
        scaler = StandardScaler()
        scaled_gps = scaler.fit_transform(gps_coords)
        print("✅ GPS coordinates scaled!")

        # Combine the features into a single array
        combined_features = np.hstack((text_embeddings, scaled_gps))
        return combined_features

    def run_clustering(self, combined_features):
        """
        Runs the DBSCAN clustering algorithm on the combined features.
        """
        print(f"\n🧠 Running DBSCAN clustering with eps={self.eps_distance}, min_samples={self.min_samples}...")
        
        dbscan = DBSCAN(eps=self.eps_distance, min_samples=self.min_samples, metric='euclidean')
        clusters = dbscan.fit_predict(combined_features)
        
        print("✅ Clustering complete!")
        return clusters

    def assign_groups_to_df(self, df, clusters):
        """
        Assigns the cluster labels to the DataFrame and creates complaint groups.
        """
        df['cluster_id'] = clusters
        self.clusters = clusters
        
        self.complaint_groups = {}
        unique_clusters = set(clusters)
        
        print("📂 Assigning complaints to groups...")
        for cluster_id in unique_clusters:
            if cluster_id == -1:
                # -1 cluster ID means noise (unclustered complaints)
                continue
            
            group_df = df[df['cluster_id'] == cluster_id]
            group_id = f"G{str(self.group_id_counter).zfill(3)}"
            self.group_id_counter += 1
            
            # Create a list of complaints in the group
            complaints_list = group_df.to_dict('records')
            
            # Calculate the center of the group
            lats = group_df['latitude'].values
            lons = group_df['longitude'].values

            self.complaint_groups[group_id] = {
                'group_id': group_id,
                'priority': len(complaints_list),
                'complaints': complaints_list,
                'center_latitude': np.mean(lats),
                'center_longitude': np.mean(lons),
                'category': complaints_list[0]['category'] # Take the category of the first complaint
            }
        print("✅ Groups assigned!")

    def get_high_priority_groups(self, min_priority=3):
        """Get groups with priority >= min_priority."""
        high_priority = {
            group_id: data for group_id, data in self.complaint_groups.items() 
            if data['priority'] >= min_priority
        }
        return high_priority

    def save_model_package(self, filename='complaint_clustering_model.pkl'):
        """Save the processed groups and other data for backend integration."""
        model_package = {
            'complaint_groups': self.complaint_groups,
            'config': {
                'eps_distance': self.eps_distance,
                'min_samples': self.min_samples,
                'model_name': self.model_name
            },
            'created_at': datetime.now().isoformat(),
            'total_groups': len(self.complaint_groups),
            'total_complaints': len(self.clusters)
        }
        
        with open(filename, 'wb') as f:
            pickle.dump(model_package, f)
        
        print(f"✅ Model package saved as {filename}")

In [6]:
# Assuming 'df' is your DataFrame from the previous data generation step
print("==========================================")
print("  Running New DBSCAN Clustering Model     ")
print("==========================================")

# Initialize the new clustering system
clustering_system = ComplaintDBSCANClustering(
    eps_distance=0.5,  # You may need to tune this value
    min_samples=3      # A group needs at least 3 complaints to be considered a cluster
)

# Create combined features
combined_features = clustering_system.create_features(df)

# Run the clustering
clusters = clustering_system.run_clustering(combined_features)

# Assign the results back to the DataFrame
clustering_system.assign_groups_to_df(df.copy(), clusters)

# Print a summary of the results
print(f"\n📊 CLUSTERING RESULTS:")
total_groups = len(clustering_system.complaint_groups)
print(f"Total Clusters Found: {total_groups}")
noise_points = list(clusters).count(-1)
print(f"Noise Points (unclustered): {noise_points}")
print(f"Complaints in Clusters: {len(df) - noise_points}")

# Get and print high-priority groups
high_priority_groups = clustering_system.get_high_priority_groups(min_priority=3)
print(f"High Priority Groups (≥3 complaints): {len(high_priority_groups)}")

# Save the results
clustering_system.save_model_package()

print("\n🎉 DBSCAN clustering is complete and files are ready for the backend.")

  Running New DBSCAN Clustering Model     
🚀 Loading Sentence-BERT model...
✅ Model loaded successfully!
📊 Creating text embeddings...


Batches: 100%|███████████████████████████████████████████████████████████████████████| 250/250 [00:08<00:00, 29.59it/s]


✅ Text embeddings created!
📍 Scaling GPS coordinates...
✅ GPS coordinates scaled!

🧠 Running DBSCAN clustering with eps=0.5, min_samples=3...
✅ Clustering complete!
📂 Assigning complaints to groups...
✅ Groups assigned!

📊 CLUSTERING RESULTS:
Total Clusters Found: 138
Noise Points (unclustered): 0
Complaints in Clusters: 8000
High Priority Groups (≥3 complaints): 138
✅ Model package saved as complaint_clustering_model.pkl

🎉 DBSCAN clustering is complete and files are ready for the backend.


In [7]:
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming you have the 'combined_features' and 'clusters' from your DBSCAN model

# Get rid of noise points (-1) before calculating the score
valid_indices = clusters != -1
valid_features = combined_features[valid_indices]
valid_clusters = clusters[valid_indices]

if len(np.unique(valid_clusters)) > 1:
    score = silhouette_score(valid_features, valid_clusters)
    print(f"\n📊 Complaint Clustering Model Silhouette Score: {score:.4f}")
else:
    print("\n⚠️ Not enough clusters to calculate Silhouette Score.")


📊 Complaint Clustering Model Silhouette Score: 0.9863


In [8]:
import pandas as pd
import numpy as np
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from datetime import datetime
import torch

# ==========================================
# SECTION 1: LOAD YOUR TRAINED MODEL ONCE
# ==========================================

print("🚀 Loading Complaint DBSCAN Clustering Model...")
with open('complaint_clustering_model.pkl', 'rb') as f:
    model_package = pickle.load(f)

# The model package contains the processed groups
complaint_groups = model_package['complaint_groups']

# Load the Sentence-BERT model only once
sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Create a class that loads everything at startup
# Create a class that loads everything at startup
class RealtimeDBSCANProcessor:
    def __init__(self, sentence_model, complaint_groups, eps_distance=0.5): # <-- Corrected init method
        self.sentence_model = sentence_model
        self.complaint_groups = complaint_groups
        self.eps_distance = eps_distance

    def process_new_complaint(self, new_complaint_text, new_lat, new_lon):
        # Create embedding for the new complaint
        new_embedding = self.sentence_model.encode([new_complaint_text]).reshape(1, -1)
        
        # We need to find the closest existing complaint group
        min_distance = float('inf')
        best_group_id = None
        
        for group_id, group_data in self.complaint_groups.items():
            # Get the embedding for a representative complaint from the existing group
            representative_complaint_text = group_data['complaints'][0]['complaint']
            representative_embedding = self.sentence_model.encode([representative_complaint_text]).reshape(1, -1)
            
            # Combine text and GPS for distance calculation
            combined_new = np.hstack((new_embedding, StandardScaler().fit_transform(np.array([[new_lat, new_lon]]))))
            combined_rep = np.hstack((representative_embedding, StandardScaler().fit_transform(np.array([[group_data['center_latitude'], group_data['center_longitude']]]))))
            
            distance = np.linalg.norm(combined_new - combined_rep)

            if distance < min_distance:
                min_distance = distance
                best_group_id = group_id

        if min_distance <= self.eps_distance:
            # Merged with an existing group
            return {
                'action': 'merged',
                'group_id': best_group_id,
                'distance': min_distance,
                'total_complaints': len(self.complaint_groups[best_group_id]['complaints'])
            }
        else:
            # New group
            return {
                'action': 'new_group',
                'distance': min_distance,
                'total_complaints': 1
            }

print("✅ Backend Model Ready! You can now use the `process_new_complaint` function.")

# ==========================================
# SECTION 2: FAST REAL-TIME TESTING
# ==========================================

processor = RealtimeDBSCANProcessor(sentence_model, complaint_groups, eps_distance=0.7)

print("\n" + "="*60)
print("FAST TESTING PHASE")
print("="*60)

# Test 1: New complaint
result1 = processor.process_new_complaint("Water is not coming since morning", 29.391, 79.461)
print(f"Result 1: {result1}")

# Test 2: Similar complaint that should merge
result2 = processor.process_new_complaint("No water supply in this area", 29.390, 79.460)
print(f"Result 2: {result2}")

🚀 Loading Complaint DBSCAN Clustering Model...
✅ Backend Model Ready! You can now use the `process_new_complaint` function.

FAST TESTING PHASE
Result 1: {'action': 'merged', 'group_id': 'G010', 'distance': np.float64(0.6623924859390872), 'total_complaints': 54}
Result 2: {'action': 'merged', 'group_id': 'G010', 'distance': np.float64(0.5923695250521482), 'total_complaints': 54}


# Red Zone Analysis 

In [9]:
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

# Assuming 'df' is your DataFrame from the previous data generation step

# --- Red Zone Detector Class ---
class RedZoneDetector:
    def __init__(self, grid_size_meters=500):
        self.grid_size_meters = grid_size_meters
        self.grid_data = {}  # Stores data for each grid

    def _get_grid_id(self, lat, lon):
        """Calculates a unique ID for a grid square based on its coordinates."""
        lat_per_meter = 1 / 111111
        lon_per_meter = 1 / (111111 * np.cos(np.radians(lat)))
        
        grid_lat = int(lat / (self.grid_size_meters * lat_per_meter))
        grid_lon = int(lon / (self.grid_size_meters * lon_per_meter))
        
        return f"{grid_lat}_{grid_lon}"

    def assign_complaints_to_grids(self, complaints: pd.DataFrame):
        """Assigns each complaint to its corresponding grid square."""
        self.grid_data = {}
        for _, row in complaints.iterrows():
            grid_id = self._get_grid_id(row['latitude'], row['longitude'])
            if grid_id not in self.grid_data:
                self.grid_data[grid_id] = {'count': 0, 'complaints': []}
            self.grid_data[grid_id]['count'] += 1
            self.grid_data[grid_id]['complaints'].append(row.to_dict())

    def get_map_data(self):
        """Prepares the data for Google Maps visualization with risk levels."""
        map_data = {'zones': []}
        for grid_id, data in self.grid_data.items():
            count = data['count']
            
            if count >= 50: risk, color = "RED", "#FF0000"
            elif count >= 25: risk, color = "ORANGE", "#FFA500"
            elif count >= 10: risk, color = "YELLOW", "#FFFF00"
            else: risk, color = "GREEN", "#00FF00"
            
            if count > 0:
                lats = [c['latitude'] for c in data['complaints']]
                lons = [c['longitude'] for c in data['complaints']]
                center_lat, center_lon = np.mean(lats), np.mean(lons)
                map_data['zones'].append({
                    'grid_id': grid_id, 
                    'complaint_count': count, 
                    'risk_level': risk,
                    'color': color, 
                    'center_lat': center_lat, 
                    'center_lon': center_lon
                })
        return map_data

# --- Execution ---
print("🚀 Running Red Zone Detection Model...")
red_zone_detector = RedZoneDetector()
red_zone_detector.assign_complaints_to_grids(df)
map_data = red_zone_detector.get_map_data()

with open('red_zone_map_data.json', 'w') as f:
    json.dump(map_data, f, indent=4)
print(f"✅ Red zone map data file saved as 'red_zone_map_data.json'")

🚀 Running Red Zone Detection Model...
✅ Red zone map data file saved as 'red_zone_map_data.json'


In [10]:
import pandas as pd
import numpy as np
import json
import warnings
warnings.filterwarnings('ignore')

# --- Red Zone Detector Class ---
# This class contains the logic to divide the map into a grid and detect high-density areas.
class RedZoneDetector:
    def __init__(self, grid_size_meters=500):
        self.grid_size_meters = grid_size_meters
        self.grid_data = {}

    def _get_grid_id(self, lat, lon):
        """Calculates a unique ID for a grid square based on its coordinates."""
        lat_per_meter = 1 / 111111
        lon_per_meter = 1 / (111111 * np.cos(np.radians(lat)))
        grid_lat = int(lat / (self.grid_size_meters * lat_per_meter))
        grid_lon = int(lon / (self.grid_size_meters * lon_per_meter))
        return f"{grid_lat}_{grid_lon}"

    def assign_complaints_to_grids(self, complaints: pd.DataFrame):
        """Assigns each complaint to its corresponding grid square."""
        self.grid_data = {}
        for _, row in complaints.iterrows():
            grid_id = self._get_grid_id(row['latitude'], row['longitude'])
            if grid_id not in self.grid_data:
                self.grid_data[grid_id] = {'count': 0, 'complaints': []}
            self.grid_data[grid_id]['count'] += 1
            self.grid_data[grid_id]['complaints'].append(row.to_dict())

    def get_map_data(self):
        """Prepares the data for Google Maps visualization with risk levels."""
        map_data = {'zones': []}
        for grid_id, data in self.grid_data.items():
            count = data['count']
            if count >= 50: risk, color = "RED", "#FF0000"
            elif count >= 25: risk, color = "ORANGE", "#FFA500"
            elif count >= 10: risk, color = "YELLOW", "#FFFF00"
            else: risk, color = "GREEN", "#00FF00"
            if count > 0:
                lats = [c['latitude'] for c in data['complaints']]
                lons = [c['longitude'] for c in data['complaints']]
                center_lat, center_lon = np.mean(lats), np.mean(lons)
                map_data['zones'].append({
                    'grid_id': grid_id,
                    'complaint_count': count,
                    'risk_level': risk,
                    'color': color,
                    'center_lat': center_lat,
                    'center_lon': center_lon
                })
        return map_data

# --- Execution ---
# Assumes 'df' is a pandas DataFrame with complaint data
print("🚀 Running Red Zone Detection Model...")
red_zone_detector = RedZoneDetector()
red_zone_detector.assign_complaints_to_grids(df)
map_data = red_zone_detector.get_map_data()

with open('red_zone_map_data.json', 'w') as f:
    json.dump(map_data, f, indent=4)
print(f"✅ Red zone map data file saved as 'red_zone_map_data.json'")

🚀 Running Red Zone Detection Model...
✅ Red zone map data file saved as 'red_zone_map_data.json'


In [11]:
import folium
import json

# Define the center of your map (Nainital coordinates)
map_center = [29.3909, 79.4632]

# Create a base map
my_map = folium.Map(location=map_center, zoom_start=14)

# Load the JSON data generated by your model
with open('red_zone_map_data.json', 'r') as f:
    map_data = json.load(f)

# Loop through each zone and add a circle marker to the map
for zone in map_data['zones']:
    folium.Circle(
        location=[zone['center_lat'], zone['center_lon']],
        radius=300, # A 500-meter radius to match your grid size
        color=zone['color'],
        fill=True,
        fillColor=zone['color'],
        fillOpacity=0.5,
        tooltip=f"<b>Risk Level:</b> {zone['risk_level']}<br>"
                f"<b>Complaints:</b> {zone['complaint_count']}<br>"
                f"<b>Grid ID:</b> {zone['grid_id']}"
    ).add_to(my_map)

# Add a simple legend
legend_html = '''
     <div style="position: fixed; 
                 bottom: 20px; left: 20px; width: 150px; height: 120px; 
                 border:2px solid grey; z-index:9999; font-size:14px;
                 background-color:white; opacity:0.9;">
       &nbsp; <b>Risk Levels</b> <br>
       &nbsp; <i style="background:#FF0000; display:inline-block; border:1px solid black; width:10px; height:10px; border-radius:50%;"></i> Red Zone (50+)<br>
       &nbsp; <i style="background:#FFA500; display:inline-block; border:1px solid black; width:10px; height:10px; border-radius:50%;"></i> Orange Zone (25-49)<br>
       &nbsp; <i style="background:#FFFF00; display:inline-block; border:1px solid black; width:10px; height:10px; border-radius:50%;"></i> Yellow Zone (10-24)<br>
       &nbsp; <i style="background:#00FF00; display:inline-block; border:1px solid black; width:10px; height:10px; border-radius:50%;"></i> Green Zone (0-9)<br>
     </div>
     '''
my_map.get_root().html.add_child(folium.Element(legend_html))


# Save the map to an HTML file
my_map.save("red_zone_map.html")
print("Map saved to 'red_zone_map.html'. Open this file in your browser to view it.")

Map saved to 'red_zone_map.html'. Open this file in your browser to view it.


# Priority prediction 

In [12]:
# Create a new column to store the priority label
df['priority_label'] = 'low'

# Define high-priority categories
high_priority_categories = ['Fire', 'Water', 'Electricity', 'Drainage', 'Road']

# Loop through the DataFrame and assign 'high' priority based on the category
for category in high_priority_categories:
    df.loc[df['category'] == category, 'priority_label'] = 'high'

# Display a sample of the new DataFrame
print("✅ Sample Labeled Dataset:")
print(df[['complaint', 'category', 'priority_label']].head(10))

# Print the final count of each priority to check the balance
print("\n📊 Priority Label Distribution:")
print(df['priority_label'].value_counts())

✅ Sample Labeled Dataset:
                                       complaint category priority_label
0         Water tank in my street is overflowing    Water           high
1                       Hydrant is leaking badly    Water           high
2       Drinking water pipeline is leaking badly    Water           high
3                       Hydrant is leaking badly    Water           high
4           The tap water is muddy and not clean    Water           high
5       Drinking water pipeline is leaking badly    Water           high
6       Drinking water pipeline is leaking badly    Water           high
7       Drinking water pipeline is leaking badly    Water           high
8  Water supply is irregular for the last 3 days    Water           high
9               Broken water line on main street    Water           high

📊 Priority Label Distribution:
priority_label
high    5000
low     3000
Name: count, dtype: int64


In [13]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW  # <-- Correct Import from torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pandas as pd
import numpy as np
import pickle

print("✅ All libraries loaded.")

✅ All libraries loaded.


In [14]:
# Assuming 'df' is your DataFrame from the previous steps with 'complaint' and 'priority_label'
# Map priority labels to integers (0 for low, 1 for high)
priority_map = {'low': 0, 'high': 1}
df['label'] = df['priority_label'].map(priority_map)

# Load the BERT tokenizer
print("🚀 Loading BERT tokenizer...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize all of the complaints in your dataset
input_ids = []
attention_masks = []

for complaint in tqdm(df.complaint.values, desc="Tokenizing"):
    encoded_dict = tokenizer.encode_plus(
        complaint,
        add_special_tokens=True,
        max_length=64, # Use the same fixed length for all sentences
        truncation=True, # Explicitly truncate longer sequences
        padding='max_length', # Explicitly pad shorter sequences
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(df.label.values)

# Split data into training and validation sets (85% train, 15% val)
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
    input_ids, labels, random_state=42, test_size=0.15, stratify=labels
)
train_masks, validation_masks, _, _ = train_test_split(
    attention_masks, labels, random_state=42, test_size=0.15, stratify=labels
)

# Create the DataLoader for our training set
batch_size = 32
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

print("✅ Data preparation complete!")

🚀 Loading BERT tokenizer...


Tokenizing: 100%|████████████████████████████████████████████████████████████████| 8000/8000 [00:02<00:00, 3623.06it/s]


✅ Data preparation complete!


In [15]:
# Load BertForSequenceClassification, the pre-trained BERT model with a single classification layer
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,    # The number of output labels (high and low)
    output_attentions=False,
    output_hidden_states=False,
)

# Tell the model we'll be using a GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"✅ Using device: {device}")

# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training loop
print("\n🚀 Starting model fine-tuning...")
for epoch in range(epochs):
    print(f"======== Epoch {epoch + 1} / {epochs} ========")
    model.train() # Set the model to training mode

    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        model.zero_grad() # Clear any previously calculated gradients

        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward() # Perform a backward pass to calculate gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Clip gradients to prevent exploding gradients

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"  Average training loss: {avg_train_loss:.2f}")

    # Validation step
    print("\n  Running Validation...")
    model.eval() # Set the model to evaluation mode

    total_eval_loss = 0
    for batch in tqdm(validation_dataloader, desc="Validating"):
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
        loss = outputs.loss
        total_eval_loss += loss.item()
    
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print(f"  Validation Loss: {avg_val_loss:.2f}")

print("\n✅ Fine-tuning complete!")

# Save the fine-tuned model
torch.save(model.state_dict(), 'priority_prediction_model.pth')
print("✅ Model saved to 'priority_prediction_model.pth'")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Using device: cpu

🚀 Starting model fine-tuning...


Training: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [17:38<00:00,  4.97s/it]


  Average training loss: 0.05

  Running Validation...


Validating: 100%|██████████████████████████████████████████████████████████████████████| 38/38 [00:46<00:00,  1.23s/it]


  Validation Loss: 0.00


Training: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [18:25<00:00,  5.19s/it]


  Average training loss: 0.00

  Running Validation...


Validating: 100%|██████████████████████████████████████████████████████████████████████| 38/38 [00:46<00:00,  1.23s/it]


  Validation Loss: 0.00


Training: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [17:39<00:00,  4.98s/it]


  Average training loss: 0.00

  Running Validation...


Validating: 100%|██████████████████████████████████████████████████████████████████████| 38/38 [00:48<00:00,  1.28s/it]


  Validation Loss: 0.00


Training: 100%|██████████████████████████████████████████████████████████████████████| 213/213 [20:55<00:00,  5.89s/it]


  Average training loss: 0.00

  Running Validation...


Validating: 100%|██████████████████████████████████████████████████████████████████████| 38/38 [00:56<00:00,  1.50s/it]


  Validation Loss: 0.00

✅ Fine-tuning complete!
✅ Model saved to 'priority_prediction_model.pth'
