In [7]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# States mapped to the districts to simulate (a hand-picked sample, not an exhaustive list of India's districts)
states_districts = {
    "Andhra Pradesh": ["Anantapur", "Chittoor", "East Godavari", "Guntur", "Krishna", "Kurnool"],
    "Arunachal Pradesh": ["Tawang", "Itanagar", "Papum Pare"],
    "Assam": ["Guwahati", "Dibrugarh", "Jorhat", "Silchar"],
    "Bihar": ["Patna", "Gaya", "Bhagalpur", "Muzaffarpur"],
    "Chhattisgarh": ["Raipur", "Bilaspur", "Durg"],
    "Delhi": ["New Delhi", "North Delhi", "South Delhi"],
    "Goa": ["North Goa", "South Goa"],
    "Gujarat": ["Ahmedabad", "Surat", "Vadodara", "Rajkot"],
    "Haryana": ["Gurgaon", "Faridabad", "Panipat"],
    "Himachal Pradesh": ["Shimla", "Kangra", "Mandi"],
    "Jharkhand": ["Ranchi", "Jamshedpur", "Dhanbad"],
    "Karnataka": ["Bangalore", "Mysore", "Mangalore"],
    "Kerala": ["Thiruvananthapuram", "Kochi", "Kozhikode"],
    "Madhya Pradesh": ["Bhopal", "Indore", "Gwalior"],
    "Maharashtra": ["Mumbai", "Pune", "Nagpur", "Nashik"],
    "Manipur": ["Imphal East", "Imphal West"],
    "Meghalaya": ["Shillong", "Tura"],
    "Mizoram": ["Aizawl", "Lunglei"],
    "Nagaland": ["Kohima", "Dimapur"],
    "Odisha": ["Bhubaneswar", "Cuttack", "Rourkela"],
    "Punjab": ["Ludhiana", "Amritsar", "Jalandhar"],
    "Rajasthan": ["Jaipur", "Jodhpur", "Udaipur"],
    "Sikkim": ["Gangtok"],
    "Tamil Nadu": ["Chennai", "Coimbatore", "Madurai"],
    "Telangana": ["Hyderabad", "Warangal", "Karimnagar"],
    "Tripura": ["Agartala"],
    "Uttar Pradesh": ["Lucknow", "Kanpur", "Varanasi", "Agra"],
    "Uttarakhand": ["Dehradun", "Haridwar"],
    "West Bengal": ["Kolkata", "Darjeeling", "Howrah"],
}

# ✅ Generate Station Codes based on State & District Names
def _abbreviate_name(name):
    """Return an upper-case short form of ``name``.

    The short form is the first three characters of the first word followed
    by the first character of every later word, so single-word names keep the
    plain three-letter prefix while multi-word names that share a first word
    (e.g. "Imphal East" / "Imphal West") stay distinguishable.
    An empty or whitespace-only name yields an empty string.
    """
    words = name.split()
    if not words:
        return ""
    head, rest = words[0], words[1:]
    return (head[:3] + "".join(word[0] for word in rest)).upper()


def generate_station_code(state, district):
    """Build a station identifier of the form ``<STATE>_<DISTRICT>``.

    Args:
        state: State name, e.g. "Manipur".
        district: District name, e.g. "Imphal East".

    Returns:
        The abbreviated state and district joined by an underscore,
        e.g. "MAN_IMPE". Single-word names abbreviate to their first three
        letters, so "Assam"/"Guwahati" still gives "ASS_GUW".
    """
    return f"{_abbreviate_name(state)}_{_abbreviate_name(district)}"

# Daily timestamps covering calendar year 2023, both endpoints included
start_date, end_date = datetime(2023, 1, 1), datetime(2023, 12, 31)
date_range = [
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

# ✅ Function to generate synthetic data
def generate_realistic_data(state, district):
    """Create one synthetic daily record per entry of the module-level ``date_range``.

    Args:
        state: State name stored in each record and used for the station code.
        district: District name stored in each record and used for the station code.

    Returns:
        A list of dicts, one per date, with the keys "StationCode", "State",
        "District", "Timestamp" (formatted ``YYYY-MM-DD``) and four measurement
        columns drawn independently from uniform distributions.
    """
    code = generate_station_code(state, district)
    rows = []

    for day in date_range:
        # Coin flip first: 60% of days get rain drawn from [0, 150), the rest get 0
        if random.random() < 0.6:
            rain_mm = np.random.uniform(0, 150)
        else:
            rain_mm = 0
        groundwater_m = np.random.uniform(5, 20)   # between 5 m and 20 m
        temperature_c = np.random.uniform(15, 45)  # between 15 °C and 45 °C
        river_m = np.random.uniform(1, 15)         # between 1 m and 15 m

        rows.append({
            "StationCode": code,
            "State": state,
            "District": district,
            "Timestamp": day.strftime("%Y-%m-%d"),
            "Rainfall (mm)": round(rain_mm, 1),
            "Groundwater Level (m)": round(groundwater_m, 2),
            "Temperature (°C)": round(temperature_c, 1),
            "River Water Level (m)": round(river_m, 2),
        })

    return rows

# Seed both random sources used by generate_realistic_data (the stdlib `random`
# module and NumPy's global RNG) so a fresh Restart & Run All writes the same CSV.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Generate data for every (state, district) pair in states_districts
all_data = []
for state, districts in states_districts.items():
    for district in districts:
        all_data.extend(generate_realistic_data(state, district))

# Build the DataFrame once from the accumulated list of records
df = pd.DataFrame(all_data)

# Save data as CSV (no index column)
df.to_csv("india_water_data.csv", index=False)

print("Synthetic water data saved as india_water_data.csv")


Synthetic water data saved as india_water_data.csv


In [16]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Approximate rainfall bounds per season, stored as (low, high) pairs
SEASONAL_RAINFALL = dict(
    winter=(0, 20),     # January - February, November - December
    summer=(0, 50),     # March - May
    monsoon=(50, 200),  # June - October
)

# Districts to simulate, keyed by state
states_districts = {
    "Assam": [
        "Barpeta", "Dhemaji", "Kamrup",
        "Nagaon", "Nalbari", "Sonitpur",
    ],
    "Karnataka": [
        "Bangalore", "Mysore", "Udupi",
        "Dharwad", "Belgaum",
    ],
    "Maharashtra": [
        "Mumbai", "Pune", "Nagpur",
        "Nashik", "Aurangabad",
    ],
}

# Generate station code
def _abbreviate_name(name):
    """Return an upper-case short form of ``name``.

    The short form is the first three characters of the first word followed
    by the first character of every later word, so single-word names keep the
    plain three-letter prefix while multi-word names that share a first word
    remain distinguishable. An empty or whitespace-only name yields "".
    """
    words = name.split()
    if not words:
        return ""
    head, rest = words[0], words[1:]
    return (head[:3] + "".join(word[0] for word in rest)).upper()


def generate_station_code(state, district):
    """Build a station identifier of the form ``<STATE>_<DISTRICT>``.

    Args:
        state: State name, e.g. "Assam".
        district: District name, e.g. "Barpeta".

    Returns:
        The abbreviated state and district joined by an underscore,
        e.g. "ASS_BAR". Single-word names abbreviate to their first three
        letters; multi-word names additionally carry the initial of each
        later word so that shared prefixes do not collide.
    """
    return f"{_abbreviate_name(state)}_{_abbreviate_name(district)}"

# Daily timestamps covering calendar year 2024, both endpoints included
start_date, end_date = datetime(2024, 1, 1), datetime(2024, 12, 31)
date_range = [
    start_date + timedelta(days=offset)
    for offset in range((end_date - start_date).days + 1)
]

# Determine season for given month
def get_season(month):
    """Map a month number to a season name.

    Args:
        month: Month number; callers in this notebook pass ``date.month`` (1-12).

    Returns:
        "winter" for months 1, 2, 11, 12; "summer" for months 3, 4, 5;
        "monsoon" for every other value (including anything outside 1-12).
    """
    winter_months = (1, 2, 11, 12)
    summer_months = (3, 4, 5)

    if month in winter_months:
        return "winter"
    if month in summer_months:
        return "summer"
    # Everything else (June - October in normal use) is treated as monsoon
    return "monsoon"

# Generate realistic data
def generate_realistic_data(state, district):
    """Simulate one record per day of the module-level ``date_range`` for one district.

    Rainfall bounds come from ``SEASONAL_RAINFALL`` for the day's season; the
    groundwater value is carried from day to day and nudged by that day's
    rainfall; temperature and river level are drawn independently each day.

    Args:
        state: State name stored in each record and used for the station code.
        district: District name stored in each record and used for the station code.

    Returns:
        A list of dicts with keys "Date", "StationCode", "State", "District",
        "Rainfall (mm)", "Groundwater Level (m)", "Temperature (°C)",
        "River Water Level (m)" and "Scarcity" (1 or 0).
    """
    code = generate_station_code(state, district)
    # Starting groundwater value; it is updated in place as the days go by
    level = round(np.random.uniform(8, 20), 2)
    rows = []

    for day in date_range:
        season = get_season(day.month)

        # 70% of days get rain drawn from the season's bounds; the rest get 0
        if random.random() < 0.7:
            low, high = SEASONAL_RAINFALL[season]
            rain_mm = round(np.random.uniform(low, high), 1)
        else:
            rain_mm = 0

        temp_c = round(np.random.uniform(10, 45), 1)
        river_m = round(np.random.uniform(1, 15), 2)

        # Heavy rain (> 50) lowers the stored value, floored at 5; a dry day
        # (< 10) raises it, capped at 20; anything in between leaves it alone.
        # NOTE(review): scarcity below requires a value > 15, so this figure
        # appears to behave like depth to the water table — confirm.
        if rain_mm > 50:
            level = max(5, level - np.random.uniform(0.5, 1.5))
        elif rain_mm < 10:
            level = min(20, level + np.random.uniform(0.2, 1.0))
        level = round(level, 2)

        # A day is flagged scarce only when all three indicators agree
        is_scarce = rain_mm < 10 and level > 15 and river_m < 5

        rows.append({
            "Date": day.strftime("%Y-%m-%d"),
            "StationCode": code,
            "State": state,
            "District": district,
            "Rainfall (mm)": rain_mm,
            "Groundwater Level (m)": level,
            "Temperature (°C)": temp_c,
            "River Water Level (m)": river_m,
            "Scarcity": 1 if is_scarce else 0,
        })

    return rows

# Seed both random sources used by generate_realistic_data (the stdlib `random`
# module and NumPy's global RNG) so a fresh Restart & Run All writes the same CSV.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Generate data for selected states and districts
all_data = []
for state, districts in states_districts.items():
    for district in districts:
        all_data.extend(generate_realistic_data(state, district))

# Convert to DataFrame
df = pd.DataFrame(all_data)

# Per district, count the calendar months that contain at least one scarcity day
df["Month"] = pd.to_datetime(df["Date"]).dt.month
df_scarcity = df.groupby(["State", "District", "Month"])["Scarcity"].sum().reset_index()
df_scarcity = df_scarcity[df_scarcity["Scarcity"] > 0]
df_scarcity_count = df_scarcity.groupby(["State", "District"]).size().reset_index(name="Scarcity Months")

# Merge scarcity count into main dataset. Districts with no scarcity month are
# missing from the count table and come back as NaN after the left join, so
# fill only that column; a frame-wide fillna would also zero NaNs elsewhere.
df = df.merge(df_scarcity_count, on=["State", "District"], how="left")
df["Scarcity Months"] = df["Scarcity Months"].fillna(0).astype(int)

# Save to CSV
df.to_csv("water_scarcity_2024.csv", index=False)

print("Data saved as water_scarcity_2024.csv with improved scarcity prediction.")


Data saved as water_scarcity_2024.csv with improved scarcity prediction.


In [20]:
# Columns to leave out of the exported copy; any that are absent are skipped
unwanted_columns = ['Month', 'Scarcity Months']

# Reload the saved dataset and drop the unwanted columns in one pass
data = pd.read_csv('water_scarcity_2024.csv').drop(columns=unwanted_columns, errors="ignore")

data.to_csv("water_scarcity.csv", index=False)