In [1]:
import os
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import shutil

options = webdriver.ChromeOptions()

download_dir = os.path.join(os.path.expanduser("~"), "Downloads")

prefs = {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
}
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(options=options)

driver.get('https://data.desagri.gov.in/weblus/classification-of-area-report-web')

states = ['Andhra Pradesh', 'Arunachal Pradesh', 'Assam', 'Bihar', 'Chhattisgarh', 'Goa',
          'Gujarat', 'Haryana', 'Himachal Pradesh', 'Jharkhand', 'Karnataka', 'Kerala',
          'Madhya Pradesh', 'Maharashtra', 'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland',
          'Odisha', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu', 'Telangana', 'Tripura',
          'Uttar Pradesh', 'Uttarakhand', 'West Bengal']

ut = ['Andaman and Nicobar Islands','Chhattisgarh','Dadra and Nagar Haveli','Daman and Diu',
      'Delhi','Jammu and Kashmir','Ladakh','Lakshadweep','Puducherry','Telangana' 
      ,'The Dadra &amp','Nagar Haveli and Daman and Diu']
years = [f'{y}-{y+1}' for y in range(2020, 2021)]

def wait_for_download(download_dir, timeout=30):
    """Wait for a new file to appear in the download directory"""
    initial_files = set(os.listdir(download_dir))
    end_time = time.time() + timeout
    while time.time() < end_time:
        current_files = set(os.listdir(download_dir))
        new_files = current_files - initial_files
        if new_files:
            return new_files.pop()
        time.sleep(1)
    raise Exception("File download timed out")

for year in years:
    final_download_dir = f'data/luc/{year}'
    if not os.path.exists(final_download_dir):
        os.makedirs(final_download_dir)
    year_dropdown = Select(WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.ID, 'fltryear'))
    ))
    year_dropdown.select_by_visible_text(year)
    
    for state in ut:
        try:
            state_dropdown = Select(WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.ID, 'fltrstates'))
            ))
            state_dropdown.select_by_visible_text(state)
            
            excel_button = Select(WebDriverWait(driver, 30).until(
                EC.element_to_be_clickable((By.ID, 'fltrrptformat'))
            ))
            excel_button.select_by_visible_text('Excel')

            download_button = WebDriverWait(driver, 30).until(
                EC.element_to_be_clickable((By.ID, 'btn_report'))
            )
            download_button.click()

            downloaded_file = wait_for_download(download_dir)

            xls_filepath = os.path.join(download_dir, downloaded_file)
            
            with open(xls_filepath, 'r', encoding='utf-8') as file:
                content = file.read()
                if content.strip().startswith('<table'):
                    df = pd.read_html(xls_filepath)[0]
                else:
                    df = pd.read_excel(xls_filepath, engine='xlrd')

            new_filename = f"{state}_{year}.csv"
            new_filepath = os.path.join(final_download_dir, new_filename)
            
            df.to_csv(new_filepath, index=False)
        
        except Exception as e:
            print(f"An error occurred for {state} in {year}: {e}")
        
    driver.refresh()

driver.quit()


ModuleNotFoundError: No module named 'selenium'

In [None]:
import requests
import os
import pandas as pd

# The URL you provided (with format set to JSON)


# Set the output folder
output_folder = "data/rainfall"
os.makedirs(output_folder, exist_ok=True)
limit = 1000
offset = 0
batch_number = 1

# Send the GET request
while True:
    url = f"https://api.data.gov.in/catalog/a6007b2f-eed3-4a68-a321-d2d563d52bb2?api-key=579b464db66ec23bdd0000010aad5f82a828450e566cef4cde8c196e&format=json&offset={offset}&limit={limit}"
    response = requests.get(url)

    if response.status_code == 200:
        try:
            # Parse the JSON response
            data = response.json()  # Parse the response as JSON
            records = data.get("records", [])  # Extract the "records" field

            # Check if there are records
            if not records:
                print("No data found in the response.")
            else:
                # Convert to pandas DataFrame
                df = pd.DataFrame(records)

                # Save the DataFrame to CSV
                file_path = os.path.join(output_folder, f"{batch_number}.csv")
                df.to_csv(file_path, index=False)
                print(f"Data saved to {file_path}")
            offset += limit
            batch_number += 1
        
        except ValueError:
            print("Failed to parse JSON. Here is the response text:")
            print(response.text)  # Print the raw response for debugging
            break
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        break


In [None]:
import requests
import os

# Replace with your actual API key
api_key = "579b464db66ec23bdd0000010aad5f82a828450e566cef4cde8c196e"
base_url = "https://api.data.gov.in/resource/a6007b2f-eed3-4a68-a321-d2d563d52bb2"

limit = 100000
offset = 0
batch_number = 1
output_folder = "data/rainfall_csv"

# Create the output directory if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

while True:
    params = {
        "api-key": api_key,
        "format": "csv",
        "limit": limit,
        "offset": offset
    }

    # Fetch data in CSV format
    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        try:
            file_path = os.path.join(output_folder, f"rainfall_data_batch_{batch_number}.csv")
            with open(file_path, "wb") as file:
                file.write(response.content)
            print(f"Data saved to {file_path}")

            offset += limit
            batch_number += 1
        except Exception as e:
            print(f"Failed to write CSV data: {e}")
            break
    else:
        print(f"Failed to fetch data. Status code: {response.status_code}")
        break


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Load the dataset
data = pd.read_csv('Water_Requirements.csv', index_col='Date', parse_dates=True)
target_column = 'Water_Level'  # Assuming 'Water_Level' is the target variable

# ARIMA Model
# Fit ARIMA to the entire dataset for linear trend prediction
arima_model = ARIMA(data[target_column], order=(5, 1, 0))  # (p,d,q) can be tuned
arima_fit = arima_model.fit()

# Forecast using ARIMA (in-sample)
arima_forecast = arima_fit.fittedvalues  # In-sample predictions
arima_residuals = data[target_column] - arima_forecast  # Residuals (actual - ARIMA predictions)

# Scale the residuals for RNN
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_residuals = scaler.fit_transform(arima_residuals.values.reshape(-1, 1))

# Prepare sequences for RNN
def create_sequences(data, seq_length):
    x, y = [], []
    for i in range(len(data) - seq_length):
        x.append(data[i:i + seq_length])
        y.append(data[i + seq_length])
    return np.array(x), np.array(y)

# Set sequence length (e.g., 30 time steps)
sequence_length = 10
x_train, y_train = create_sequences(scaled_residuals, sequence_length)

# Reshape for LSTM input (samples, time steps, features)
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))

# Build the RNN Model
rnn_model = Sequential()
rnn_model.add(LSTM(units=50, return_sequences=True, input_shape=(x_train.shape[1], 1)))
rnn_model.add(Dropout(0.3))
rnn_model.add(LSTM(units=50, return_sequences=False))
rnn_model.add(Dropout(0.6))
rnn_model.add(Dense(units=25))
rnn_model.add(Dense(units=1))  # Output one value (residual)

# Compile the RNN
rnn_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the RNN on ARIMA residuals
rnn_model.fit(x_train, y_train, epochs=500, batch_size=64)

# Make predictions on ARIMA residuals using the trained RNN
rnn_predictions = rnn_model.predict(x_train)
rnn_predictions = scaler.inverse_transform(rnn_predictions)  # Inverse transform to original scale

# Combine ARIMA and RNN predictions
final_predictions = arima_forecast[-len(rnn_predictions):] + rnn_predictions.flatten()

# Plot the results
plt.figure(figsize=(12, 6))
plt.plot(data.index[-len(final_predictions):], data[target_column][-len(final_predictions):], label='Actual')
plt.plot(data.index[-len(final_predictions):], final_predictions, label='Hybrid ARIMA-RNN Predictions')
plt.title('Actual vs Hybrid ARIMA-RNN Predictions')
plt.legend()
plt.show()

# Performance Evaluation for the in-sample predictions
mae = mean_absolute_error(data[target_column][-len(final_predictions):], final_predictions)
mse = mean_squared_error(data[target_column][-len(final_predictions):], final_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(data[target_column][-len(final_predictions):], final_predictions)

# Print performance metrics
print(f'In-Sample Mean Absolute Error (MAE): {mae}')
print(f'In-Sample Mean Squared Error (MSE): {mse}')
print(f'In-Sample Root Mean Squared Error (RMSE): {rmse}')
print(f'In-Sample R-squared (R²): {r2}')

# Function to forecast future values
def forecast_future_years(arima_model, rnn_model, years_ahead):
    # Forecast using ARIMA for future predictions
    future_steps = years_ahead * 12  # Assuming monthly predictions for 'x' years
    arima_future_forecast = arima_model.forecast(steps=future_steps)

    # Use the last available sequence from ARIMA residuals for RNN predictions
    last_sequence = scaled_residuals[-sequence_length:]  # Last sequence of residuals
    future_rnn_predictions = []

    for i in range(future_steps):
        # Make prediction using RNN
        next_rnn_prediction = rnn_model.predict(np.expand_dims(last_sequence, axis=0))[0]
        future_rnn_predictions.append(next_rnn_prediction)

        # Update the last sequence
        last_sequence = np.vstack([last_sequence[1:], next_rnn_prediction])

    future_rnn_predictions = scaler.inverse_transform(future_rnn_predictions)

    # Combine ARIMA and RNN predictions
    future_final_predictions = arima_future_forecast + future_rnn_predictions.flatten()

    # Create future date range
    last_date = data.index[-1]
    future_dates = pd.date_range(last_date, periods=future_steps + 1, freq='M')[1:]

    # Return the predictions as a DataFrame
    future_df = pd.DataFrame({
        'Date': future_dates,
        'Forecasted Water Levels': future_final_predictions
    }).set_index('Date')

    return future_df, rmse  # Return future predictions and RMSE

# User input for number of years ahead to forecast
years_ahead = int(input("Enter the number of years into the future you want to forecast: "))
future_predictions, future_rmse = forecast_future_years(arima_fit, rnn_model, years_ahead)

# Display the future predictions
print(f"Future predictions for the next {years_ahead} years:")
print(future_predictions)

# Print the RMSE for the future predictions
print(f"Future Forecast RMSE: {future_rmse}")

# Plot the future predictions
plt.figure(figsize=(12, 6))
plt.plot(future_predictions.index, future_predictions['Forecasted Water Levels'], label='Future Forecast')
plt.title(f'Forecasted Water Levels for the Next {years_ahead} Years')
plt.legend()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'Water_Requirements.csv'

In [13]:
import pandas as pd
df = pd.read_csv('data/usage.csv')
states = set(df['State'])
dicts = {state: i for i, state in enumerate(states)}
states = pd.DataFrame(list(dicts.items()),columns=['State', 'Index'])
states.to_csv('data/states.csv',index=False)
print(dicts)

{'Chandigarh': 0, 'Arunachal Pradesh': 1, 'Odisha': 2, 'Manipur': 3, 'Rajasthan': 4, 'Bihar': 5, 'Telangana': 6, 'Puducherry': 7, 'Lakshadweep': 8, 'Ladakh': 9, 'Kerala': 10, 'Andaman and Nicobar Islands': 11, 'Maharashtra': 12, 'Uttar Pradesh': 13, 'Mizoram': 14, 'Uttarakhand': 15, 'Andhra Pradesh': 16, 'Haryana': 17, 'Dadra and Nagar Haveli': 18, 'Himachal Pradesh': 19, 'Karnataka': 20, 'Jammu and Kashmir': 21, 'Chhattisgarh': 22, 'Meghalaya': 23, 'Delhi': 24, 'Tripura': 25, 'West Bengal': 26, 'Assam': 27, 'Madhya Pradesh': 28, 'Nagaland': 29, 'Goa': 30, 'Daman and Diu': 31, 'Jharkhand': 32, 'Sikkim': 33, 'Tamil Nadu': 34, 'Gujarat': 35, 'Punjab': 36}


In [22]:
import pandas as pd
df = pd.read_csv('data/usage.csv')
states_df = pd.read_csv('data/states.csv')
state_to_index = states_df.set_index('State')['Index'].to_dict()
df['State'] = df['State'].map(state_to_index)

df.to_csv('data/usage(1).csv', index=False)


In [24]:
import pandas as pd
df = pd.read_csv('data/luc.csv')
states_df = pd.read_csv('data/states.csv')

state_to_index = states_df.set_index('State')['Index'].to_dict()
df['State'] = df['State'].map(state_to_index)

df.to_csv('data/luc(1).csv', index=False)