# Data Collection

In this notebook we collect data to append to the data to try and supplement the data from the dataset

## Import libraries


In [49]:
# Set configuration for notebook
import os

os.chdir('c:\\Users\\Spectra\\flood-prediction')
os.getcwd()

'c:\\Users\\Spectra\\flood-prediction'

In [50]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry
import pandas as pd
from src.utils import load_config

In [4]:
config = load_config()

## Collect data


In [5]:
# Read data
flood_df = pd.read_csv(config["data"]["raw_data_path"])

In [25]:
flood_df

Unnamed: 0,Sl,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period,Flood?
0,0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.70,90.36,4,1949.01,
1,1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,536809.8,510151.9,22.70,90.36,4,1949.02,
2,2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,536809.8,510151.9,22.70,90.36,4,1949.03,
3,3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,536809.8,510151.9,22.70,90.36,4,1949.04,
4,4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,536809.8,510151.9,22.70,90.36,4,1949.05,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,734765.4,308914.1,20.87,92.26,4,2013.08,1.0
20540,20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,734765.4,308914.1,20.87,92.26,4,2013.09,
20541,20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,734765.4,308914.1,20.87,92.26,4,2013.10,
20542,20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,734765.4,308914.1,20.87,92.26,4,2013.11,


In [28]:
# Filter to include only unique station names and their corresponding latitude and longitude
station_info = flood_df[['Station_Names', 'LATITUDE', 'LONGITUDE']].drop_duplicates()

station_info

Unnamed: 0,Station_Names,LATITUDE,LONGITUDE
0,Barisal,22.7,90.36
780,Bhola,22.7,90.66
1356,Bogra,24.88,89.36
2148,Chandpur,23.26,90.67
2748,Chittagong (City-Ambagan),22.35,91.8166
2796,Chittagong (IAP-Patenga),22.34,91.79
3576,Comilla,23.48,91.19
4368,Cox's Bazar,21.46,91.98
5160,Dhaka,23.78,90.39
5892,Dinajpur,25.63,88.66


In [29]:
longitude_and_latitude = list(zip(station_info["LATITUDE"].values, station_info["LONGITUDE"].values))

In [30]:
longitude_and_latitude

[(22.7, 90.36),
 (22.7, 90.66),
 (24.88, 89.36),
 (23.26, 90.67),
 (22.35, 91.8166),
 (22.34, 91.79),
 (23.48, 91.19),
 (21.46, 91.98),
 (23.78, 90.39),
 (25.63, 88.66),
 (23.61, 89.84),
 (23.01, 91.37),
 (22.29, 91.13),
 (24.12, 89.04),
 (23.17, 89.22),
 (21.98, 90.22),
 (22.8, 89.58),
 (21.83, 91.84),
 (23.17, 90.18),
 (22.83, 91.08),
 (22.43, 89.66),
 (24.75, 90.41),
 (22.36, 90.34),
 (24.35, 88.56),
 (22.67, 92.2),
 (25.72, 89.26),
 (22.5, 91.46),
 (22.68, 89.07),
 (22.64, 91.64),
 (24.29, 91.73),
 (24.88, 91.93),
 (24.15, 89.55),
 (20.87, 92.26)]

**Collect River Discharge Data**

In [None]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)


url = "https://flood-api.open-meteo.com/v1/flood"
params = {
	"latitude": 22.7,
	"longitude": 90.36,
	"daily": "river_discharge",
	"start_date": "2016-01-01",
	"end_date": "2024-05-13"
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_river_discharge = daily.Variables(0).ValuesAsNumpy()

daily_data = {"date": pd.date_range(
	start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
	end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
	freq = pd.Timedelta(seconds = daily.Interval()),
	inclusive = "left"
)}
daily_data["river_discharge"] = daily_river_discharge

daily_dataframe = pd.DataFrame(data = daily_data)
print(daily_dataframe)

In [39]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
import time

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Define function to make API call with retry mechanism
def make_api_call(url, params):
    while True:
        try:
            return openmeteo.weather_api(url, params=params)
        except:
            time.sleep(60)  # Wait for one minute before retrying
             

# Make API call to retrieve daily discharge data for each location
def get_daily_discharge(latitude, longitude, start_date, end_date):
    url = "https://flood-api.open-meteo.com/v1/flood"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "daily": "river_discharge",
        "start_date": start_date,
        "end_date": end_date
    }
    responses = make_api_call(url, params=params)
    return responses

# Define function to calculate average monthly discharge
def calculate_monthly_discharge(response):
    daily = response.Daily()
    daily_river_discharge = daily.Variables(0).ValuesAsNumpy()
    daily_data = {
        "date": pd.date_range(
            start=pd.to_datetime(daily.Time(), unit="s", utc=True),
            end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=daily.Interval()),
            inclusive="left"
        )
    }
    daily_data["river_discharge"] = daily_river_discharge
    daily_dataframe = pd.DataFrame(data=daily_data)

    # Calculate average monthly discharge
    monthly_discharge = daily_dataframe.resample('M', on='date').mean()
    return monthly_discharge


       




In [51]:
# Iterate over each location in your dataset
# Define an empty list to store discharge data for all locations
all_monthly_discharge = []

# Iterate over each station in the station_info DataFrame
for index, row in station_info.iterrows():
    latitude = row['LATITUDE']
    longitude = row['LONGITUDE']
    start_date = "1985-01-01"
    end_date = "2014-01-01"
    
    # Retrieve daily discharge data
    responses = get_daily_discharge(latitude, longitude, start_date, end_date)

    # Process each response
    for response in responses:
        # Calculate average monthly discharge
        monthly_discharge = calculate_monthly_discharge(response)
        
        # Add latitude and longitude columns to the monthly discharge DataFrame
        monthly_discharge['LATITUDE'] = latitude
        monthly_discharge['LONGITUDE'] = longitude
        
        # Append the monthly discharge data for this location to the list
        all_monthly_discharge.append(monthly_discharge)

# Concatenate all discharge data into a single DataFrame
all_monthly_discharge_df = pd.concat(all_monthly_discharge, ignore_index=True)

# Print the combined DataFrame
print(all_monthly_discharge_df)


  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge 

       river_discharge  LATITUDE  LONGITUDE
0             0.217383     22.70      90.36
1             0.186330     22.70      90.36
2             0.261097     22.70      90.36
3             0.240555     22.70      90.36
4             0.846616     22.70      90.36
...                ...       ...        ...
11512         3.317700     20.87      92.26
11513         2.120073     20.87      92.26
11514         0.810497     20.87      92.26
11515         0.285147     20.87      92.26
11516         0.250259     20.87      92.26

[11517 rows x 3 columns]


  monthly_discharge = daily_dataframe.resample('M', on='date').mean()


In [54]:
len(all_monthly_discharge)

33

In [57]:
concatenated_river_discharge_df = pd.concat([df.reset_index() for df in all_monthly_discharge], ignore_index=True)

In [60]:
merged_df = pd.merge(concatenated_river_discharge_df, station_info, on=['LATITUDE', 'LONGITUDE'], how='inner')
merged_df

Unnamed: 0,date,river_discharge,LATITUDE,LONGITUDE,Station_Names
0,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36,Barisal
1,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36,Barisal
2,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36,Barisal
3,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36,Barisal
4,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36,Barisal
...,...,...,...,...,...
11512,2013-09-30 00:00:00+00:00,3.317700,20.87,92.26,Teknaf
11513,2013-10-31 00:00:00+00:00,2.120073,20.87,92.26,Teknaf
11514,2013-11-30 00:00:00+00:00,0.810497,20.87,92.26,Teknaf
11515,2013-12-31 00:00:00+00:00,0.285147,20.87,92.26,Teknaf


In [61]:
merged_df.to_csv("data/raw/river_discharge.csv", index=False)