# Data Collection

In this notebook we collect data to append to the data to try and supplement the data from the dataset

## Import libraries


In [1]:
# Set configuration for notebook
import os

os.chdir("c:\\Users\\Spectra\\flood-prediction")
os.getcwd()

'c:\\Users\\Spectra\\flood-prediction'

In [2]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry
import pandas as pd
from src.utils import load_config

In [3]:
config = load_config()

## Collect data


In [4]:
# Read data
flood_df = pd.read_csv(config["data"]["raw_data_path"])

In [8]:
flood_df

Unnamed: 0,Sl,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period,Flood?
0,0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.70,90.36,4,1949.01,
1,1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,536809.8,510151.9,22.70,90.36,4,1949.02,
2,2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,536809.8,510151.9,22.70,90.36,4,1949.03,
3,3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,536809.8,510151.9,22.70,90.36,4,1949.04,
4,4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,536809.8,510151.9,22.70,90.36,4,1949.05,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,734765.4,308914.1,20.87,92.26,4,2013.08,1.0
20540,20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,734765.4,308914.1,20.87,92.26,4,2013.09,
20541,20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,734765.4,308914.1,20.87,92.26,4,2013.10,
20542,20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,734765.4,308914.1,20.87,92.26,4,2013.11,


In [5]:
# Filter to include only unique station names and their corresponding latitude and longitude
station_info = flood_df[["Station_Names", "LATITUDE", "LONGITUDE"]].drop_duplicates()

station_info

Unnamed: 0,Station_Names,LATITUDE,LONGITUDE
0,Barisal,22.7,90.36
780,Bhola,22.7,90.66
1356,Bogra,24.88,89.36
2148,Chandpur,23.26,90.67
2748,Chittagong (City-Ambagan),22.35,91.8166
2796,Chittagong (IAP-Patenga),22.34,91.79
3576,Comilla,23.48,91.19
4368,Cox's Bazar,21.46,91.98
5160,Dhaka,23.78,90.39
5892,Dinajpur,25.63,88.66


In [10]:
longitude_and_latitude = list(
    zip(station_info["LATITUDE"].values, station_info["LONGITUDE"].values)
)

In [11]:
longitude_and_latitude

[(22.7, 90.36),
 (22.7, 90.66),
 (24.88, 89.36),
 (23.26, 90.67),
 (22.35, 91.8166),
 (22.34, 91.79),
 (23.48, 91.19),
 (21.46, 91.98),
 (23.78, 90.39),
 (25.63, 88.66),
 (23.61, 89.84),
 (23.01, 91.37),
 (22.29, 91.13),
 (24.12, 89.04),
 (23.17, 89.22),
 (21.98, 90.22),
 (22.8, 89.58),
 (21.83, 91.84),
 (23.17, 90.18),
 (22.83, 91.08),
 (22.43, 89.66),
 (24.75, 90.41),
 (22.36, 90.34),
 (24.35, 88.56),
 (22.67, 92.2),
 (25.72, 89.26),
 (22.5, 91.46),
 (22.68, 89.07),
 (22.64, 91.64),
 (24.29, 91.73),
 (24.88, 91.93),
 (24.15, 89.55),
 (20.87, 92.26)]

**Collect River Discharge Data**

In [None]:
import openmeteo_requests

import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)


url = "https://flood-api.open-meteo.com/v1/flood"
params = {
    "latitude": 22.7,
    "longitude": 90.36,
    "daily": "river_discharge",
    "start_date": "2016-01-01",
    "end_date": "2024-05-13",
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")

# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_river_discharge = daily.Variables(0).ValuesAsNumpy()

daily_data = {
    "date": pd.date_range(
        start=pd.to_datetime(daily.Time(), unit="s", utc=True),
        end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
        freq=pd.Timedelta(seconds=daily.Interval()),
        inclusive="left",
    )
}
daily_data["river_discharge"] = daily_river_discharge

daily_dataframe = pd.DataFrame(data=daily_data)
print(daily_dataframe)

In [10]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
import time

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession(".cache", expire_after=3600)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

# Define function to make API call with retry mechanism
def make_api_call(url, params):
    while True:
        try:
            return openmeteo.weather_api(url, params=params)
        except:
            time.sleep(60)  # Wait for one minute before retrying


# Make API call to retrieve daily discharge data for each location
def get_daily_discharge(latitude, longitude, start_date, end_date):
    url = "https://flood-api.open-meteo.com/v1/flood"
    params = {
        "latitude": latitude,
        "longitude": longitude,
        "daily": "river_discharge",
        "start_date": start_date,
        "end_date": end_date,
    }
    responses = make_api_call(url, params=params)
    return responses


# Define function to calculate average monthly discharge
def calculate_monthly_discharge(response):
    daily = response.Daily()
    daily_river_discharge = daily.Variables(0).ValuesAsNumpy()
    daily_data = {
        "date": pd.date_range(
            start=pd.to_datetime(daily.Time(), unit="s", utc=True),
            end=pd.to_datetime(daily.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=daily.Interval()),
            inclusive="left",
        )
    }
    daily_data["river_discharge"] = daily_river_discharge
    daily_dataframe = pd.DataFrame(data=daily_data)

    # Calculate average monthly discharge
    monthly_discharge = daily_dataframe.resample("M", on="date").mean()
    return monthly_discharge

In [12]:
# Iterate over each location in your dataset
# Define an empty list to store discharge data for all locations
all_monthly_discharge = []

# Iterate over each station in the station_info DataFrame
for index, row in station_info.iterrows():
    latitude = row["LATITUDE"]
    longitude = row["LONGITUDE"]
    start_date = "1985-01-01"
    end_date = "2024-01-01"

    # Retrieve daily discharge data
    responses = get_daily_discharge(latitude, longitude, start_date, end_date)

    # Process each response
    for response in responses:
        # Calculate average monthly discharge
        monthly_discharge = calculate_monthly_discharge(response)

        # Add latitude and longitude columns to the monthly discharge DataFrame
        monthly_discharge["LATITUDE"] = latitude
        monthly_discharge["LONGITUDE"] = longitude

        # Append the monthly discharge data for this location to the list
        all_monthly_discharge.append(monthly_discharge)

# Concatenate all discharge data into a single DataFrame
all_monthly_discharge_df = pd.concat(all_monthly_discharge, ignore_index=True)

# Print the combined DataFrame
print(all_monthly_discharge_df)

  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge = daily_dataframe.resample('M', on='date').mean()
  monthly_discharge 

       river_discharge  LATITUDE  LONGITUDE
0             0.217383     22.70      90.36
1             0.186330     22.70      90.36
2             0.261097     22.70      90.36
3             0.240555     22.70      90.36
4             0.846616     22.70      90.36
...                ...       ...        ...
15472         3.545088     20.87      92.26
15473         2.128332     20.87      92.26
15474         0.940908     20.87      92.26
15475         0.523045     20.87      92.26
15476         0.288249     20.87      92.26

[15477 rows x 3 columns]


  monthly_discharge = daily_dataframe.resample('M', on='date').mean()


In [13]:
len(all_monthly_discharge)

33

In [14]:
concatenated_river_discharge_df = pd.concat(
    [df.reset_index() for df in all_monthly_discharge], ignore_index=True
)

In [15]:
concatenated_river_discharge_df

Unnamed: 0,date,river_discharge,LATITUDE,LONGITUDE
0,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36
1,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36
2,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36
3,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36
4,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36
...,...,...,...,...
15472,2023-09-30 00:00:00+00:00,3.545088,20.87,92.26
15473,2023-10-31 00:00:00+00:00,2.128332,20.87,92.26
15474,2023-11-30 00:00:00+00:00,0.940908,20.87,92.26
15475,2023-12-31 00:00:00+00:00,0.523045,20.87,92.26


In [16]:
merged_df = pd.merge(
    concatenated_river_discharge_df,
    station_info,
    on=["LATITUDE", "LONGITUDE"],
    how="inner",
)
merged_df

Unnamed: 0,date,river_discharge,LATITUDE,LONGITUDE,Station_Names
0,1985-01-31 00:00:00+00:00,0.217383,22.70,90.36,Barisal
1,1985-02-28 00:00:00+00:00,0.186330,22.70,90.36,Barisal
2,1985-03-31 00:00:00+00:00,0.261097,22.70,90.36,Barisal
3,1985-04-30 00:00:00+00:00,0.240555,22.70,90.36,Barisal
4,1985-05-31 00:00:00+00:00,0.846616,22.70,90.36,Barisal
...,...,...,...,...,...
15472,2023-09-30 00:00:00+00:00,3.545088,20.87,92.26,Teknaf
15473,2023-10-31 00:00:00+00:00,2.128332,20.87,92.26,Teknaf
15474,2023-11-30 00:00:00+00:00,0.940908,20.87,92.26,Teknaf
15475,2023-12-31 00:00:00+00:00,0.523045,20.87,92.26,Teknaf


In [61]:
merged_df.to_csv("data/raw/river_discharge.csv", index=False)

**Collect Rest of the daily data**

In [6]:
locations = []
for index, row in station_info.iterrows():
    latitude = row["LATITUDE"]
    longitude = row["LONGITUDE"]
    location_dict = {"latitude": latitude, "longitude": longitude}
    locations.append(location_dict)

In [7]:
locations

[{'latitude': 22.7, 'longitude': 90.36},
 {'latitude': 22.7, 'longitude': 90.66},
 {'latitude': 24.88, 'longitude': 89.36},
 {'latitude': 23.26, 'longitude': 90.67},
 {'latitude': 22.35, 'longitude': 91.8166},
 {'latitude': 22.34, 'longitude': 91.79},
 {'latitude': 23.48, 'longitude': 91.19},
 {'latitude': 21.46, 'longitude': 91.98},
 {'latitude': 23.78, 'longitude': 90.39},
 {'latitude': 25.63, 'longitude': 88.66},
 {'latitude': 23.61, 'longitude': 89.84},
 {'latitude': 23.01, 'longitude': 91.37},
 {'latitude': 22.29, 'longitude': 91.13},
 {'latitude': 24.12, 'longitude': 89.04},
 {'latitude': 23.17, 'longitude': 89.22},
 {'latitude': 21.98, 'longitude': 90.22},
 {'latitude': 22.8, 'longitude': 89.58},
 {'latitude': 21.83, 'longitude': 91.84},
 {'latitude': 23.17, 'longitude': 90.18},
 {'latitude': 22.83, 'longitude': 91.08},
 {'latitude': 22.43, 'longitude': 89.66},
 {'latitude': 24.75, 'longitude': 90.41},
 {'latitude': 22.36, 'longitude': 90.34},
 {'latitude': 24.35, 'longitude': 8

In [19]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession(".cache", expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)


# Function to fetch and process weather data for a given location
def fetch_weather_data(location):
    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": location["latitude"],
        "longitude": location["longitude"],
        "start_date": "1950-01-01",
        "end_date": "2024-01-01",
        "hourly": ["temperature_2m", "relative_humidity_2m", "rain", "cloud_cover"],
        "timezone": "auto",
    }

    responses = make_api_call(url, params=params)
    response = responses[0]

    # Process hourly data
    hourly = response.Hourly()
    hourly_temperature_2m = hourly.Variables(0).ValuesAsNumpy()
    hourly_relative_humidity_2m = hourly.Variables(1).ValuesAsNumpy()
    hourly_rain = hourly.Variables(2).ValuesAsNumpy()
    hourly_cloud_cover = hourly.Variables(3).ValuesAsNumpy()

    hourly_data = {
        "date": pd.date_range(
            start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
            end=pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True),
            freq=pd.Timedelta(seconds=hourly.Interval()),
            inclusive="left",
        )
    }
    hourly_data["temperature_2m"] = hourly_temperature_2m
    hourly_data["relative_humidity_2m"] = hourly_relative_humidity_2m
    hourly_data["rain"] = hourly_rain
    hourly_data["cloud_cover"] = hourly_cloud_cover

    hourly_dataframe = pd.DataFrame(data=hourly_data)
    return hourly_dataframe


# Function to summarize hourly data into monthly data
def summarize_monthly(dataframe):
    monthly_summary = (
        dataframe.resample("M", on="date")
        .agg(
            {
                "temperature_2m": "mean",
                "relative_humidity_2m": "mean",
                "rain": "sum",
                "cloud_cover": "mean",
            }
        )
        .reset_index()
    )
    return monthly_summary

In [20]:
# Initialize an empty list to store the results
all_monthly_summaries = []

# Fetch and process data for each location
for location in locations:
    hourly_df = fetch_weather_data(location)
    monthly_summary = summarize_monthly(hourly_df)
    monthly_summary["latitude"] = location["latitude"]
    monthly_summary["longitude"] = location["longitude"]
    all_monthly_summaries.append(monthly_summary)

# Combine all monthly summaries into a single DataFrame
combined_monthly_summary = pd.concat(all_monthly_summaries, ignore_index=True)

# Print the combined monthly summary
print(combined_monthly_summary)

# Optionally, save to a CSV file
combined_monthly_summary.to_csv("combined_monthly_weather_summary.csv", index=False)

  dataframe.resample("M", on="date")
  dataframe.resample("M", on="date")
  dataframe.resample("M", on="date")


In [13]:
flood_df

Unnamed: 0,Sl,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period,Flood?
0,0,Barisal,1949,1,29.4,12.3,0.0,68.0,0.453704,0.6,7.831915,41950,536809.8,510151.9,22.70,90.36,4,1949.01,
1,1,Barisal,1949,2,33.9,15.2,9.0,63.0,0.659259,0.9,8.314894,41950,536809.8,510151.9,22.70,90.36,4,1949.02,
2,2,Barisal,1949,3,36.7,20.2,8.0,59.0,1.085185,1.5,8.131915,41950,536809.8,510151.9,22.70,90.36,4,1949.03,
3,3,Barisal,1949,4,33.9,23.9,140.0,71.0,1.772222,3.9,8.219149,41950,536809.8,510151.9,22.70,90.36,4,1949.04,
4,4,Barisal,1949,5,35.6,25.0,217.0,76.0,1.703704,4.1,7.046809,41950,536809.8,510151.9,22.70,90.36,4,1949.05,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20539,20539,Teknaf,2013,8,32.5,25.4,851.0,88.0,2.500000,6.2,3.800000,41998,734765.4,308914.1,20.87,92.26,4,2013.08,1.0
20540,20540,Teknaf,2013,9,32.6,25.7,329.0,88.0,3.000000,6.1,4.200000,41998,734765.4,308914.1,20.87,92.26,4,2013.09,
20541,20541,Teknaf,2013,10,33.1,24.8,271.0,85.0,2.500000,4.4,5.600000,41998,734765.4,308914.1,20.87,92.26,4,2013.10,
20542,20542,Teknaf,2013,11,32.5,20.0,0.0,79.0,2.100000,1.7,8.400000,41998,734765.4,308914.1,20.87,92.26,4,2013.11,


In [14]:
flood_df.loc[(flood_df["Station_Names"] == "Barisal") & (flood_df["Year"] == 1985)]

Unnamed: 0,Sl,Station_Names,Year,Month,Max_Temp,Min_Temp,Rainfall,Relative_Humidity,Wind_Speed,Cloud_Coverage,Bright_Sunshine,Station_Number,X_COR,Y_COR,LATITUDE,LONGITUDE,ALT,Period,Flood?
432,432,Barisal,1985,1,30.2,12.6,1.0,84.0,0.2,1.4,7.5,41950,536809.8,510151.9,22.7,90.36,4,1985.01,
433,433,Barisal,1985,2,31.4,14.3,2.0,77.0,0.6,1.1,8.6,41950,536809.8,510151.9,22.7,90.36,4,1985.02,
434,434,Barisal,1985,3,36.7,22.4,45.0,81.0,1.8,3.6,7.5,41950,536809.8,510151.9,22.7,90.36,4,1985.03,
435,435,Barisal,1985,4,36.6,24.6,64.0,81.0,1.9,4.7,8.4,41950,536809.8,510151.9,22.7,90.36,4,1985.04,
436,436,Barisal,1985,5,35.0,24.2,226.0,88.0,1.7,5.1,7.9,41950,536809.8,510151.9,22.7,90.36,4,1985.05,1.0
437,437,Barisal,1985,6,36.2,25.8,322.0,91.0,1.5,6.3,3.9,41950,536809.8,510151.9,22.7,90.36,4,1985.06,1.0
438,438,Barisal,1985,7,34.5,25.3,291.0,93.0,1.5,6.8,3.3,41950,536809.8,510151.9,22.7,90.36,4,1985.07,1.0
439,439,Barisal,1985,8,35.0,25.9,308.0,90.0,1.5,6.7,5.2,41950,536809.8,510151.9,22.7,90.36,4,1985.08,1.0
440,440,Barisal,1985,9,34.6,25.4,180.0,90.0,1.0,5.9,5.1,41950,536809.8,510151.9,22.7,90.36,4,1985.09,
441,441,Barisal,1985,10,34.5,22.8,198.0,86.0,0.5,2.5,8.6,41950,536809.8,510151.9,22.7,90.36,4,1985.1,


In [21]:
combined_monthly_summary

Unnamed: 0,date,temperature_2m,relative_humidity_2m,rain,cloud_cover,latitude,longitude
0,1984-12-31 00:00:00+00:00,14.713833,87.341438,0.000000,23.800001,22.70,90.36
1,1985-01-31 00:00:00+00:00,20.220150,71.012703,8.599999,20.722178,22.70,90.36
2,1985-02-28 00:00:00+00:00,22.590544,64.674034,10.000000,12.533036,22.70,90.36
3,1985-03-31 00:00:00+00:00,27.412891,73.686783,54.200001,28.681452,22.70,90.36
4,1985-04-30 00:00:00+00:00,29.095428,76.044502,83.800003,28.859304,22.70,90.36
...,...,...,...,...,...,...,...
15505,2023-09-30 00:00:00+00:00,26.947071,91.703636,400.000000,69.232224,20.87,92.26
15506,2023-10-31 00:00:00+00:00,26.805334,86.059883,156.300003,39.095295,20.87,92.26
15507,2023-11-30 00:00:00+00:00,25.613804,81.703674,154.800003,32.358612,20.87,92.26
15508,2023-12-31 00:00:00+00:00,22.740011,83.418503,55.100002,28.840324,20.87,92.26
