In [None]:
# Mount Google Drive into the Colab runtime so the project files are reachable under /content/drive.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [20]:
import os

# Make the METEOSAT project folder the working directory, so the relative
# paths used by the loading cells ('201809/...', 'metadata.csv', ...) resolve.
PROJECT_DIR = "/content/drive/MyDrive/Thesis/METEOSAT"
os.chdir(PROJECT_DIR)

# Display the contents of the project folder (order is whatever the OS returns).
os.listdir(os.curdir)

['_notebooks',
 'weather_stations_combined_at_GMT.csv',
 'metadata.csv',
 '201806',
 '201807',
 '201808',
 '201809',
 'weather_stations',
 '201810',
 '201811',
 '201812',
 'description.pdf']

In [None]:
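# Each .npy file is a float32 tensor of shape 4 x 61 x 32 x 32 x 10. The dimensions are, in order:
# - 4 images within the hour; for '201809/2018090100-4.npy' (September 1 2018) they were taken at 00:12, 00:37, 00:42, 00:57 (TODO confirm 00:37: the other timestamps are 15 minutes apart, which suggests 00:27)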
# - 61 weather stations, ordered as in metadata.csv (on station identifier), after applying filters: 15 > latitude > 6, longitude > -16
# - 32 image pixels horizontally (parallel to equator), from left to right (index 0 is left)
# - 32 image pixels vertically (orthogonal to equator), from top to bottom (index 0 is top)
# - 10 channels: 4 bands (WV_073, IR_087, IR_108, IR_120) and 2 RGB composites (day_microphysics, dust) with 3 values (R, G, B) each. Both composites are based on the IR_087, IR_108 and IR_120 channels; see https://resources.eumetrain.org/rgb_quick_guides/ for an explanation

# NB: for 201806 and part of 201807, the .npy files are named YYYYMMDDHH instead of YYYYMMDDHH-2/4. You can ignore the -2 and -4 at the end of the .npy files
# NB: sometimes METEOSAT's API returns duplicate images or no images. In those cases, no .npy file is created for the corresponding hour. Luckily, this happened very rarely
# NB: in both weather_stations_combined_at_GMT.csv and the .npy files, timestamp 02:00 corresponds to rainfall/images between 02:00 - 03:00
import numpy as np

# Load a single example file and display its shape as the cell output.
example_file = '201809/2018090100-4.npy'
tensor = np.load(example_file)
tensor.shape

(4, 61, 32, 32, 10)

In [19]:
# NB: make sure to extract the right columns, as currently all 70 weather stations are included. Furthermore, check which weather stations have no data whatsoever, or too little to include
import pandas as pd

# Read the rainfall table (one row per timestamp, one column per station) and
# discard the CSV's first column (presumably a saved row index -- confirm).
# `columns=` already tells `drop` which axis to use, so no extra `axis=1` is
# passed; pandas ignores `axis` whenever `columns=` is given.
df_rainfall = (
    pd.read_csv('weather_stations_combined_at_GMT.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
df_rainfall

Unnamed: 0,timestamp,TA00102,TA00103,TA00072,TA00082,TA00085,TA00088,TA00096,TA00136,TA00148,...,TA00634,TA00635,TA00652,TA00665,TA00666,TA00682,TA00684,TA00687,TA00710,TA00720
0,2018-07-01 00:00:00,,,,,,4.80,,0.170,0.000,...,,,,,,,,,,
1,2018-07-01 01:00:00,,,,,,4.85,,0.119,0.000,...,,,,,,,,,,
2,2018-07-01 02:00:00,,,,,,0.19,,0.017,0.017,...,,,,,,,,,,
3,2018-07-01 03:00:00,,,,,,0.00,,0.000,0.000,...,,,,,,,,,,
4,2018-07-01 04:00:00,,,,,,0.00,,0.017,0.000,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39452,2022-12-30 20:00:00,,,0.0,,,,,0.000,0.000,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39453,2022-12-30 21:00:00,,,0.0,,,,,0.000,0.000,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39454,2022-12-30 22:00:00,,,0.0,,,,,0.000,0.000,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
39455,2022-12-30 23:00:00,,,0.0,,,,,0.000,0.000,...,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# NB: when you plot the latitudes/longitudes on a globe, you'll see that the excluded weather stations lie in a different region
# Station metadata restricted to the study region: keep the descriptive
# columns, select stations with 6 < latitude < 15 and longitude > -16,
# and order the result by station code.
df = (
    pd.read_csv('metadata.csv')
    .loc[:, ['station_code', 'name', 'country', 'latitude', 'longitude', 'savanna', 'GMT+']]
    .loc[
        lambda stations: stations['latitude'].gt(6)
        & stations['latitude'].lt(15)
        & stations['longitude'].gt(-16)
    ]
    .sort_values('station_code')
)
df

Unnamed: 0,station_code,name,country,latitude,longitude,savanna,GMT+
1,TA00082,RÃ©seau MARP,BF,12.235800,1.281600,yes,0.0
2,TA00085,N'Djamena Airport,TD,12.138500,15.045583,yes,1.0
3,TA00088,Kin Kin Village,TD,11.568833,15.209333,yes,1.0
4,TA00096,Siramana,ML,11.593013,-5.714411,yes,0.0
5,TA00102,Unimaid WS2,NG,11.810040,13.209780,yes,1.0
...,...,...,...,...,...,...,...
63,TA00665,IITA Mokwa,NG,9.352427,5.017185,yes,1.0
64,TA00666,IITA Abuja,NG,9.163178,7.343015,yes,1.0
65,TA00682,INERA Saria,BF,12.278628,-2.157046,yes,0.0
66,TA00684,Balgo-Yargho,BF,12.044388,-0.256202,yes,0.0
