# **Download Packages**

In [9]:
!pip install scipy
!pip install xarray
!pip install netCDF4 xarray cftime h5netcdf --quiet
!pip install xarray[complete]
!pip install cftime --quiet



# **Load Manchester Data**

In [10]:
# Link to dataset dropbox provided in practical 6
# NOTE: despite the .nc name, the file saved here is opened as a zip archive by
# the following cell (zipfile.ZipFile("MAN_data.nc")), which extracts the
# individual netCDF members from it.
!wget "https://www.dropbox.com/scl/fo/dmabz9pf3167l62612h5b/h?rlkey=ge8u486w7w7vq8vnpr2f1fvag&e=2&dl=0" -O MAN_data.nc


--2025-03-26 23:35:08--  https://www.dropbox.com/scl/fo/dmabz9pf3167l62612h5b/h?rlkey=ge8u486w7w7vq8vnpr2f1fvag&e=2&dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:6019:18::a27d:412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc087015d11d50d8eec5c31ab32b.dl-eu.dropboxusercontent.com/zip_download_get/CKBUxENN7sp3ehI1gKji_uXjsLIrS8FDkaq_zLX7ysdVF92ioyXHWCeBhLDB9uJuC7LwIqL8wPF7wUn-RttvKQYEt6uNU7QgNr65qJROZ_095Q# [following]
--2025-03-26 23:35:08--  https://uc087015d11d50d8eec5c31ab32b.dl-eu.dropboxusercontent.com/zip_download_get/CKBUxENN7sp3ehI1gKji_uXjsLIrS8FDkaq_zLX7ysdVF92ioyXHWCeBhLDB9uJuC7LwIqL8wPF7wUn-RttvKQYEt6uNU7QgNr65qJROZ_095Q
Resolving uc087015d11d50d8eec5c31ab32b.dl-eu.dropboxusercontent.com (uc087015d11d50d8eec5c31ab32b.dl-eu.dropboxusercontent.com)... 162.125.4.15, 2620:100:601c:15::a27d:60f
Connecting to uc087015d11d50d8eec5c31ab32b.dl-eu.drop

In [11]:
# Unpack the downloaded archive (saved under the name MAN_data.nc) into a folder
import os
import zipfile

with zipfile.ZipFile("MAN_data.nc", mode='r') as archive:
    archive.extractall(path="unzipped_data")

# List what was extracted so the member file names are visible
os.listdir("unzipped_data")


['007_2006_2080_352_360.nc',
 '005_2006_2080_352_360.nc',
 '008_2006_2080_352_360.nc',
 '004_2006_2080_352_360.nc',
 '003_2006_2080_352_360.nc',
 '006_2006_2080_352_360.nc']

In [12]:
# Turn into datasets: one DataFrame per ensemble-member file, restricted to the
# single grid point nearest the target location.
import os

import xarray as xr

data_dir = "unzipped_data"
# Keep only netCDF members; sorted so members are processed in a stable order.
nc_files = sorted(name for name in os.listdir(data_dir) if name.endswith(".nc"))

MAN_data = {}

#Specify Manchester location
# (longitude is given on the files' 0-360 scale; 357.5 is wrapped to -2.5 later on)
target_lat = 53.246075
target_lon = 357.5


def _open_member(path):
    """Open one member file with raw (undecoded) time values.

    Tries the netcdf4 engine first and falls back to h5netcdf when xarray
    raises ValueError for the first engine.
    """
    try:
        return xr.open_dataset(path, engine="netcdf4", decode_times=False)
    except ValueError:
        return xr.open_dataset(path, engine="h5netcdf", decode_times=False)


for file in nc_files:
    file_path = os.path.join(data_dir, file)

    # File names start with the member number, e.g. "005_2006_2080_352_360.nc"
    prefix = file.split('_')[0]
    var_name = f"MAN_{prefix}"

    # The context manager closes the file handle once the point series has been
    # materialised as a DataFrame, so no dataset stays open after the loop.
    with _open_member(file_path) as ds:
        point_data = ds.sel(
            lon=target_lon,
            lat=target_lat,
            method="nearest"
        )
        MAN_data[var_name] = point_data.to_dataframe().reset_index()
print(MAN_data['MAN_005'].head())


   time  TREFMXAV_U       FLNS       FSNS         PRECT          PRSN  \
0   1.0  284.302612   8.737288   7.855509  1.544201e-07  4.051599e-16   
1   2.0  286.163269   6.686464   7.501073  7.784098e-08  0.000000e+00   
2   3.0  286.310303  27.445148  12.188718  4.851411e-08  7.075068e-18   
3   4.0  288.008820  10.443632   4.354691  1.091676e-07  1.429017e-14   
4   5.0  283.935760  70.927299  31.532597  2.531009e-09  6.418103e-18   

       QBOT      TREFHT      UBOT      VBOT        lat    lon  
0  0.005373  279.221954  2.303055  4.502803  53.246075  357.5  
1  0.007595  284.198425  4.657475  3.158464  53.246075  357.5  
2  0.005667  282.420227  5.083677  5.835154  53.246075  357.5  
3  0.007362  284.972076  3.029053  8.031938  53.246075  357.5  
4  0.004160  280.298279  3.783906  6.986506  53.246075  357.5  


In [13]:
# Per-member sanity check: columns with missing values and repeated time stamps
for member_name, member_df in MAN_data.items():
    print(f"\nChecking {member_name}")

    na_counts = member_df.isnull().sum()
    print("Missing values:")
    if na_counts.sum() > 0:
        # Only list the columns that actually contain gaps
        print(na_counts[na_counts > 0])
    else:
        print("None")

    n_repeated = member_df['time'].duplicated().sum()
    repeated_label = 'None' if not n_repeated > 0 else n_repeated
    print(f"Duplicate timestamps: {repeated_label}")
    print(f"Unique timestamps: {member_df['time'].nunique()} / Total rows: {len(member_df)}")



Checking MAN_003
Missing values:
None
Duplicate timestamps: None
Unique timestamps: 27374 / Total rows: 27374

Checking MAN_004
Missing values:
None
Duplicate timestamps: None
Unique timestamps: 27374 / Total rows: 27374

Checking MAN_005
Missing values:
None
Duplicate timestamps: None
Unique timestamps: 27374 / Total rows: 27374

Checking MAN_006
Missing values:
None
Duplicate timestamps: None
Unique timestamps: 27374 / Total rows: 27374

Checking MAN_007
Missing values:
None
Duplicate timestamps: None
Unique timestamps: 27374 / Total rows: 27374

Checking MAN_008
Missing values:
None
Duplicate timestamps: None
Unique timestamps: 27374 / Total rows: 27374


In [14]:
#Turn from count of days to datetime
import cftime
import pandas as pd

# First day of the series on the no-leap calendar; a time value of 1 maps to it.
noleap_origin = cftime.DatetimeNoLeap(2006, 1, 1)

for df in MAN_data.values():
    if 'time' not in df.columns:
        # Already converted on an earlier run of this cell: the conversion drops
        # 'time', so skip instead of failing with a KeyError on re-execution.
        continue
    # Offset each 1-based day count from the origin using no-leap arithmetic.
    noleap_dates = df['time'].apply(lambda x: noleap_origin + pd.to_timedelta(x - 1, unit='D'))
    # Round-trip through ISO strings to obtain ordinary pandas timestamps.
    df['date'] = pd.to_datetime([dt.strftime('%Y-%m-%d') for dt in noleap_dates])
    # Mutate in place on purpose: MAN_data keeps referring to these same frames.
    df.drop(columns='time', inplace=True)



In [15]:
# Display the whole dict of per-member DataFrames (now carrying 'date' instead of 'time')
MAN_data

{'MAN_003':        TREFMXAV_U       FLNS       FSNS         PRECT          PRSN      QBOT  \
 0      282.775848  41.925179  25.926952  4.663135e-09  4.781004e-17  0.004769   
 1      284.471130   8.905806  10.946910  8.046593e-08  1.295726e-16  0.006271   
 2      284.287964  16.511415   6.405902  2.910935e-09  4.094447e-16  0.005782   
 3      282.121094  29.948362   9.315041  7.432505e-11  0.000000e+00  0.004277   
 4      280.890900  19.064156   7.941241  6.894238e-12  0.000000e+00  0.003990   
 ...           ...        ...        ...           ...           ...       ...   
 27369  286.859070  55.167488  30.543959  4.768826e-08  0.000000e+00  0.006020   
 27370  285.209930  33.668793  15.180238  1.048248e-08  0.000000e+00  0.005895   
 27371  285.517090  20.443943  15.585098  1.590194e-09  0.000000e+00  0.006381   
 27372  283.791870  33.174114  18.295568  9.272739e-12  0.000000e+00  0.005410   
 27373  283.192780  14.260359   8.917209  2.529291e-10  0.000000e+00  0.005427   
 
   

# **Load Chess Data**

In [39]:
# Link to the CHESS-SCAPE tasmax .nc file (rcp85, bias-corrected, 1 km, monthly,
# 1980-12 to 2080-11 according to the file name); saved locally as CHESS_data.nc
!wget "https://www.dropbox.com/scl/fi/l49mccyqqejo3ohve5ibi/chess-scape_rcp85_bias-corrected_01_tasmax_uk_1km_monthly_19801201-20801130.nc?rlkey=ypcwdy6e7ln00qm91mml76akg&st=0tujdw5t&dl=0" -O CHESS_data.nc



--2025-03-27 00:22:04--  https://www.dropbox.com/scl/fi/l49mccyqqejo3ohve5ibi/chess-scape_rcp85_bias-corrected_01_tasmax_uk_1km_monthly_19801201-20801130.nc?rlkey=ypcwdy6e7ln00qm91mml76akg&st=0tujdw5t&dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:6019:18::a27d:412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc4dbe5f457559dc58fc4c0364b5.dl.dropboxusercontent.com/cd/0/inline/CmoWLEKGrICqurhFEEzIf9M4crcyXdalYj2EDe0KH8RhSeL2Tcsupc0Sz4d-psKlnHVpsOaMERG4sutRQEwtK5zkHv3S8c2VdMF8N6NcrgsISpGmpF9QMa5QHccR7VbpmnkVelUyyVA17Y9g5BMDxEl5/file# [following]
--2025-03-27 00:22:04--  https://uc4dbe5f457559dc58fc4c0364b5.dl.dropboxusercontent.com/cd/0/inline/CmoWLEKGrICqurhFEEzIf9M4crcyXdalYj2EDe0KH8RhSeL2Tcsupc0Sz4d-psKlnHVpsOaMERG4sutRQEwtK5zkHv3S8c2VdMF8N6NcrgsISpGmpF9QMa5QHccR7VbpmnkVelUyyVA17Y9g5BMDxEl5/file
Resolving uc4dbe5f457559dc58fc4c0364b5.dl.dropboxusercontent.c

In [40]:
# Diagnostic only: print the installed netCDF4 package metadata (version, location)
!pip show netcdf4

Name: netCDF4
Version: 1.7.2
Summary: Provides an object-oriented python interface to the netCDF version 4 library
Home-page: 
Author: 
Author-email: Jeff Whitaker <jeffrey.s.whitaker@noaa.gov>
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: certifi, cftime, numpy
Required-by: 


In [41]:
import xarray as xr

# Open the downloaded tasmax file and print its structure (dimensions, coordinates, variables)
chess_path = "CHESS_data.nc"
ds_chess = xr.open_dataset(chess_path)
print(ds_chess)


<xarray.Dataset> Size: 3GB
Dimensions:    (y: 1057, x: 656, time: 1200, bnds: 2)
Coordinates:
    lat        (y, x) float32 3MB ...
    lon        (y, x) float32 3MB ...
  * time       (time) object 10kB 1980-12-16 00:00:00 ... 2080-11-16 00:00:00
  * x          (x) float32 3kB 500.0 1.5e+03 2.5e+03 ... 6.545e+05 6.555e+05
  * y          (y) float32 4kB 500.0 1.5e+03 2.5e+03 ... 1.056e+06 1.056e+06
Dimensions without coordinates: bnds
Data variables:
    time_bnds  (time, bnds) object 19kB ...
    x_bnds     (x, bnds) float32 5kB ...
    y_bnds     (y, bnds) float32 8kB ...
    crsOSGB    int32 4B ...
    tasmax     (time, y, x) float32 3GB ...
Attributes: (12/35)
    title:                         CHESS-SCAPE: Future projections of meteoro...
    summary:                       Gridded daily meteorological variables ove...
    source:                        This dataset was derived from four ensembl...
    cdm_data_type:                 grid
    standard_name_vocabulary:      CF Standa

In [42]:
# Reduce the gridded tasmax field to one monthly series at the grid cell closest
# to the target point, shaped like the MME frames (2006-2080 only).
import numpy as np

target_lat = 53.246075
target_lon = -2.5

# lat/lon are 2-D (y, x) coordinate grids, so search them directly
lat_grid = ds_chess['lat'].values
lon_grid = ds_chess['lon'].values

# Straight-line distance in degree space from every cell to the target point
dist = np.sqrt((lat_grid - target_lat)**2 + (lon_grid - target_lon)**2)
y_idx, x_idx = min_idx = np.unravel_index(np.argmin(dist), dist.shape)

# tasmax is laid out as (time, y, x); pick the single nearest cell
tasmax_series = ds_chess['tasmax'].isel(y=y_idx, x=x_idx)
chess_df = (
    tasmax_series
    .to_dataframe()
    .reset_index()
    .rename(columns={'tasmax': 'tasmax_chess'})
)

# The time axis holds non-pandas date objects: go through strings; anything
# unparseable becomes NaT
chess_df['time'] = pd.to_datetime(chess_df['time'].astype(str), errors='coerce')
chess_df['year'] = chess_df['time'].dt.year
chess_df['month'] = chess_df['time'].dt.month

# Keep the period shared with the MME data and discard the projected x/y coordinates
in_period = chess_df['year'].between(2006, 2080)
chess_df = chess_df[in_period].drop(columns=['x', 'y'])

# Wrap any 0-360 longitudes into the -180..180 range
chess_df['lon'] = chess_df['lon'].apply(lambda value: value if not value > 180 else value - 360)
print("Longitude range:", chess_df['lon'].min(), "to", chess_df['lon'].max())

column_order = ['time', 'year', 'month', 'lat', 'lon', 'tasmax_chess']
chess_df = chess_df[column_order]
chess_df

Longitude range: -2.5021114349365234 to -2.5021114349365234


Unnamed: 0,time,year,month,lat,lon,tasmax_chess
301,2006-01-16,2006,1,53.24818,-2.502111,281.001984
302,2006-02-16,2006,2,53.24818,-2.502111,281.770294
303,2006-03-16,2006,3,53.24818,-2.502111,283.746277
304,2006-04-16,2006,4,53.24818,-2.502111,284.548462
305,2006-05-16,2006,5,53.24818,-2.502111,286.758942
...,...,...,...,...,...,...
1195,2080-07-16,2080,7,53.24818,-2.502111,295.305359
1196,2080-08-16,2080,8,53.24818,-2.502111,297.577484
1197,2080-09-16,2080,9,53.24818,-2.502111,295.929199
1198,2080-10-16,2080,10,53.24818,-2.502111,292.206055


We now have a dataset of 899 monthly tasmax entries for Manchester.

In [43]:
# Download the precipitation file (pr.nc) and save it locally as PR_data.nc
!wget "https://www.dropbox.com/scl/fi/tsqydr20o3phmz3yi3d1w/pr.nc?rlkey=xungz3xphv96zazjrrd0hr0ix&st=i126jgpj&dl=0" -O PR_data.nc


--2025-03-27 00:22:19--  https://www.dropbox.com/scl/fi/tsqydr20o3phmz3yi3d1w/pr.nc?rlkey=xungz3xphv96zazjrrd0hr0ix&st=i126jgpj&dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:6019:18::a27d:412
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ucbb2dc9faf760cbc84aec5b2cd4.dl.dropboxusercontent.com/cd/0/inline/CmqO-n_SOOYWBjC8mL9SzIs6OK9PJUJYEYZSSYxaOikq1UNJ8ebah_-klLVaJAIBHz4QeG-FZvJxZhVNWImTQjq_I72LxZ98190RfbwfG3B04xZMKAbMVItCxS54xQbQ19HXjJKSi2Qt2GlDcmgPM9su/file# [following]
--2025-03-27 00:22:19--  https://ucbb2dc9faf760cbc84aec5b2cd4.dl.dropboxusercontent.com/cd/0/inline/CmqO-n_SOOYWBjC8mL9SzIs6OK9PJUJYEYZSSYxaOikq1UNJ8ebah_-klLVaJAIBHz4QeG-FZvJxZhVNWImTQjq_I72LxZ98190RfbwfG3B04xZMKAbMVItCxS54xQbQ19HXjJKSi2Qt2GlDcmgPM9su/file
Resolving ucbb2dc9faf760cbc84aec5b2cd4.dl.dropboxusercontent.com (ucbb2dc9faf760cbc84aec5b2cd4.dl.dropboxusercontent.com)... 162.125.4.

In [44]:
# Download the CHESS-SCAPE rsds file (rcp85, bias-corrected, 1 km, monthly per
# the file name) and save it locally as RSDS_data.nc
!wget "https://www.dropbox.com/scl/fi/cpes0xs9dvxz8acafao01/chess-scape_rcp85_bias-corrected_01_rsds_uk_1km_monthly_19801201-20801130.nc?rlkey=c2w1152tirfm4gn494sfuncpf&st=jet68c0x&dl=0" -O RSDS_data.nc


--2025-03-27 00:22:45--  https://www.dropbox.com/scl/fi/cpes0xs9dvxz8acafao01/chess-scape_rcp85_bias-corrected_01_rsds_uk_1km_monthly_19801201-20801130.nc?rlkey=c2w1152tirfm4gn494sfuncpf&st=jet68c0x&dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:601c:18::a27d:612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc5d6a7f2308481af731fb5bfd2c.dl.dropboxusercontent.com/cd/0/inline/Cmrpq5OJM0YmN9FJzXZQwoqj6kpZMK0nfe4O6Ql1cLyLJjPtk3bfVgCAuTSrhJZnpvkW5mcp5w1D6cmNwQtrM1k0yFePn4MTIqGMqCLnuS5_zXSSo90d7vDn_rwcfmO6WrbZaLWc64JtKVbY4lDefQGq/file# [following]
--2025-03-27 00:22:45--  https://uc5d6a7f2308481af731fb5bfd2c.dl.dropboxusercontent.com/cd/0/inline/Cmrpq5OJM0YmN9FJzXZQwoqj6kpZMK0nfe4O6Ql1cLyLJjPtk3bfVgCAuTSrhJZnpvkW5mcp5w1D6cmNwQtrM1k0yFePn4MTIqGMqCLnuS5_zXSSo90d7vDn_rwcfmO6WrbZaLWc64JtKVbY4lDefQGq/file
Resolving uc5d6a7f2308481af731fb5bfd2c.dl.dropboxusercontent.com

In [45]:
# Open the two additional files (rsds first, then pr) and inspect the rsds structure
ds_rsds, ds_pr = (xr.open_dataset(path) for path in ("RSDS_data.nc", "PR_data.nc"))
print(ds_rsds)


<xarray.Dataset> Size: 3GB
Dimensions:    (y: 1057, x: 656, time: 1200, bnds: 2)
Coordinates:
    lat        (y, x) float32 3MB ...
    lon        (y, x) float32 3MB ...
  * time       (time) object 10kB 1980-12-16 00:00:00 ... 2080-11-16 00:00:00
  * x          (x) float32 3kB 500.0 1.5e+03 2.5e+03 ... 6.545e+05 6.555e+05
  * y          (y) float32 4kB 500.0 1.5e+03 2.5e+03 ... 1.056e+06 1.056e+06
Dimensions without coordinates: bnds
Data variables:
    time_bnds  (time, bnds) object 19kB ...
    x_bnds     (x, bnds) float32 5kB ...
    y_bnds     (y, bnds) float32 8kB ...
    crsOSGB    int32 4B ...
    rsds       (time, y, x) float32 3GB ...
Attributes: (12/35)
    title:                         CHESS-SCAPE: Future projections of meteoro...
    summary:                       Gridded daily meteorological variables ove...
    source:                        This dataset was derived from four ensembl...
    cdm_data_type:                 grid
    standard_name_vocabulary:      CF Standa

In [46]:
# Process rsds to match chess: same nearest-cell extraction and tidy-up as for tasmax
import numpy as np

target_lat = 53.246075
target_lon = -2.5

# 2-D (y, x) coordinate grids of the rsds file
lat_grid = ds_rsds['lat'].values
lon_grid = ds_rsds['lon'].values

# Degree-space distance of every cell from the target point
dist = np.sqrt((lat_grid - target_lat)**2 + (lon_grid - target_lon)**2)
y_idx, x_idx = min_idx = np.unravel_index(np.argmin(dist), dist.shape)

print(f"Closest grid cell to Manchester is at x={x_idx}, y={y_idx}")

# rsds is laid out as (time, y, x); take the series at the nearest cell and
# store it under the column name used for the matching MME variable
rsds_series = ds_rsds['rsds'].isel(y=y_idx, x=x_idx)
rsds_df = (
    rsds_series
    .to_dataframe()
    .reset_index()
    .rename(columns={'rsds': 'FSNS_chess'})
)

# String round-trip turns the file's date objects into pandas timestamps (NaT on failure)
rsds_df['time'] = pd.to_datetime(rsds_df['time'].astype(str), errors='coerce')
rsds_df['year'] = rsds_df['time'].dt.year
rsds_df['month'] = rsds_df['time'].dt.month

# Restrict to 2006-2080 and drop the projected x/y coordinates
in_period = rsds_df['year'].between(2006, 2080)
rsds_df = rsds_df[in_period].drop(columns=['x', 'y'])

# Wrap any 0-360 longitudes into the -180..180 range
rsds_df['lon'] = rsds_df['lon'].apply(lambda value: value if not value > 180 else value - 360)
print("Longitude range:", rsds_df['lon'].min(), "to", rsds_df['lon'].max())

column_order = ['time', 'year', 'month', 'lat', 'lon', 'FSNS_chess']
rsds_df = rsds_df[column_order]




Closest grid cell to Manchester is at x=366, y=372
Longitude range: -2.5021114349365234 to -2.5021114349365234


In [47]:
# Process pr to match chess: extract the precipitation series at the grid cell
# nearest the target point and tidy it like the tasmax / rsds frames.
import numpy as np

# 2-D coordinate grids of the pr file
lat_grid = ds_pr['lat'].values
lon_grid = ds_pr['lon'].values

target_lat = 53.246075
target_lon = -2.5

# Degree-space distance of every cell from the target point
dist = np.sqrt((lat_grid - target_lat)**2 + (lon_grid - target_lon)**2)
min_idx = np.unravel_index(np.argmin(dist), dist.shape)
y_idx, x_idx = min_idx

print(f"Closest grid cell to Manchester is at x={x_idx}, y={y_idx}")
pr_series = ds_pr['pr'][:, y_idx, x_idx]
pr_df = pr_series.to_dataframe().reset_index()
# (A bare `pr_df` expression used to sit here; mid-cell it displayed nothing
# and has been removed.)

# Store the series under the column name used for the matching MME variable
pr_df = pr_df.rename(columns={'pr': 'PRECT_chess'})

# String round-trip turns the file's date objects into pandas timestamps;
# values that cannot be parsed become NaT and fall out of the year filter below.
pr_df['time'] = pr_df['time'].astype(str)
pr_df['time'] = pd.to_datetime(pr_df['time'], errors='coerce')
pr_df['year'] = pr_df['time'].dt.year
pr_df['month'] = pr_df['time'].dt.month
pr_df = pr_df[(pr_df['year'] >= 2006) & (pr_df['year'] <= 2080)]

# Projected x/y coordinates are not needed downstream
pr_df = pr_df.drop(columns=['x', 'y'])
# Wrap any 0-360 longitudes into the -180..180 range
pr_df['lon'] = pr_df['lon'].apply(lambda lon: lon - 360 if lon > 180 else lon)
print("Longitude range:", pr_df['lon'].min(), "to", pr_df['lon'].max())
pr_df = pr_df[['time', 'year', 'month', 'lat', 'lon', 'PRECT_chess']]

Closest grid cell to Manchester is at x=366, y=372
Longitude range: -2.5021114349365234 to -2.5021114349365234


# **Merge Data**

In [48]:
# Merge MAN_data: tag every member frame with its model code, derive year/month
# from the date, fix the column order, and stack all members into one frame.

mme_columns = ['date', 'year', 'month', 'TREFMXAV_U', 'FLNS', 'FSNS', 'PRECT', 'PRSN',
               'QBOT', 'TREFHT', 'UBOT', 'VBOT', 'lat', 'lon', 'model']

cleaned_mme = [
    member_df.assign(
        # "MAN_005" -> "005"
        model=member_key.split("_")[1],
        # assign evaluates these in order, so year/month see the converted date
        date=lambda frame: pd.to_datetime(frame['date']),
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
    )[mme_columns]
    for member_key, member_df in MAN_data.items()
]

mme_df = pd.concat(cleaned_mme, ignore_index=True)



In [49]:
# Display the stacked multi-member frame (one row per member per day)
mme_df

Unnamed: 0,date,year,month,TREFMXAV_U,FLNS,FSNS,PRECT,PRSN,QBOT,TREFHT,UBOT,VBOT,lat,lon,model
0,2006-01-01,2006,1,282.775848,41.925179,25.926952,4.663135e-09,4.781004e-17,0.004769,279.142883,3.878579,1.394184,53.246075,357.5,003
1,2006-01-02,2006,1,284.471130,8.905806,10.946910,8.046593e-08,1.295726e-16,0.006271,281.148651,1.788157,3.821703,53.246075,357.5,003
2,2006-01-03,2006,1,284.287964,16.511415,6.405902,2.910935e-09,4.094447e-16,0.005782,281.223785,0.804845,-2.299140,53.246075,357.5,003
3,2006-01-04,2006,1,282.121094,29.948362,9.315041,7.432505e-11,0.000000e+00,0.004277,278.397675,-1.100267,-1.258029,53.246075,357.5,003
4,2006-01-05,2006,1,280.890900,19.064156,7.941241,6.894238e-12,0.000000e+00,0.003990,278.063782,-0.706321,-0.938421,53.246075,357.5,003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164239,2080-12-26,2080,12,286.378754,17.438847,6.720299,3.892051e-08,7.616115e-15,0.006004,282.548523,2.741483,7.565282,53.246075,357.5,008
164240,2080-12-27,2080,12,286.621307,26.986507,12.976557,2.301632e-07,6.722780e-16,0.005813,281.885712,4.295599,7.222901,53.246075,357.5,008
164241,2080-12-28,2080,12,282.327393,41.652626,16.614481,2.653475e-08,1.264485e-15,0.004281,279.076965,3.328679,4.574871,53.246075,357.5,008
164242,2080-12-29,2080,12,281.410248,56.226501,24.193817,1.709695e-08,2.954024e-17,0.004289,277.916626,2.779102,4.442761,53.246075,357.5,008


In [50]:
# Prep chess to match mme dataset: same column names, same column order,
# NaN placeholders for variables the chess frame does not carry yet.

placeholder_vars = ['FLNS', 'FSNS', 'PRECT', 'PRSN', 'QBOT', 'TREFHT', 'UBOT', 'VBOT']
schema_columns = ['date', 'year', 'month', 'TREFMXAV_U', 'FLNS', 'FSNS', 'PRECT', 'PRSN',
                  'QBOT', 'TREFHT', 'UBOT', 'VBOT', 'lat', 'lon', 'model']

# rename returns a new frame, so the frame built earlier is not modified
chess_df = chess_df.rename(columns={'tasmax_chess': 'TREFMXAV_U'})
chess_df = chess_df.assign(**dict.fromkeys(placeholder_vars, np.nan))

# Monthly values are stamped on the 15th of their month
month_stamps = chess_df[['year', 'month']].assign(day=15)
chess_df['date'] = pd.to_datetime(month_stamps)

# Model code that identifies the chess rows after stacking
chess_df['model'] = '0085'

chess_df = chess_df[schema_columns]
chess_df

Unnamed: 0,date,year,month,TREFMXAV_U,FLNS,FSNS,PRECT,PRSN,QBOT,TREFHT,UBOT,VBOT,lat,lon,model
301,2006-01-15,2006,1,281.001984,,,,,,,,,53.24818,-2.502111,0085
302,2006-02-15,2006,2,281.770294,,,,,,,,,53.24818,-2.502111,0085
303,2006-03-15,2006,3,283.746277,,,,,,,,,53.24818,-2.502111,0085
304,2006-04-15,2006,4,284.548462,,,,,,,,,53.24818,-2.502111,0085
305,2006-05-15,2006,5,286.758942,,,,,,,,,53.24818,-2.502111,0085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,2080-07-15,2080,7,295.305359,,,,,,,,,53.24818,-2.502111,0085
1196,2080-08-15,2080,8,297.577484,,,,,,,,,53.24818,-2.502111,0085
1197,2080-09-15,2080,9,295.929199,,,,,,,,,53.24818,-2.502111,0085
1198,2080-10-15,2080,10,292.206055,,,,,,,,,53.24818,-2.502111,0085


In [51]:
# Merge pr and rsds with chess: bring the monthly precipitation and shortwave
# series in as the PRECT and FSNS columns of chess_df.

# Give the source series the column names used in the merged schema.
pr_df = pr_df.rename(columns={'PRECT_chess': 'PRECT'})
rsds_df = rsds_df.rename(columns={'FSNS_chess': 'FSNS'})

# Remove the NaN placeholder columns first so the merges attach PRECT/FSNS under
# their final names instead of producing *_x / *_y pairs that then need cleanup.
# errors='ignore' keeps the cell usable when the placeholders are absent, and
# makes a re-run simply replace the previously merged values.
chess_df = chess_df.drop(columns=['PRECT', 'FSNS'], errors='ignore')

for source_df, variable in ((pr_df, 'PRECT'), (rsds_df, 'FSNS')):
    chess_df = chess_df.merge(
        source_df[['year', 'month', variable]],
        on=['year', 'month'],
        how='left',
        # Fail loudly if a (year, month) key repeats in the source series,
        # which would otherwise silently duplicate chess rows.
        validate='many_to_one',
    )


In [52]:
# Display the chess frame after PRECT and FSNS have been merged in
chess_df

Unnamed: 0,date,year,month,TREFMXAV_U,FLNS,PRSN,QBOT,TREFHT,UBOT,VBOT,lat,lon,model,PRECT,FSNS
0,2006-01-15,2006,1,281.001984,,,,,,,53.24818,-2.502111,0085,0.000038,22.417812
1,2006-02-15,2006,2,281.770294,,,,,,,53.24818,-2.502111,0085,0.000027,48.379295
2,2006-03-15,2006,3,283.746277,,,,,,,53.24818,-2.502111,0085,0.000023,95.018860
3,2006-04-15,2006,4,284.548462,,,,,,,53.24818,-2.502111,0085,0.000024,142.946579
4,2006-05-15,2006,5,286.758942,,,,,,,53.24818,-2.502111,0085,0.000022,175.253571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,2080-07-15,2080,7,295.305359,,,,,,,53.24818,-2.502111,0085,0.000037,215.390106
895,2080-08-15,2080,8,297.577484,,,,,,,53.24818,-2.502111,0085,0.000010,197.854187
896,2080-09-15,2080,9,295.929199,,,,,,,53.24818,-2.502111,0085,0.000006,147.282272
897,2080-10-15,2080,10,292.206055,,,,,,,53.24818,-2.502111,0085,0.000021,74.600731


In [54]:
# Merge chess & mme into full_df & fix Kelvin -> Celsius

# Columns that are entirely NaN in chess_df (the placeholders) are left out of
# the concat: pandas deprecates letting all-NA columns take part in dtype
# resolution and warns about it. The columns still appear in full_df, filled
# with NaN for the chess rows, because mme_df provides them.
chess_with_data = chess_df.dropna(axis=1, how='all')
full_df = pd.concat([mme_df, chess_with_data], ignore_index=True)

# Only TREFMXAV_U is converted from Kelvin to Celsius.
# NOTE(review): TREFHT is left as loaded (values around 280 in the data shown
# earlier) - confirm that leaving it unconverted is intended.
full_df['TREFMXAV_U'] = full_df['TREFMXAV_U'] - 273.15


print(full_df['model'].value_counts())
full_df

model
003     27374
004     27374
005     27374
006     27374
007     27374
008     27374
0085      899
Name: count, dtype: int64


  full_df = pd.concat([mme_df, chess_df], ignore_index=True)


Unnamed: 0,date,year,month,TREFMXAV_U,FLNS,FSNS,PRECT,PRSN,QBOT,TREFHT,UBOT,VBOT,lat,lon,model
0,2006-01-01,2006,1,9.625854,41.925179,25.926952,4.663135e-09,4.781004e-17,0.004769,279.142883,3.878579,1.394184,53.246075,357.500000,003
1,2006-01-02,2006,1,11.321136,8.905806,10.946910,8.046593e-08,1.295726e-16,0.006271,281.148651,1.788157,3.821703,53.246075,357.500000,003
2,2006-01-03,2006,1,11.137970,16.511415,6.405902,2.910935e-09,4.094447e-16,0.005782,281.223785,0.804845,-2.299140,53.246075,357.500000,003
3,2006-01-04,2006,1,8.971100,29.948362,9.315041,7.432505e-11,0.000000e+00,0.004277,278.397675,-1.100267,-1.258029,53.246075,357.500000,003
4,2006-01-05,2006,1,7.740906,19.064156,7.941241,6.894238e-12,0.000000e+00,0.003990,278.063782,-0.706321,-0.938421,53.246075,357.500000,003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165138,2080-07-15,2080,7,22.155365,,215.390106,4.232548e-13,,,,,,53.248180,-2.502111,0085
165139,2080-08-15,2080,8,24.427490,,197.854187,1.110259e-13,,,,,,53.248180,-2.502111,0085
165140,2080-09-15,2080,9,22.779205,,147.282272,7.228998e-14,,,,,,53.248180,-2.502111,0085
165141,2080-10-15,2080,10,19.056061,,74.600731,2.394742e-13,,,,,,53.248180,-2.502111,0085


# **Validate Data**

In [56]:
# Negative Values: report and clamp negatives in variables treated as non-negative

vars_never_negative = ['PRECT', 'PRSN', 'FLNS', 'FSNS', 'QBOT']

# Count negatives per column before touching anything, so the report reflects the raw data
neg_check = full_df[vars_never_negative].lt(0).sum()
print("Negative value counts:\n", neg_check)

for col in vars_never_negative:
    # Vectorised clamp at zero; NaN entries stay NaN. The float64 cast keeps the
    # column dtype that the previous element-wise apply produced, so the values
    # written to CSV later are formatted the same way.
    full_df[col] = full_df[col].clip(lower=0).astype('float64')


Negative value counts:
 PRECT    0
PRSN     0
FLNS     0
FSNS     0
QBOT     0
dtype: int64


In [59]:
# Summary statistics for the date and numeric columns of the merged frame
full_df.describe()

Unnamed: 0,date,year,month,TREFMXAV_U,FLNS,FSNS,PRECT,PRSN,QBOT,TREFHT,UBOT,VBOT,lat,lon
count,165143,165143.0,165143.0,165143.0,164244.0,165143.0,165143.0,164244.0,164244.0,164244.0,164244.0,164244.0,165143.0,165143.0
mean,2043-07-01 17:50:58.374862848,2042.998432,6.525654,15.744396,43.101493,97.295389,3.358928e-08,3.165129e-10,0.006545,284.932526,1.208071,1.516496,53.246101,355.540232
min,2006-01-01 00:00:00,2006.0,1.0,-1.822937,0.0,1.445412,0.0,0.0,0.001555,266.015381,-9.797097,-11.123035,53.246075,-2.502111
25%,2024-10-01 00:00:00,2024.0,4.0,11.659805,26.179801,31.26635,1.979676e-09,0.0,0.004879,281.075012,-1.041753,-0.701179,53.246075,357.5
50%,2043-07-01 00:00:00,2043.0,7.0,15.240723,40.78441,78.376495,1.266884e-08,6.143178000000001e-22,0.006246,284.656403,1.518637,1.617521,53.246075,357.5
75%,2062-04-01 00:00:00,2062.0,10.0,19.824814,57.289759,150.102432,4.563117e-08,2.414438e-16,0.007988,288.995483,3.401309,3.621754,53.246075,357.5
max,2080-12-30 00:00:00,2080.0,12.0,38.107269,118.209473,310.974915,8.357912e-07,4.985002e-07,0.018279,303.117981,12.690058,12.794631,53.24818,357.5
std,,21.647895,3.447774,5.23517,21.538836,75.151348,5.045284e-08,5.173666e-09,0.002123,4.986011,3.061526,2.922178,0.126948,26.489319


In [60]:
# Fix longitude: wrap values above 180 into the -180..180 range for every row
# that is not from the chess model ('0085'), whose longitude was wrapped earlier.
mme_mask = full_df['model'].ne('0085')

mme_longitudes = full_df.loc[mme_mask, 'lon']
full_df.loc[mme_mask, 'lon'] = mme_longitudes.apply(
    lambda value: value if not value > 180 else value - 360
)


In [61]:
# Basic overview: shape, column dtypes, rows per model, and missing values per column
overview_items = (
    full_df.shape,
    full_df.dtypes,
    full_df['model'].value_counts(),
    full_df.isna().sum(),
)
for item in overview_items:
    print(item)


(165143, 15)
date          datetime64[ns]
year                   int32
month                  int32
TREFMXAV_U           float32
FLNS                 float64
FSNS                 float64
PRECT                float64
PRSN                 float64
QBOT                 float64
TREFHT               float32
UBOT                 float32
VBOT                 float32
lat                  float32
lon                  float64
model                 object
dtype: object
model
003     27374
004     27374
005     27374
006     27374
007     27374
008     27374
0085      899
Name: count, dtype: int64
date            0
year            0
month           0
TREFMXAV_U      0
FLNS          899
FSNS            0
PRECT           0
PRSN          899
QBOT          899
TREFHT        899
UBOT          899
VBOT          899
lat             0
lon             0
model           0
dtype: int64


In [69]:
# 1. Check common variables between Chess and MME: per-model summary statistics
per_model_checks = (
    ("TREFMXAV_U value check per model", 'TREFMXAV_U'),
    ("year FSNS check per model", 'FSNS'),
)

for heading, column in per_model_checks:
    print(heading)
    print(full_df.groupby('model')[column].describe())

    # Blank separator after each block
    print("\n")



TREFMXAV_U value check per model
         count       mean       std       min        25%        50%  \
model                                                                 
003    27374.0  15.686185  5.246464  1.003204  11.569466  15.149902   
004    27374.0  15.700686  5.202891 -0.825134  11.598122  15.225266   
005    27374.0  15.784789  5.201962  0.918610  11.754562  15.211945   
006    27374.0  15.923167  5.163857 -0.626282  11.849289  15.398026   
007    27374.0  15.685923  5.290716 -1.822937  11.567924  15.250763   
008    27374.0  15.702014  5.283375 -0.152435  11.673470  15.229797   
0085     899.0  15.244807  5.714432  3.789032  10.229218  14.668976   

             75%        max  
model                        
003    19.719528  38.107269  
004    19.829247  34.225830  
005    19.867134  33.341034  
006    19.967567  37.395142  
007    19.779549  34.530701  
008    19.792931  36.813934  
0085   19.922592  31.322357  


year FSNS check per model
         count        mean   

In [80]:
# Final checks: temperature range, date span, missing values, per-model feature
# coverage, dtypes and row counts of the merged frame.
import pandas as pd
import numpy as np

separator = "\n"

# --- target temperature per model
print("TREFMXAV_U value check per model (should be 0-30 C):")
print(full_df.groupby('model')['TREFMXAV_U'].describe())
print(separator)

# --- overall date span
date_column = full_df['date']
print("Date range check:")
print(f"Min date: {date_column.min()}")
print(f"Max date: {date_column.max()}")
print(separator)

# --- missing values
print("Missing values per column:")
print(full_df.isnull().sum())
print(separator)

# --- non-null counts of every feature, one block per model in order of appearance
expected_cols = ['TREFMXAV_U', 'FLNS', 'FSNS', 'PRECT', 'PRSN', 'QBOT', 'TREFHT', 'UBOT', 'VBOT']
models = full_df['model'].unique()

print("Feature presence check per model:")
for model_code in models:
    print(f"\nModel {model_code}")
    model_rows = full_df[full_df['model'] == model_code]
    print(model_rows[expected_cols].notnull().sum())
print(separator)

# --- dtypes
print("Data types:")
print(full_df.dtypes)
print(separator)

# --- rows per model
print("Row counts per model:")
print(full_df['model'].value_counts())



TREFMXAV_U value check per model (should be 0-30 C):
         count       mean       std       min        25%        50%  \
model                                                                 
003    27374.0  15.686185  5.246464  1.003204  11.569466  15.149902   
004    27374.0  15.700686  5.202891 -0.825134  11.598122  15.225266   
005    27374.0  15.784789  5.201962  0.918610  11.754562  15.211945   
006    27374.0  15.923167  5.163857 -0.626282  11.849289  15.398026   
007    27374.0  15.685923  5.290716 -1.822937  11.567924  15.250763   
008    27374.0  15.702014  5.283375 -0.152435  11.673470  15.229797   
0085     899.0  15.244807  5.714432  3.789032  10.229218  14.668976   

             75%        max  
model                        
003    19.719528  38.107269  
004    19.829247  34.225830  
005    19.867134  33.341034  
006    19.967567  37.395142  
007    19.779549  34.530701  
008    19.792931  36.813934  
0085   19.922592  31.322357  


Date range check:
Min date: 2006-01

In [81]:
# Download: write the merged frame to CSV (without the index column), then hand
# the file to the browser through google.colab's files.download helper.
full_df.to_csv("full_dataset.csv", index=False)
from google.colab import files
files.download("full_dataset.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>