In [1]:
import netCDF4 as nc
import numpy as np
import pandas as pd
import statsmodels.api as sm
import xarray as xr
import cartopy.crs as ccrs
import matplotlib.pyplot as plt



In [2]:
# Directory holding the ACCESS-CM2 historical NetCDF files. This is the only
# machine-specific value in the cell: repoint it to run the notebook elsewhere.
DATA_DIR = 'C:/Users/Windows10/Documents/GitHub/NCfilesmetro/GCMfromstations'

# Part of the file name shared by all four files; only the leading variable
# name differs between them.
RUN_ID = 'Amon_ACCESS-CM2_historical_r1i1p1f1_gn_185001-201412'


def _hist_path(variable):
    """Return the path of the historical-run NetCDF file for ``variable``."""
    return f'{DATA_DIR}/{variable}_{RUN_ID}.nc'


file_prhist = _hist_path('pr')
file_tashist = _hist_path('tas')
file_tasmaxhist = _hist_path('tasmax')
file_tasminhist = _hist_path('tasmin')



In [3]:
# Open the four NetCDF files as xarray datasets, one per variable, in the
# order pr, tas, tasmax, tasmin.
ds_prhist, ds_tashist, ds_tasmaxhist, ds_tasminhist = (
    xr.open_dataset(nc_path)
    for nc_path in (file_prhist, file_tashist, file_tasmaxhist, file_tasminhist)
)


In [4]:
# List the data variables of each dataset, one listing per line group.
# A single print call is used on purpose: print returns None, so wrapping four
# print calls in a tuple makes the cell echo "(None, None, None, None)" as its
# result after the listings.
print(
    ds_prhist.data_vars,
    ds_tashist.data_vars,
    ds_tasmaxhist.data_vars,
    ds_tasminhist.data_vars,
    sep='\n',
)

Data variables:
    time_bnds  (time, bnds) datetime64[ns] 32kB ...
    lat_bnds   (lat, bnds) float64 2kB ...
    lon_bnds   (lon, bnds) float64 3kB ...
    pr         (time, lat, lon) float32 219MB ...
Data variables:
    time_bnds  (time, bnds) datetime64[ns] 32kB ...
    lat_bnds   (lat, bnds) float64 2kB ...
    lon_bnds   (lon, bnds) float64 3kB ...
    tas        (time, lat, lon) float32 219MB ...
Data variables:
    time_bnds  (time, bnds) datetime64[ns] 32kB ...
    lat_bnds   (lat, bnds) float64 2kB ...
    lon_bnds   (lon, bnds) float64 3kB ...
    tasmax     (time, lat, lon) float32 219MB ...
Data variables:
    time_bnds  (time, bnds) datetime64[ns] 32kB ...
    lat_bnds   (lat, bnds) float64 2kB ...
    lon_bnds   (lon, bnds) float64 3kB ...
    tasmin     (time, lat, lon) float32 219MB ...


(None, None, None, None)

In [5]:
# Show the `pr` dataset as this cell's output.
ds_prhist

In [6]:
import dask
import dask.dataframe as dd


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [7]:
# Number of time steps per chunk, shared by all four datasets.
TIME_CHUNK = 100


def _chunk_and_flatten(ds, variable, time_chunk=TIME_CHUNK):
    """Chunk ``ds`` along ``time`` and flatten one variable into a DataFrame.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset to re-chunk.
    variable : str
        Name of the data variable to convert.
    time_chunk : int, optional
        Chunk length along the ``time`` dimension.

    Returns
    -------
    tuple
        ``(chunked_ds, frame)`` where ``frame`` is ``chunked_ds[variable]``
        converted with ``to_dataframe()`` and its index reset into columns.
    """
    chunked = ds.chunk({'time': time_chunk})
    return chunked, chunked[variable].to_dataframe().reset_index()


# Each dataset name is rebound to its chunked version, and a flat DataFrame
# of the dataset's main variable is produced alongside it.
ds_prhist, ds_prhist_df = _chunk_and_flatten(ds_prhist, 'pr')
ds_tashist, ds_tashist_df = _chunk_and_flatten(ds_tashist, 'tas')
ds_tasmaxhist, ds_tasmaxhist_df = _chunk_and_flatten(ds_tasmaxhist, 'tasmax')
ds_tasminhist, ds_tasminhist_df = _chunk_and_flatten(ds_tasminhist, 'tasmin')

In [8]:
def optimize_memory(df):
    """Return ``df`` with a datetime ``time`` column and downcast float columns.

    The input frame is left untouched: the work is done on a shallow copy, so
    re-running a cell that calls this function always starts from the same
    input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. A ``time`` column is optional; when present it is
        converted with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, where every float column has been
        passed through ``pd.to_numeric(..., downcast='float')``.
    """
    # Shallow copy: no column data is duplicated up front, and the column
    # assignments below replace columns on the copy only.
    optimized = df.copy(deep=False)

    # Frames without a time axis are still valid input; skip the conversion
    # instead of failing with KeyError.
    if 'time' in optimized.columns:
        optimized['time'] = pd.to_datetime(optimized['time'])

    for col in optimized.select_dtypes(include=['float']).columns:
        optimized[col] = pd.to_numeric(optimized[col], downcast='float')
    return optimized

# Run each frame through optimize_memory and rebind its name to the result.
# The calls are kept sequential so that only one frame is processed at a time.
ds_prhist_df = optimize_memory(ds_prhist_df)
ds_tashist_df = optimize_memory(ds_tashist_df)
ds_tasmaxhist_df = optimize_memory(ds_tasmaxhist_df)
ds_tasminhist_df = optimize_memory(ds_tasminhist_df)

In [9]:
# Remove the 'height' column from the tas, tasmax and tasmin frames; the pr
# frame is not touched here. After this the frames are merged on
# time/lat/lon only.
ds_tashist_df = ds_tashist_df.drop(columns=['height'])
ds_tasmaxhist_df = ds_tasmaxhist_df.drop(columns=['height'])
ds_tasminhist_df = ds_tasminhist_df.drop(columns=['height'])

In [10]:
# Sanity check: print the four converted frames one after another.
print(
    ds_prhist_df,
    ds_tashist_df,
    ds_tasmaxhist_df,
    ds_tasminhist_df,
    sep='\n',
)

                        time     lat       lon        pr
0        1850-01-16 12:00:00 -89.375    0.9375  0.000003
1        1850-01-16 12:00:00 -89.375    2.8125  0.000002
2        1850-01-16 12:00:00 -89.375    4.6875  0.000002
3        1850-01-16 12:00:00 -89.375    6.5625  0.000002
4        1850-01-16 12:00:00 -89.375    8.4375  0.000002
...                      ...     ...       ...       ...
54743035 2014-12-16 12:00:00  89.375  351.5625  0.000004
54743036 2014-12-16 12:00:00  89.375  353.4375  0.000004
54743037 2014-12-16 12:00:00  89.375  355.3125  0.000004
54743038 2014-12-16 12:00:00  89.375  357.1875  0.000004
54743039 2014-12-16 12:00:00  89.375  359.0625  0.000004

[54743040 rows x 4 columns]
                        time     lat       lon         tas
0        1850-01-16 12:00:00 -89.375    0.9375  252.684326
1        1850-01-16 12:00:00 -89.375    2.8125  252.653320
2        1850-01-16 12:00:00 -89.375    4.6875  252.612793
3        1850-01-16 12:00:00 -89.375    6.5625  252

In [11]:
def merge_dataframes(dfs, on_columns, how='inner'):
    """Merge a sequence of DataFrames left to right on shared key columns.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        Frames to combine. The first frame is the starting point and every
        following frame is merged onto the running result, in order.
    on_columns : list of str
        Key columns present in every frame.
    how : str, optional
        Join type forwarded to ``DataFrame.merge`` for every step. The
        default ``'inner'`` keeps only key combinations found in all frames.

    Returns
    -------
    pandas.DataFrame
        The merged frame. When ``dfs`` holds a single frame, that same frame
        object is returned unchanged.

    Raises
    ------
    IndexError
        If ``dfs`` is empty.
    """
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, on=on_columns, how=how)
    return merged_df

# Frames to combine, listed in the order in which they are merged.
dataframes = [
    ds_prhist_df,
    ds_tashist_df,
    ds_tasmaxhist_df,
    ds_tasminhist_df,
]

# Join all frames on the key columns they share.
merged_df = merge_dataframes(dataframes, on_columns=['time', 'lat', 'lon'])

In [12]:
# Count missing values per column of the merged frame.
merged_df.isna().sum()

time      0
lat       0
lon       0
pr        0
tas       0
tasmax    0
tasmin    0
dtype: int64

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder

In [21]:
# Display the merged frame.
# NOTE(review): the saved output of this cell already lists year/month/day
# columns, which are only created by a cell further down — the cells were run
# out of order. Re-run the notebook top to bottom before relying on this view.
merged_df

Unnamed: 0,time,lat,lon,pr,tas,tasmax,tasmin,year,month,day
0,1850-01-16 12:00:00,-89.375,0.9375,0.000003,252.684326,253.631134,251.654175,1850,1,16
1,1850-01-16 12:00:00,-89.375,2.8125,0.000002,252.653320,253.584991,251.611328,1850,1,16
2,1850-01-16 12:00:00,-89.375,4.6875,0.000002,252.612793,253.563965,251.564606,1850,1,16
3,1850-01-16 12:00:00,-89.375,6.5625,0.000002,252.603760,253.545990,251.545807,1850,1,16
4,1850-01-16 12:00:00,-89.375,8.4375,0.000002,252.584961,253.539780,251.542648,1850,1,16
...,...,...,...,...,...,...,...,...,...,...
54743035,2014-12-16 12:00:00,89.375,351.5625,0.000004,240.812256,243.706772,237.623932,2014,12,16
54743036,2014-12-16 12:00:00,89.375,353.4375,0.000004,240.815674,243.643356,237.621887,2014,12,16
54743037,2014-12-16 12:00:00,89.375,355.3125,0.000004,240.784912,243.617371,237.600327,2014,12,16
54743038,2014-12-16 12:00:00,89.375,357.1875,0.000004,240.790771,243.611511,237.623184,2014,12,16


In [24]:
# Make sure 'time' is a datetime column, then split it into separate
# year / month / day columns. The columns are added to merged_df in place.
merged_df['time'] = pd.to_datetime(merged_df['time'])
merged_df['year'] = merged_df['time'].dt.year
merged_df['month'] = merged_df['time'].dt.month
merged_df['day'] = merged_df['time'].dt.day


In [26]:
# Feature matrix: the date parts plus the grid coordinates.
X = merged_df.loc[:, ['year', 'month', 'day', 'lat', 'lon']]
# Target matrix: the four climate variables.
y = merged_df.loc[:, ['pr', 'tas', 'tasmin', 'tasmax']]

In [28]:
# Hold out the last TEST_YEARS calendar years of the data for testing.
# The cutoff is derived from the data rather than fixed: the input files stop
# at 2014-12, so a cutoff of 2015 falls past the end of the series and leaves
# test_data with zero rows.
TEST_YEARS = 10
split_year = int(merged_df['year'].max()) - TEST_YEARS + 1

train_data = merged_df[merged_df['year'] < split_year]
test_data = merged_df[merged_df['year'] >= split_year]

# Both partitions must contain rows, otherwise fitting or predicting on them
# cannot work.
assert len(train_data) > 0 and len(test_data) > 0, 'train/test split left a partition empty'

In [30]:
# Feature and target column names, defined once so that the train and test
# selections cannot drift apart.
feature_cols = ['year', 'month', 'day', 'lat', 'lon']
target_cols = ['pr', 'tas', 'tasmin', 'tasmax']

X_train = train_data[feature_cols]
y_train = train_data[target_cols]
X_test = test_data[feature_cols]
# Held-out targets: required to score predictions made from X_test.
y_test = test_data[target_cols]

In [36]:
# Depth-limited regression tree; the fixed random_state keeps repeated fits
# reproducible.
model = DecisionTreeRegressor(
    max_depth=10,
    random_state=42,
)

In [None]:
# Fit the tree on the training split. y_train holds four target columns
# (pr, tas, tasmin, tasmax), so a single model is fitted to all of them.
# NOTE(review): the printed data shows pr around 1e-6 and the temperatures
# around 250, so the targets are on very different scales — confirm that pr is
# not effectively ignored by the fit.
model.fit(X_train, y_train)

In [None]:
# Predict the target columns for the rows in X_test.
# NOTE(review): this needs X_test to contain at least one row — check the year
# cutoff used for the train/test split against the time range of the data.
predictions = model.predict(X_test)