# Import Libraries

In [1]:
import netCDF4
import numpy as np
import pandas as pd
import xarray as xr
import datetime

# Loading netCDF files into local environment

## Load in netCDF files

In [2]:
## Open the two .nc files of each ensemble member (003 to 033): the "0_2" file and
## the "352_360" file (presumably the longitude ranges covered by each file).

## List filled with the opened Datasets, two per ensemble member, in the order
## [member 3 "0_2", member 3 "352_360", member 4 "0_2", member 4 "352_360", ...]
temp_datasets = []

## Ensemble numbers run from 3 to 33 inclusive
for ensemble_member in range(3, 34):
    ## File names start with the ensemble number zero-padded to three digits
    ensemble_member_str = str(ensemble_member).zfill(3)
    for file_suffix in ("0_2", "352_360"):
        ## Appending opened Dataset to the list
        temp_datasets.append(
            xr.open_dataset(ensemble_member_str + "_2006_2080_" + file_suffix + ".nc")
        )

## Combining the two Datasets per ensemble member and re-setting longitude coordinates

In [3]:
## temp_datasets is expected to hold the Datasets in pairs (two per ensemble member)
assert len(temp_datasets) % 2 == 0, "expected two Datasets per ensemble member"

## Re-setting longitude coordinates
## Every odd-indexed Dataset in temp_datasets has 360 subtracted from its lon coordinate.
## NOTE: this overwrites entries of temp_datasets, so re-running this cell without
## re-running the loading cell subtracts 360 a second time.
for i in range(1, len(temp_datasets), 2):
    temp_datasets[i] = temp_datasets[i].assign_coords(lon=temp_datasets[i].lon - 360)

## Combining together
## Full Datasets saved in a list: each even-indexed Dataset is combined with the
## odd-indexed Dataset that follows it
datasets = [
    temp_datasets[i].combine_first(temp_datasets[i + 1])
    for i in range(0, len(temp_datasets), 2)
]

## Convert to Pandas data frame

#### <font color=orange>Full simulation period (2006-2080)</font>

In [4]:
## One pandas data frame per combined Dataset, saved in a list
df = []

## Convert each Dataset to a pandas data frame while dropping rows with null values,
## and add the ensemble number as a new first column.
## Ensemble numbers start at 3, so the Dataset at position 0 is ensemble member 3.
for member_index, dataset in enumerate(datasets):
    member_frame = dataset.to_dataframe().dropna()
    member_frame.insert(loc=0, column='Ensemble_num', value=member_index + 3)
    df.append(member_frame)

In [5]:
##Make full data frame of all ensemble members and location data
## All per-member frames are concatenated in one call; concatenating inside a loop
## would re-copy the accumulated frame on every iteration.
df_full = pd.concat(df)

display(df_full)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Ensemble_num,TREFMXAV_U,FLNS,FSNS,PRECT,PRSN,QBOT,TREFHT,UBOT,VBOT
time,lat,lon,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2006-01-02 00:00:00,49.476440,-1.25,3,283.678986,78.309052,49.704235,1.521918e-08,2.044216e-17,0.005410,281.388367,5.109643,-0.435288
2006-01-02 00:00:00,49.476440,0.00,3,283.351013,60.172768,43.689503,1.322234e-08,1.171104e-17,0.005415,279.859039,4.190400,0.893759
2006-01-02 00:00:00,49.476440,1.25,3,282.187378,32.727840,28.623762,1.178051e-09,5.449920e-21,0.005343,278.851868,3.444478,1.583298
2006-01-02 00:00:00,50.418850,-5.00,3,284.443939,78.301247,44.980167,1.131585e-08,8.991746e-22,0.005035,283.077118,6.209209,-3.468436
2006-01-02 00:00:00,50.418850,-3.75,3,284.329865,82.156403,46.794003,8.339057e-09,1.546433e-20,0.005278,282.473907,5.486755,-2.518895
...,...,...,...,...,...,...,...,...,...,...,...,...
2080-12-31 00:00:00,57.958115,-6.25,33,281.492188,52.250000,5.492432,8.324969e-08,6.823153e-13,0.004242,280.000000,3.484375,-1.656250
2080-12-31 00:00:00,57.958115,-3.75,33,280.949799,27.625000,5.855469,8.928873e-08,2.512888e-08,0.004303,276.250000,2.156250,1.687500
2080-12-31 00:00:00,57.958115,-2.50,33,281.729858,59.250000,9.607666,8.084498e-08,1.265862e-13,0.004547,279.125000,3.687500,2.812500
2080-12-31 00:00:00,57.958115,-1.25,33,282.414215,75.000000,13.924561,5.770107e-08,2.130295e-15,0.004395,281.000000,6.125000,3.125000


The 'time' dtype isn't suitable; convert it to datetime format instead.

The dtype cannot be changed directly, so the data frame must first be saved to a CSV file and then re-loaded, which sets the 'time' dtype to object.

Then convert 'time' to datetime and save the result as a Parquet file (to save memory).

## Save to CSV

In [17]:
##Save to CSV
## NOTE(review): the save step is commented out, so this cell requires
## 'Final full dataset.csv' to already exist on disk.
#df_full.to_csv('Final full dataset.csv')
##Re-load in from CSV file
df_full = pd.read_csv('Final full dataset.csv')

print("time domain has now reset:")
## info() prints its report itself and returns None, so wrapping it in display()
## would only add a stray "None" to the output
df_full.info()

time domain has now reset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44126888 entries, 0 to 44126887
Data columns (total 13 columns):
 #   Column        Dtype  
---  ------        -----  
 0   time          object 
 1   lat           float64
 2   lon           float64
 3   Ensemble_num  int64  
 4   TREFMXAV_U    float64
 5   FLNS          float64
 6   FSNS          float64
 7   PRECT         float64
 8   PRSN          float64
 9   QBOT          float64
 10  TREFHT        float64
 11  UBOT          float64
 12  VBOT          float64
dtypes: float64(11), int64(1), object(1)
memory usage: 4.3+ GB


None

# Feature Engineering

## Convert 'time' to datetime domain

In [18]:
## Parse the 'time' column into datetime values and write it back over the same column
df_full['time'] = df_full['time'].pipe(pd.to_datetime)

## Extract 'Year' and 'Day_of_year' from 'time'

In [19]:
##Extracting year & day of year from the 'time' column
## The column is read through df_full['time'] directly instead of being bound to a
## variable named `datetime`, which would shadow the `datetime` module imported at
## the top of the notebook.
## NOTE(review): .dt.dayofyear follows the calendar pandas uses for datetime values;
## if the source data uses a calendar without leap days, day numbers after February
## in leap years would be offset by one -- confirm against the source files.
df_full['Year'] = df_full['time'].dt.year
df_full['Day_of_year'] = df_full['time'].dt.dayofyear

## Re-set index

In [20]:
## Rebuild the MultiIndex from the 'time', 'lat' and 'lon' columns
index_levels = ['time', 'lat', 'lon']
df_full = df_full.set_index(keys=index_levels)

### Re-order columns so that atmospheric variables are to the RHS

In [21]:
## Put the ensemble number, TREFMXAV_U and the two date-derived columns first,
## followed by the remaining variables on the right-hand side
leading_columns = ['Ensemble_num', 'TREFMXAV_U', 'Day_of_year', 'Year']
trailing_columns = ['FLNS', 'FSNS', 'PRECT', 'PRSN', 'QBOT', 'TREFHT', 'UBOT', 'VBOT']
df_full = df_full[leading_columns + trailing_columns]

## Save full data frame to PARQUET file

In [22]:
## Write the full data frame to a gzip-compressed Parquet file using the pyarrow engine
df_full.to_parquet(
    'Final full dataset.parquet',
    compression='gzip',
    engine='pyarrow',
)

In [2]:
## Read the saved Parquet file back in and show it, as a check on the full data frame
df_full = pd.read_parquet(
    'Final full dataset.parquet',
    engine='pyarrow',
)

df_full

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Ensemble_num,TREFMXAV_U,Day_of_year,Year,FLNS,FSNS,PRECT,PRSN,QBOT,TREFHT,UBOT,VBOT
time,lat,lon,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2006-01-02,49.476440,-1.25,3,283.67900,2,2006,78.309050,49.704235,1.521918e-08,2.044216e-17,0.005410,281.38837,5.109643,-0.435289
2006-01-02,49.476440,0.00,3,283.35100,2,2006,60.172768,43.689503,1.322234e-08,1.171104e-17,0.005415,279.85904,4.190400,0.893759
2006-01-02,49.476440,1.25,3,282.18738,2,2006,32.727840,28.623762,1.178051e-09,5.449920e-21,0.005343,278.85187,3.444478,1.583298
2006-01-02,50.418850,-5.00,3,284.44394,2,2006,78.301250,44.980167,1.131585e-08,8.991746e-22,0.005035,283.07712,6.209209,-3.468437
2006-01-02,50.418850,-3.75,3,284.32986,2,2006,82.156400,46.794003,8.339057e-09,1.546433e-20,0.005278,282.47390,5.486755,-2.518895
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2080-12-31,57.958115,-6.25,33,281.49220,366,2080,52.250000,5.492432,8.324969e-08,6.823153e-13,0.004242,280.00000,3.484375,-1.656250
2080-12-31,57.958115,-3.75,33,280.94980,366,2080,27.625000,5.855469,8.928873e-08,2.512888e-08,0.004303,276.25000,2.156250,1.687500
2080-12-31,57.958115,-2.50,33,281.72986,366,2080,59.250000,9.607666,8.084498e-08,1.265862e-13,0.004547,279.12500,3.687500,2.812500
2080-12-31,57.958115,-1.25,33,282.41420,366,2080,75.000000,13.924561,5.770107e-08,2.130295e-15,0.004395,281.00000,6.125000,3.125000
