# data_preprocessing
s1. surface temperature  
a) Calculate the surface temperature anomaly.
The anomaly is achieved by subtracting the annual cycle from the t2m data. 
Annual cycle: average of the same month of each year.

In [50]:
# load data
%matplotlib inline

import numpy as np
from netCDF4 import Dataset
import matplotlib.pyplot as plt
import datetime

ncf = Dataset('./data/ERA_interim_t2m_197901_201810.nc')
print(ncf.variables.keys())
t2m = ncf.variables['t2m'][:,:,:]
time = ncf.variables['time'][:]
time_units = ncf.variables['time'].units
lat = ncf.variables['latitude'][:]
lon = ncf.variables['longitude'][:]
# print(ncf)
print('============================')
# print(ncf.variables['latitude'])
# print(ncf.variables['longitude'])
print(ncf.variables['time'])
# print(ncf.variables['t2m'])
print('============================')
ncf.close()

dict_keys(['longitude', 'latitude', 'time', 't2m'])
<class 'netCDF4._netCDF4.Variable'>
int32 time(time)
    units: hours since 1900-01-01 00:00:00.0
    long_name: time
    calendar: gregorian
unlimited dimensions: time
current shape = (478,)
filling on, default _FillValue of -2147483647 used


In [51]:
# calculate the anomaly by removing the annual_cycle
def remove_annual_cycle(var):
    '''
    This function is to remove the annual cycle of the input variable
    monthly data are needed.
    
    Paras:
    var :: 3-D data with the 'time' as the first axis.
    
    Return:
    anomaly of the input variable in the same format of the input
    '''
    anomaly              =  np.zeros(var.shape)
    for ii in range(0,12): 
        temp                 =  var[ii::12,:,:]
        annual_cycle         =  np.mean(temp,axis = 0)
        anomaly[ii::12,:,:]      =  var[ii::12,:,:] - annual_cycle
    return anomaly

t2m_ano = remove_annual_cycle(t2m)
print('the shape of the anomaly of the t2m is ', t2m_ano.shape)

the shape of the anomaly of the t2m is  (478, 181, 360)


b) Extract the DJF

In [53]:
def extract_DJF(var,time,time_units):
    '''
    This function is to extract specific months from the input varibale.
    
    Paras:
    var        :: 3-D data with the 'time' as the first axis.
    time       :: the corresponding time coordinate of the variable.
    time_units :: the units of time in the format of gregorian.
    
    Return:
    variables with the extracted months
    '''
    start_year = int(time_units[-21:-17])
    start_month = int(time_units[-16:-14])
    start_date = int(time_units[-13:-11])
    intervals = time_units.split()[0]
    
    yyyymm		=	np.zeros(len(time), dtype=np.int32)
    start_time	=	datetime.datetime(start_year,start_month,start_date)
    
    for i in range(len(time)): 
#         it can be changed into command lines.
        if intervals == 'hours':    
            time_temp			=	start_time + datetime.timedelta(hours=int(time[i]))
        elif intervals == 'days':
            time_temp			=	start_time + datetime.timedelta(days=int(time[i]))
        else:
            print('ERROR: the intervals of the time units can only be hours or days')
                
        yyyymm[i]			=	int(time_temp.year * 100 + time_temp.month)

    mm				=	yyyymm%100
    #print(np.min(yyyymm_all),np.max(yyyymm_all))

    D_id	=	mm	==	12
    J_id	=	mm	==	1
    F_id	=	mm	==	2
    var_DJF	=	var[D_id|J_id|F_id,:,:]
    yyyymm_DJF	= yyyymm[D_id|J_id|F_id]
    
    return var_DJF, yyyymm_DJF

t2m_DJF, time_DJF = extract_DJF(t2m_ano,time,time_units)
print(t2m_DJF.shape)
print(t2m_DJF[1,:,:])

(119, 181, 360)
[[-7.23873684 -7.23873684 -7.23873684 ... -7.23873684 -7.23873684
  -7.23873684]
 [-6.9642131  -6.9735175  -6.98501633 ... -6.93261324 -6.94240042
  -6.95433814]
 [-6.75201129 -6.78308448 -6.81297268 ... -6.65681673 -6.68907491
  -6.71966533]
 ...
 [ 1.04178578  1.03730914  1.0327886  ...  1.04139078  1.04261967
   1.04222467]
 [ 1.01435535  1.00987871  1.00584095 ...  1.01246814  1.01409202
   1.01255592]
 [ 0.80965851  0.80965851  0.80965851 ...  0.80965851  0.80965851
   0.80965851]]


c) Calculate the average values of South-east of CHINA (20º-40ºN, 100-125ºE)

In [56]:

def area_mean(var,lat,lon,east,west,south,north):
    '''
    This function is to calculate the average mean of a rectangle area.
    
    Params:
    var                     ::  3-D data with [time, lat, lon]
    lat,lon                 ::  the coordinate of the variable
    east,west,south,north   ::  The four boundry of a rectangle area in the unit of degree.
    
    Return:
    The average values of the rectangle area in the sequence of time.

    '''
    
    latS	=	lat >= south
    latN	=	lat <= north
    lonW	=	lon >= west
    lonE	=	lon <= east
    
    lat_box	=	lat[latS&latN]
    lon_box	=	lon[lonE&lonW]

    var_box_temp	=	var[:,latS&latN,:]
    var_box	=	var_box_temp[:,:,lonE&lonW]
    
    print(var_box.shape)

    # First we need to convert the latitudes to radians
    latr = np.deg2rad(lat_box)
    # Use the cosine of the converted latitudes as weights for the average
    weights = np.cos(lat_box)
    # Assuming the shape of your data array is (nTimes, nLats, nLons)
    # First find the zonal mean SST by averaging along the latitude circles
    
    var_ave_zonal = var_box.mean(axis=2)
    print(var_ave_zonal.shape)

    # Then take the weighted average of those using the weights we calculated earlier
    var_ave = np.average(var_ave_zonal, axis=1, weights=weights)
    return var_ave

t2m_ano_DJF_SouthEast = area_mean(t2m_DJF,lat,lon,125,100,20,40)
print(t2m_ano_DJF_SouthEast)

(119, 21, 26)
(119, 21)
[ 1.14526     1.12976802  1.70585289  2.36752227 -1.51482148  1.98880564
 -0.51625641  3.73475546 -2.03200486 -0.65745135 -2.88841777 -2.86634823
 -3.40833591 -1.41740303 -3.47647357 -2.4862791  -3.17732235 -0.1124001
 -3.29863836 -0.76008265  2.43653628 -1.94878238 -0.20278842 -1.08072219
  0.24958378 -0.25336719 -3.89092861  2.88591138  0.62885301 -3.05984139
 -1.49832911 -4.38704583 -3.64754912 -1.67525232 -2.89899262  0.37054203
  1.33235369  0.08370239  0.31693246 -0.76630047  0.00889135  0.17191475
  0.52275147 -0.23680192 -0.11717197 -0.65302316  2.60783844  1.53863992
  0.5510473  -1.82306392 -0.42787703  1.18403045 -1.48562395 -0.73116412
 -0.51294286 -1.30637946  0.54353463  3.90047851  2.57819974  0.54594276
  0.77834694  2.02245104 -3.20175999  3.86637329  0.2460524  -0.16397211
 -1.09763313 -1.50777328 -0.28174013 -0.40576142 -4.36317136  2.97348751
 -0.56079321  0.8272811   2.70671365  0.38639063 -1.06445569 -2.23535244
 -1.78180568  2.81641492  0.