# Zscore 数据标准化/归一化 GLEAM ET
> [知乎详解](https://zhuanlan.zhihu.com/p/69074703)

In [1]:
import cftime
import netCDF4 as nc
import numpy as np
from sklearn import preprocessing
import xarray as xr

import warnings
warnings.filterwarnings("ignore")

In [None]:
# %%
def read_nc():
    """Load the GLEAM monthly ``E`` file into module-level globals.

    Sets the globals ``time``, ``t``, ``e``, ``lat`` and ``lon``. The first
    and the last 12 time steps of ``time`` and ``E`` are dropped.
    """
    global lat, lon, t, e, time
    inpath = r'E:/Gleamv3.6a/v3.6a/North East Asia region1/region1_E_1980-2021_GLEAM_v3.6a_MO.nc.nc'
    with nc.Dataset(inpath, mode='r') as ds:
        # Handy when inspecting the file interactively:
        #   print(ds.variables.keys())
        #   print(ds.variables['time'])
        #   print(ds.variables['lat'])
        #   print(ds.variables['lon'])
        #   print(ds.variables['E'])
        variables = ds.variables

        # Trim 12 steps from each end of the record.
        # NOTE(review): the file name advertises 1980-2021; if the steps are
        # monthly starting 1980-01 this leaves 1981-01..2020-12 (the original
        # inline note said 1982.1) - TODO confirm.
        time = variables['time'][12:-12]
        t = nc.num2date(time, 'days since 1980-01-31 00:00:00').data

        e = variables['E'][12:-12, :, :]
        lat = variables['lat'][:]
        lon = variables['lon'][:]

### 480 to 40年每月数据
1. Func: e_month   将(480, 60, 140)按顺序存放的40年数据变为逐月数据，e_mn:(40, 60, 140);
2. Func: e_month_reverse  将(40, 60, 140)的数据循环放入，得到data_z:(480, 60, 140);

In [None]:
# %%
def e_month(mn, data):
    """Collect one calendar month from each of 40 consecutive years.

    Parameters
    ----------
    mn : int
        1-based calendar month; the first slice read is ``data[mn - 1]``.
    data : array-like
        Series indexed by time step on its first axis. Steps ``mn - 1``,
        ``mn - 1 + 12``, ... (40 of them) are read.

    Returns
    -------
    numpy.ndarray
        The 40 selected slices stacked along a new leading axis.
    """
    first = mn - 1
    yearly_slices = [data[first + 12 * yr, :, :] for yr in range(40)]
    return np.array(yearly_slices)


def e_month_reverse(mn, data, data_z):
    """Scatter 40 per-year slices of one calendar month back into a series.

    Parameters
    ----------
    mn : int
        1-based calendar month; slices land at steps ``mn - 1``,
        ``mn - 1 + 12``, ... of ``data_z``.
    data : array-like
        Per-year slices, indexed by year (0-39) on the first axis.
    data_z : array-like
        Destination series; modified in place.

    Returns
    -------
    The same ``data_z`` object that was passed in.
    """
    start = mn - 1
    target_steps = range(start, start + 12 * 40, 12)
    for yr, step in enumerate(target_steps):
        data_z[step, :, :] = data[yr, :, :]
    return data_z

### 各变量说明
1. e_mn: (40, 60, 140) 循环得到40年每月数据；
2. e_mn_ave: (60, 140)月平均；
3. e_all: (12, 60, 140) 所有月份月平均；
4. e_mn_z: (40, 60, 140) e_mn 的归一化值；
5. data_z: (480, 60, 140) 原数据e的归一化；

In [None]:
# %%
# Populate the module-level arrays (e, lat, lon, time, t) from the GLEAM file.
read_nc()
# %% Multi-year mean for each calendar month
month_means = []
for mn in range(1, 13):
    # Slices of this calendar month across the 40 years, then their
    # NaN-ignoring mean over the year axis.
    e_mn = e_month(mn, e)
    e_mn_ave = np.nanmean(e_mn, axis=0)
    month_means.append(e_mn_ave)
# One mean field per calendar month, stacked on a new leading axis.
e_all_mn = np.array(month_means)
# CreatNC(e_all_mn)

In [None]:
# %% Z-score values
# For every calendar month, standardise each pixel's 40-value series and
# write the result back into the 480-step layout of the original data.
e_mn_z = np.zeros((40, 60, 140))
data_z = np.zeros((480, 60, 140))
for mn in range(1, 13):
    e_mn = e_month(mn, e)
    for r in range(60):
        for c in range(140):
            e_mn_z[:, r, c] = preprocessing.scale(e_mn[:, r, c])
    # Scatter once per month, after every pixel has been standardised.
    # Doing this inside the pixel loops repeated the full copy 60 * 140 times
    # per month and produced the same final array.
    data_z = e_month_reverse(mn, e_mn_z, data_z)

### CreatNC

In [None]:
# %%
def CreatNC(data):
    """Write the 12-step monthly-mean ``E`` field to ``region1_E_month_ave.nc``.

    Parameters
    ----------
    data : array-like
        Values assigned to the ``E`` variable, which is dimensioned
        ``(time=12, lat, lon)``.

    The module-level ``lat`` and ``lon`` arrays give the sizes of the lat/lon
    dimensions and are written as coordinate variables.
    """
    out_path = r"E:/Gleamv3.6a/v3.6a/North East Asia region1/region1_E_month_ave.nc"

    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leave the output file open.
    with nc.Dataset(out_path, 'w', format='NETCDF4') as new_NC:
        new_NC.createDimension('time', 12)
        new_NC.createDimension('lat', len(lat))
        new_NC.createDimension('lon', len(lon))

        var = new_NC.createVariable('E', 'f', ("time", "lat", "lon"))
        new_NC.createVariable('lat', 'f', ("lat",))
        new_NC.createVariable('lon', 'f', ("lon",))

        new_NC.variables['E'][:] = data
        new_NC.variables['lat'][:] = lat
        new_NC.variables['lon'][:] = lon

        var.description = "1981.1-2020.12 E (actual e) 每月实际蒸散发总和平均值 mm/month"


def CreatNC2(data):
    """Write the 480-step ``E`` field to ``region1_E_month_Zscore.nc``.

    Parameters
    ----------
    data : array-like
        Values assigned to the ``E`` variable, which is dimensioned
        ``(time=480, lat, lon)``.

    The module-level ``time``, ``lat`` and ``lon`` arrays are written as
    coordinate variables; ``lat``/``lon`` also give the dimension sizes.
    """
    out_path = r"E:/Gleamv3.6a/v3.6a/North East Asia region1/region1_E_month_Zscore.nc"

    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leave the output file open.
    with nc.Dataset(out_path, 'w', format='NETCDF4') as new_NC:
        new_NC.createDimension('time', 480)
        new_NC.createDimension('lat', len(lat))
        new_NC.createDimension('lon', len(lon))

        var = new_NC.createVariable('E', 'f', ("time", "lat", "lon"))
        new_NC.createVariable('time', 'f', ("time",))
        new_NC.createVariable('lat', 'f', ("lat",))
        new_NC.createVariable('lon', 'f', ("lon",))

        new_NC.variables['E'][:] = data
        new_NC.variables['time'][:] = time
        new_NC.variables['lat'][:] = lat
        new_NC.variables['lon'][:] = lon

        # NOTE(review): this description is the same text the monthly-mean
        # writer uses, although the output file name says Zscore - confirm the
        # wording is intended before relying on it.
        var.description = "1981.1-2020.12 E (actual e) 每月实际蒸散发总和平均值 mm/month"
        # The time reference is stored as an attribute of ``E`` (not as a
        # ``units`` attribute on the ``time`` variable).
        var.time = "days since 1980-01-31 00:00:00"

# 原数据-> 月年 OR 季节年 OR 标准化值 OR 异常值 CRU TMP

### 读取数据

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 20 11:40:34 2022

@author: MaYutong
"""

import netCDF4 as nc
import cftime
import xarray as xr
import numpy as np
from sklearn import preprocessing

import warnings
warnings.filterwarnings("ignore")

def read_data_nc1(a, b):
    """Read one CRU TS 4.06 ``tmp`` file whose name carries years ``a`` and ``b``.

    Parameters
    ----------
    a, b : int
        Interpolated into the file name ``cru_ts4.06.{a}.{b}.tmp.dat.nc``.

    Returns
    -------
    tuple
        ``(time, tmp)`` as read from the file's ``time`` and ``tmp`` variables.

    Side effect: the module-level ``lat`` and ``lon`` are overwritten with the
    file's coordinate arrays.
    """
    global lat, lon
    inpath = (f"E:/CRU/TMP_DATA/cru_ts4.06.{a}.{b}.tmp.dat.nc")
    with nc.Dataset(inpath, mode='r') as ds:
        # Handy when inspecting the file interactively:
        #   print(ds.variables.keys())
        #   print(ds.variables['pre'])
        #   print(ds.variables['time'])
        #   print(ds.variables['stn'])
        fields = ds.variables

        lat = fields['lat'][:]
        lon = fields['lon'][:]
        time = fields['time'][:]
        # Calendar dates for the time axis; computed but neither returned nor
        # stored globally.
        t = nc.num2date(time, 'days since 1900-01-01 00:00:0.0').data
        tmp = fields['tmp'][:]

    return time, tmp

# %%
def region1(data):
    """Crop a ``(t, y, x)`` field to the labels 40-55 on ``y`` and 100-135 on ``x``.

    The coordinate labels come from the module-level ``time``, ``lat`` and
    ``lon`` arrays, so those must already match ``data``'s three axes.

    Returns the cropped ``xarray.DataArray``.
    """
    labelled = xr.DataArray(
        data,
        dims=['t', 'y', 'x'],
        coords=[time, lat, lon],
    )
    # Label-based selection: every time step, y in 40..55, x in 100..135.
    return labelled.loc[:, 40:55, 100:135]

#%%
# Start / end years of the four decade files to concatenate.
yr1 = [1981, 1991, 2001, 2011]
yr2 = [1990, 2000, 2010, 2020]
for a, b in zip(yr1, yr2):
    t, _ = read_data_nc1(a, b)
    if a != 1981:
        # Later files: append along the time axis.
        time = np.hstack((time, t))
        tmp = np.vstack((tmp, _))
    else:
        # First file seeds the accumulators.
        time = t
        tmp = _
        

#%%
'''
pre_rg1 = region1(pre)
lat_rg1 = pre_rg1.y
lon_rg1 = pre_rg1.x
'''



### 原始数据连接 -> (480, 360, 720)

In [None]:
#%%生成新的nc文件
def CreatNC(data):
    """Write the concatenated CRU ``tmp`` series to ``TMP_CRU_ORIGINAL_81_20.nc``.

    Parameters
    ----------
    data : array-like
        Values assigned to the ``tmp`` variable, which is dimensioned
        ``(time=480, lat=360, lon=720)``.

    The module-level ``time``, ``lat`` and ``lon`` arrays are written as
    coordinate variables.
    """
    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leave the output file open.
    with nc.Dataset(r"E:/CRU/TMP_DATA/TMP_CRU_ORIGINAL_81_20.nc", 'w',
                    format='NETCDF4') as new_NC:
        new_NC.createDimension('time', 480)
        new_NC.createDimension('lat', 360)
        new_NC.createDimension('lon', 720)

        var = new_NC.createVariable('tmp', 'f', ("time", "lat", "lon"))
        new_NC.createVariable('time', 'f', ("time",))
        new_NC.createVariable('lat', 'f', ("lat",))
        new_NC.createVariable('lon', 'f', ("lon",))

        new_NC.variables['tmp'][:] = data
        new_NC.variables['time'][:] = time
        new_NC.variables['lat'][:] = lat
        new_NC.variables['lon'][:] = lon

        # var.lat_range = "[40, 55], 30, resolution: 0.5, bounds: [40.25, 54.75]"
        # var.lon_range = "[100, 135], 70, resolution: 0.5, bounds: [100.25, 134.75]"
        # Plain descriptive string attributes on ``tmp``.
        var.Fillvalues = "9.96921e+36"
        var.time = "1981.1-2020.12"

# CreatNC(tmp)

### 480(输入data) -> 月 年
> (480, 360, 720) -> (12, 40, 360, 720)

In [None]:
#%%生成新的nc文件
def CreatNC2(data):
    """Write the month/year-grouped CRU ``tmp`` array to ``TMP_CRU_MONTH_81_20.nc``.

    Parameters
    ----------
    data : array-like
        Values assigned to the ``tmp`` variable, which is dimensioned
        ``(month=12, year=40, lat=360, lon=720)``.

    The module-level ``lat`` and ``lon`` arrays are written as coordinate
    variables; no ``month`` or ``year`` coordinate variable is created.
    """
    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leave the output file open.
    with nc.Dataset(r"E:/CRU/TMP_DATA/TMP_CRU_MONTH_81_20.nc", 'w',
                    format='NETCDF4') as new_NC:
        new_NC.createDimension('month', 12)
        new_NC.createDimension('year', 40)
        new_NC.createDimension('lat', 360)
        new_NC.createDimension('lon', 720)

        var = new_NC.createVariable('tmp', 'f', ("month", "year", "lat", "lon"))
        new_NC.createVariable('lat', 'f', ("lat",))
        new_NC.createVariable('lon', 'f', ("lon",))

        new_NC.variables['tmp'][:] = data
        new_NC.variables['lat'][:] = lat
        new_NC.variables['lon'][:] = lon

        # var.lat_range = "[40, 55], 30, resolution: 0.5, bounds: [40.25, 54.75]"
        # var.lon_range = "[100, 135], 70, resolution: 0.5, bounds: [100.25, 134.75]"
        # Plain descriptive string attributes on ``tmp``.
        var.Fillvalues = "9.96921e+36"
        var.time = "1981.1-2020.12"
    
#%% 480(输入data) -> 月 年
def mn_yr(data):
    """Regroup a 480-step monthly series into month-major order.

    Parameters
    ----------
    data : array-like
        Series indexed by time step on its first axis; steps 0..479 are read.

    Returns
    -------
    numpy.ndarray
        Array whose first two axes are (12, 40): element ``[m, y]`` is
        ``data[m + 12 * y]``.
    """
    by_month = [
        [data[month + 12 * year] for year in range(40)]
        for month in range(12)
    ]
    return np.array(by_month)

# Regroup the concatenated CRU series so that axis 0 is the calendar month
# and axis 1 is the year.
tmp_mn = mn_yr(tmp)

# CreatNC2(tmp_mn)



### 480 -> 月 年（输入data）-> 季节 年
> (12, 40, 360, 720)->(4, 40, 360, 720) 顺序：春夏秋冬 #####第一版计算冬季有问题，冬季应掐头去尾

In [None]:
# %%生成新的nc文件
def CreatNC3(data):
    """Write the seasonal CRU ``tmp`` array to ``TMP_CRU_SEASON_81_20.nc``.

    Parameters
    ----------
    data : array-like
        Values assigned to the ``tmp`` variable, which is dimensioned
        ``(month=4, year=40, lat=360, lon=720)``.

    The module-level ``lat`` and ``lon`` arrays are written as coordinate
    variables.
    """
    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leave the output file open.
    with nc.Dataset(r"E:/CRU/TMP_DATA/TMP_CRU_SEASON_81_20.nc", 'w',
                    format='NETCDF4') as new_NC:
        # The dimension is named 'month' but has length 4; the ``sort``
        # attribute below lists the four seasons it indexes.
        new_NC.createDimension('month', 4)
        new_NC.createDimension('year', 40)
        new_NC.createDimension('lat', 360)
        new_NC.createDimension('lon', 720)

        var = new_NC.createVariable('tmp', 'f', ("month", "year", "lat", "lon"))
        new_NC.createVariable('lat', 'f', ("lat",))
        new_NC.createVariable('lon', 'f', ("lon",))

        new_NC.variables['tmp'][:] = data
        new_NC.variables['lat'][:] = lat
        new_NC.variables['lon'][:] = lon

        # var.lat_range = "[40, 55], 30, resolution: 0.5, bounds: [40.25, 54.75]"
        # var.lon_range = "[100, 135], 70, resolution: 0.5, bounds: [100.25, 134.75]"
        # Plain descriptive string attributes on ``tmp``.
        var.Fillvalues = "9.96921e+36"
        var.time = "1981.1-2020.12"
        var.sort = "春 夏 秋 冬"

# %% 480 -> 月 年（输入data）-> 季节 年

'''冬季计算有问题版
def season_yr(data):
    tmp_s = np.vstack((data[2:, :], data[:2, :]))
    tmp_sea = []
    for mn1, mn2 in zip(range(0, 12, 3), range(3, 15, 3)):
        tmp_sea.append(tmp_s[mn1:mn2, :])

    tmp_sea = np.array(tmp_sea)
    tmp_sea = tmp_sea.mean(axis=1)

    return tmp_sea
'''

######### 修改版


# %%
# NOTE(review): no live definition of season_yr is visible in this file; the
# only one shown sits inside the disabled string literal above (marked as the
# version with the winter problem). This line raises NameError unless the
# revised season_yr is defined elsewhere - confirm.
tmp_sea = season_yr(tmp_mn)
# CreatNC3(tmp_sea)

### 480 -> Zscore归一化值
> 可输入年月值（此程序） 或480序列值(Gleam 程序)

> reverse非常慢，逐月保存  

> preprocessing.scale() 计算均值和标准差时会忽略 NaN，输出中对应位置仍为 NaN

In [None]:
#%%生成新的nc文件
def CreatNC4(data, mn):
    """Write one calendar month's Z-score field to its own NetCDF file.

    Parameters
    ----------
    data : array-like
        Values assigned to the ``tmp`` variable, which is dimensioned
        ``(year=40, lat=360, lon=720)``.
    mn : int
        Month label interpolated into the output file name
        (``TMP_CRU_Zscore_81_20_month{mn}.nc``).

    The module-level ``lat`` and ``lon`` arrays are written as coordinate
    variables.
    """
    # The file name is built from the ``mn`` argument. The previous code
    # interpolated an undefined name ``i`` here, which either raised NameError
    # or silently picked up an unrelated global.
    out_path = rf"E:/CRU/TMP_DATA/Zscore/TMP_CRU_Zscore_81_20_month{mn}.nc"

    # The context manager closes the file even when one of the writes raises,
    # so a failed run does not leave the output file open.
    with nc.Dataset(out_path, 'w', format='NETCDF4') as new_NC:
        new_NC.createDimension('year', 40)
        new_NC.createDimension('lat', 360)
        new_NC.createDimension('lon', 720)

        var = new_NC.createVariable('tmp', 'f', ("year", "lat", "lon"))
        new_NC.createVariable('lat', 'f', ("lat",))
        new_NC.createVariable('lon', 'f', ("lon",))

        new_NC.variables['tmp'][:] = data
        new_NC.variables['lat'][:] = lat
        new_NC.variables['lon'][:] = lon

        # var.lat_range = "[40, 55], 30, resolution: 0.5, bounds: [40.25, 54.75]"
        # var.lon_range = "[100, 135], 70, resolution: 0.5, bounds: [100.25, 134.75]"
        # Plain descriptive string attributes on ``tmp``.
        var.Fillvalues = "9.96921e+36"
        var.time = "1981.1-2020.12 'days since 1900-01-01 00:00:0.0'"
    
#%% 480 -> 归一化值
def tmp_month_reverse(mn, data, data_z):
    """Scatter 40 per-year slices of one calendar month back into a series.

    Parameters
    ----------
    mn : int
        1-based calendar month; slices land at steps ``mn - 1``,
        ``mn - 1 + 12``, ... of ``data_z``.
    data : array-like
        Per-year slices, indexed by year (0-39) on the first axis.
    data_z : array-like
        Destination series; modified in place.

    Returns
    -------
    The same ``data_z`` object that was passed in.
    """
    start = mn - 1
    target_steps = range(start, start + 12 * 40, 12)
    for yr, step in enumerate(target_steps):
        data_z[step, :, :] = data[yr, :, :]
    return data_z

def Zscore(data):
    """Standardise each pixel's 40-year series per calendar month and save it.

    Parameters
    ----------
    data : array-like
        Indexed as ``data[month, year, row, col]`` with 0-based ``month``;
        the caller passes the output of ``mn_yr`` (12 months x 40 years on a
        360 x 720 grid).

    For each month 1..12 the per-pixel series is passed through
    ``preprocessing.scale`` and the resulting (40, 360, 720) field is written
    with ``CreatNC4(field, month)``. Returns ``None``.
    """
    # Reused for every month; each pixel is fully overwritten before saving.
    tmp_mn_z = np.zeros((40, 360, 720))
    for mn in range(1, 13):
        for r in range(360):
            if r%30 == 0:
                print(f"columns {r} is done!")
            for c in range(720):
                # ``mn`` is the 1-based month used for the file name, while
                # axis 0 of ``data`` is 0-based. Indexing with ``mn`` skipped
                # January and ran past the end of the axis at mn == 12.
                tmp_mn_z[:, r, c] = preprocessing.scale(data[mn - 1, :, r, c])
        CreatNC4(tmp_mn_z, mn)
                
# Run the per-month standardisation on the month/year-grouped CRU array.
# Zscore has no return statement; its output goes through CreatNC4.
Zscore(tmp_mn)
#tmp_z = Zscore(tmp_mn)
#CreatNC4(data_z)

### 异常值，循环求