# xarray入门
## 一、创建xarray对象DataArray
用numpy创建

In [8]:
import xarray as xr
import numpy as np
import pandas as pd
# Random 4x3 sample values: 4 rows along 'time', 3 columns along 'space'.
data = np.random.rand(4, 3)
times = pd.date_range(start='2000-01-01', periods=4)
locs = ['level', 'latitude', 'longitude']

# Name the two dimensions and attach one coordinate label list to each.
foo = xr.DataArray(
    data,
    dims=['time', 'space'],
    coords=[times, locs],
)
print(foo)

<xarray.DataArray (time: 4, space: 3)>
array([[0.28874046, 0.29358839, 0.4834316 ],
       [0.32894623, 0.98946003, 0.80528601],
       [0.6101863 , 0.04852265, 0.71790979],
       [0.56375212, 0.29326885, 0.83856733]])
Coordinates:
  * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 2000-01-04
  * space    (space) <U9 'level' 'latitude' 'longitude'


用pandas.DataFrame创建

In [10]:
# A small table: rows labelled by date strings, columns labelled 'lat'/'lon'.
df = pd.DataFrame(
    data={'lat': [0, 1], 'lon': [2, 3]},
    index=['2000-01-01', '2000-01-02'],
)
# The axis names show up as the dimension names of the DataArray built below.
df.columns.name = 'space'
df.index.name = 'time'
print(df)

# Wrap the DataFrame directly in a DataArray.
foo = xr.DataArray(df)
print(foo)

space       lat  lon
time                
2000-01-01    0    2
2000-01-02    1    3
<xarray.DataArray (time: 2, space: 2)>
array([[0, 2],
       [1, 3]], dtype=int64)
Coordinates:
  * time     (time) object '2000-01-01' '2000-01-02'
  * space    (space) object 'lat' 'lon'


(1)利用列表设置coords 以元组组成的列表形式设置

In [12]:
# Each (dimension name, labels) tuple names a dimension and supplies its
# coordinate labels, so no separate `dims` argument is passed here.
foo = xr.DataArray(
    data,
    coords=[
        ('time', times),
        ('space', locs),
    ],
)
print(foo)

<xarray.DataArray (time: 4, space: 3)>
array([[0.28874046, 0.29358839, 0.4834316 ],
       [0.32894623, 0.98946003, 0.80528601],
       [0.6101863 , 0.04852265, 0.71790979],
       [0.56375212, 0.29326885, 0.83856733]])
Coordinates:
  * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 2000-01-04
  * space    (space) <U9 'level' 'latitude' 'longitude'


(2)利用字典设置coords。

In [14]:
# Coordinates given as a mapping from dimension name to labels; the dimension
# order of `data` is passed explicitly through `dims`.
foo = xr.DataArray(
    data,
    dims=['time', 'space'],
    coords=dict(time=times, space=locs),
)
print(foo)

<xarray.DataArray (time: 4, space: 3)>
array([[0.28874046, 0.29358839, 0.4834316 ],
       [0.32894623, 0.98946003, 0.80528601],
       [0.6101863 , 0.04852265, 0.71790979],
       [0.56375212, 0.29326885, 0.83856733]])
Coordinates:
  * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 2000-01-04
  * space    (space) <U9 'level' 'latitude' 'longitude'


访问coords

代码|说明  
:---|:---  
foo.coords["time"]|根据坐标索引  
foo["time"]|根据坐标索引  
del foo["time"]|删除coords  
foo.values|访问数值  
foo.dims|返回维度  
foo.coords|返回坐标 
foo.name = 'foo'|给DataArray命名  
foo.attrs['units']='meters'|给DataArray添加属性  
foo_new = foo.rename('foo_new')|给DataArray重命名


## 二、数据集 Dataset
Dataset可以包含多个数据变量（每个变量都是一个DataArray），而DataArray只包含一个数据变量

In [16]:
# Two random variables, each shaped (2, 2, 3) to match the (x, y, time) dims.
temp = 15 + 8 * np.random.randn(2, 2, 3)
precip = 10 * np.random.rand(2, 2, 3)

# 2x2 longitude/latitude values, attached below along the (x, y) dims.
lon = [[-99.83, -99.32], [-99.79, -99.23]]
lat = [[42.25, 42.21], [42.63, 42.59]]

ds = xr.Dataset(
    data_vars={
        'temperature': (['x', 'y', 'time'], temp),
        'precipitation': (['x', 'y', 'time'], precip),
    },
    coords={
        'lon': (['x', 'y'], lon),
        'lat': (['x', 'y'], lat),
        'time': pd.date_range('2014-09-06', periods=3),
        # A single timestamp, not tied to any dimension.
        'reference_time': pd.Timestamp('2014-09-05'),
    },
)
print(ds)

<xarray.Dataset>
Dimensions:         (x: 2, y: 2, time: 3)
Coordinates:
    lon             (x, y) float64 -99.83 -99.32 -99.79 -99.23
    lat             (x, y) float64 42.25 42.21 42.63 42.59
  * time            (time) datetime64[ns] 2014-09-06 2014-09-07 2014-09-08
    reference_time  datetime64[ns] 2014-09-05
Dimensions without coordinates: x, y
Data variables:
    temperature     (x, y, time) float64 11.18 14.79 6.994 ... 18.66 14.74 24.42
    precipitation   (x, y, time) float64 1.902 4.012 5.199 ... 0.1967 0.8077


In [17]:
"""Create a Dataset from a DataArray."""
data = np.random.rand(4, 3)
times = pd.date_range(start='2000-01-01', periods=4)
locs = ['IA', 'IL', 'IN']
foo = xr.DataArray(
    data,
    dims=['time', 'space'],
    coords=[times, locs],
)

# The dictionary key becomes the name of the data variable in the Dataset.
ds = xr.Dataset(dict(bar=foo))
print(ds)

<xarray.Dataset>
Dimensions:  (time: 4, space: 3)
Coordinates:
  * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 2000-01-04
  * space    (space) <U2 'IA' 'IL' 'IN'
Data variables:
    bar      (time, space) float64 0.5736 0.9471 0.1249 ... 0.21 0.5897 0.6021


In [22]:
"""Create a Dataset from a pandas object."""
data = np.random.rand(4, 3)
times = pd.date_range(start='2000-01-01', periods=4)
locs = ['IA', 'IL', 'IN']
foo = xr.DataArray(
    data,
    dims=['time', 'space'],
    coords=[times, locs],
)

# Convert the DataArray to a pandas object first, then build the Dataset
# from that pandas object.
ds = xr.Dataset(dict(bar=foo.to_pandas()))
print(ds)

<xarray.Dataset>
Dimensions:  (time: 4, space: 3)
Coordinates:
  * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 2000-01-04
  * space    (space) object 'IA' 'IL' 'IN'
Data variables:
    bar      (time, space) float64 0.8858 0.7581 0.3331 ... 0.9236 0.5028 0.9401


代码|说明  
:---|:---  
ds['temperature']|访问变量值  
ds.temperature|访问变量值
ds.data_vars|访问名称  
ds.coords|访问坐标  
ds.copy()|浅层复制  
ds.copy(deep=True)|深层复制
ds.drop_vars()|删除变量（旧写法ds.drop已不推荐使用）  
ds.drop_dims()|删除维度及依赖该维度的变量  
ds.assign(temperature2=2 * ds.temperature)|添加新变量（若变量名已存在则替换），返回新的数据集  
ds.assign_coords()|对坐标标签重新赋值  
ds.rename()|对变量重命名

## 三、数据数组与数据集的处理

### 位置索引

In [23]:
# 4x3 random array with a daily 'time' coordinate and a labelled 'space'
# coordinate, each given as a (dimension name, labels) tuple.
da = xr.DataArray(
    np.random.rand(4, 3),
    coords=[
        ('time', pd.date_range(start='2000-01-01', periods=4)),
        ('space', ['level', 'lat', 'lon']),
    ],
)
print(da)

<xarray.DataArray (time: 4, space: 3)>
array([[0.94810809, 0.25349317, 0.40785102],
       [0.25011475, 0.65441812, 0.42969867],
       [0.97764775, 0.19634062, 0.71563777],
       [0.88923702, 0.54879085, 0.09067266]])
Coordinates:
  * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 2000-01-04
  * space    (space) <U5 'level' 'lat' 'lon'


代码|说明  
:---|:---  
da[]|位置索引  
da.loc[]|标签索引  




In [29]:
# Three indexing styles shown side by side; the tuple is the cell's output.
(
    da[0, 0],                                  # positional: first time, first space entry
    da[:, [2, 1]],                             # positional: all times, space entries 2 then 1
    da.loc['2000-01-01':'2000-01-02', 'lat'],  # label-based: date slice and the 'lat' label
)

(<xarray.DataArray ()>
 array(0.94810809)
 Coordinates:
     time     datetime64[ns] 2000-01-01
     space    <U5 'level',
 <xarray.DataArray (time: 4, space: 2)>
 array([[0.40785102, 0.25349317],
        [0.42969867, 0.65441812],
        [0.71563777, 0.19634062],
        [0.09067266, 0.54879085]])
 Coordinates:
   * time     (time) datetime64[ns] 2000-01-01 2000-01-02 2000-01-03 2000-01-04
   * space    (space) <U5 'lon' 'lat',
 <xarray.DataArray (time: 2)>
 array([0.25349317, 0.65441812])
 Coordinates:
   * time     (time) datetime64[ns] 2000-01-01 2000-01-02
     space    <U5 'lat')

### 利用维度名称进行索引  
代码|描述  
:---|:---  
da[dict()]|用字典按维度名称进行位置索引  
da.loc[dict()]|用字典按维度名称进行标签索引  
da.isel()|按维度名称进行位置（整数）索引  
da.sel()|按维度名称进行标签索引  
da.where()|按条件掩盖元素（不满足条件的位置变为缺测值）  

In [30]:
# Positional indexing keyed by dimension name: the keys are 'space' and 'time'.
da[{'space': 0, 'time': slice(None, 2)}]

In [32]:
# Label-based indexing keyed by dimension name: slice 'time' by date labels.
da.loc[{'time': slice('2000-01-01', '2000-01-02')}]

### 数据运算
可以用numpy，scipy库的一些函数  


代码|描述  
:---|:---  
x.isnull()|x的缺测值返回布尔值  
x.notnull()|非缺测值返回布尔值  
x.count()|返回x中非缺测值的个数  
x.dropna(dim)|沿指定维度去掉缺测值  
x.fillna(-1)|缺测值替换为-1  
x.bfill(dim)|沿指定维度，将缺测值替换为其后最近的一个非缺测值  

### 聚合运算  
skipna=False 运算过程中禁止忽略缺测值

### 数据滑动
rolling()

In [34]:
# Values 0.0, 0.5, ..., 7.0 arranged as a 3x5 (lat, lon) grid.
arr = xr.DataArray(
    np.arange(0, 7.5, 0.5).reshape(3, 5),
    dims=('lat', 'lon'),
)
print(arr)

# Printing the rolling object shows the window spec; no aggregation is computed here.
print(arr.rolling(dim={'lon': 3}))

<xarray.DataArray (lat: 3, lon: 5)>
array([[0. , 0.5, 1. , 1.5, 2. ],
       [2.5, 3. , 3.5, 4. , 4.5],
       [5. , 5.5, 6. , 6.5, 7. ]])
Dimensions without coordinates: lat, lon
DataArrayRolling [lon->3]


### 数据的拆分与组合
groupby()

In [35]:
# One 4x3 variable 'foo' on (x, y); 'letters' is an extra coordinate along x.
ds = xr.Dataset(
    data_vars={'foo': (('x', 'y'), np.random.rand(4, 3))},
    coords={
        'x': [10, 20, 30, 40],
        'letters': ('x', list('abba')),
    },
)
# Pull the single data variable out of the Dataset.
arr = ds['foo']
print(ds)

<xarray.Dataset>
Dimensions:  (x: 4, y: 3)
Coordinates:
  * x        (x) int32 10 20 30 40
    letters  (x) <U1 'a' 'b' 'b' 'a'
Dimensions without coordinates: y
Data variables:
    foo      (x, y) float64 0.7945 0.9478 0.3205 ... 0.9924 0.03626 0.4654


### 数据的变形和重组
ds.transpose("y", "z", "x") 调整维度顺序  
ds.expand_dims("w") 扩充维度  
expanded.squeeze("w") 删除长度为1的维度  
to_array() 将数据集转换为数据数组  
to_dataset() 将数据数组转换为数据集




### 数据维度的堆栈和出栈  
xarray通过stack()和unstack()方法来实现数据维度的合并（堆栈）和拆分（出栈）

### 数据的移动和滚动
shift()和roll()方法来移动和滚动数据

### 数据合并
xarray支持多种数据合并方式，主要包括concat（拼接）、merge（合并）、combine_first（组合）以及沿多个维度组合（combine_nested / combine_by_coords）4种类型

### 时间序列数据

In [38]:
# Parse a list of date strings into a DatetimeIndex.
print(pd.to_datetime(['2000-01-01', '2000-02-02']))
# Generate 365 consecutive dates starting on 2000-01-01.
print(pd.date_range(start='2000-01-01', periods=365))

DatetimeIndex(['2000-01-01', '2000-02-02'], dtype='datetime64[ns]', freq=None)
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10',
               ...
               '2000-12-21', '2000-12-22', '2000-12-23', '2000-12-24',
               '2000-12-25', '2000-12-26', '2000-12-27', '2000-12-28',
               '2000-12-29', '2000-12-30'],
              dtype='datetime64[ns]', length=365, freq='D')
