In [2]:
import numpy as np
import pandas as pd

# Series

**Series** 是pandas中的两种基础数据结构之一，可以理解为<i>一维带标签数组</i>. 

+ 数组中数据可以为任意类型(整数，字符串，浮点数，Python objects等). 
+ 同一个Series中的数据共享同一个dtype(homogeneous)；混合存放不同类型的对象时，dtype为object.


### 创建Series

建立一个Series    
    - s = pd.Series(data, index=index)

这里data可以是
    
    - list
    - array
    - dictionary

In [2]:
# Build a Series from a plain list; pandas supplies the default integer index.
price = pd.Series(
    data=[15280, 45888, 15692, 55689, 28410, 27566],
)
price

0    15280
1    45888
2    15692
3    55689
4    28410
5    27566
dtype: int64

In [3]:
type(price)

pandas.core.series.Series

In [4]:
# Same data as before, but the Series now carries a name.
price = pd.Series(
    data=[15280, 45888, 15692, 55689, 28410, 27566],
    name="price",
)
price

0    15280
1    45888
2    15692
3    55689
4    28410
5    27566
Name: price, dtype: int64

In [5]:
pd.Series?

In [6]:
# A dict can seed a Series: keys become the index labels, values the data.
temp = dict(Mon=33, Tue=19, Wed=15, Thu=89, Fri=11, Sat=-5, Sun=9)
pd.Series(data=temp)

Fri    11
Mon    33
Sat    -5
Sun     9
Thu    89
Tue    19
Wed    15
dtype: int64

In [7]:
price.mean()

31420.833333333332

In [8]:
price.sum()

188525

In [5]:
price.head(2)

0    15280
1    45888
dtype: int64

In [9]:
price.tail(3)

3    55689
4    28410
5    27566
Name: price, dtype: int64

In [9]:
price.max?

In [10]:
print(dir(price))

['T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '_AXIS_SLICEMAP', '__abs__', '__add__', '__and__', '__array__', '__array_prepare__', '__array_priority__', '__array_wrap__', '__bool__', '__bytes__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__finalize__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__int__', '__invert__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__le__', '__len__', '__long__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__

#### data types in series

In [10]:
# All-integer input: inspect the dtype pandas infers.
price = pd.Series(
    data=[15280, 45888, 15692, 55689, 28410, 27566],
)
price.dtype

dtype('int64')

In [11]:
# A single float among the integers is enough to make the whole Series float64.
price = pd.Series(
    data=[15280, 45888, 15692, 55689, 28410, 27566.3],
)
price.dtype

dtype('float64')

In [12]:
# String data: check which dtype pandas assigns.
city = pd.Series(
    data=['wh', 'sh', 'hz', 'bj', 'gz', 'nj'],
)
city.dtype

dtype('O')

In [17]:
# Arbitrary Python objects (dict, list, tuple) are stored with the object dtype.
temp = pd.Series(
    data=[dict(), list(), (3, 4)],
)
temp.dtype

dtype('O')

In [13]:
# Date-like strings are not parsed on construction; pd.to_datetime converts them.
x = pd.Series(
    data=['2016-01-01', '2017-01-01'],
)
print(x.dtype)
pd.to_datetime(x)

object


0   2016-01-01
1   2017-01-01
dtype: datetime64[ns]

In [14]:
# Repeated labels stored as a categorical: each distinct value becomes one category.
x = pd.Series(
    data=['hw', 'apple', 'vivo', 'mi', 'hw', 'oppo', 'samsung', 'vivo'],
    dtype='category',
)
x

0         hw
1      apple
2       vivo
3         mi
4         hw
5       oppo
6    samsung
7       vivo
dtype: category
Categories (6, object): [apple, hw, mi, oppo, samsung, vivo]

#### boolean 

In [18]:
# Boolean Series, used in the following cells as a selection mask.
mask = pd.Series(
    data=[True, False, True, True, False, False],
)
mask

0     True
1    False
2     True
3     True
4    False
5    False
dtype: bool

In [16]:
price

0    15280.0
1    45888.0
2    15692.0
3    55689.0
4    28410.0
5    27566.3
dtype: float64

In [26]:
price[mask]

0    15280.0
2    15692.0
3    55689.0
dtype: float64

In [17]:
mask2=pd.Series([True,False,True,True,False,True])
mask|mask2

0     True
1    False
2     True
3     True
4    False
5     True
dtype: bool

In [28]:
mask&mask2

0     True
1    False
2     True
3     True
4    False
5    False
dtype: bool

In [29]:
~mask

0    False
1     True
2    False
3    False
4     True
5     True
dtype: bool

In [32]:
# Deliberate error demo: `&` binds tighter than `>` / `<`, so without parentheses
# the expression is evaluated as  price > (20000 & price) < 30000  and fails.
# The exception is caught and printed so that Restart & Run All is not aborted here.
try:
    price>20000&price<30000
except (TypeError, ValueError) as err:
    print(f"{type(err).__name__}: {err}")

TypeError: cannot compare a dtyped [float64] array with a scalar of type [bool]

In [19]:
price[(price>20000)&(price<30000)]

4    28410.0
5    27566.3
dtype: float64

In [20]:
# Deliberate error demo: plain Python lists do not support element-wise `&`
# (unlike boolean Series). The TypeError is caught and printed so that the
# notebook keeps running top to bottom.
temp=[True,False,True,True,False,False]
temp2=[True,False,True,True,False,False]
try:
    temp & temp2
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for &: 'list' and 'list'

#### index

In [21]:
price

0    15280.0
1    45888.0
2    15692.0
3    55689.0
4    28410.0
5    27566.3
dtype: float64

In [22]:
price[2]

15692.0

In [10]:
# Explicit string labels replace the default integer index.
price = pd.Series(
    data=[15280, 45888, 15692, 55689, 28410, 27566.3],
    index=['wh', 'sh', 'hz', 'bj', 'gz', 'nj'],
)
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
dtype: float64

In [24]:
price['sh']

45888.0

In [45]:
price.index

Index(['wh', 'sh', 'hz', 'bj', 'gz', 'nj'], dtype='object')

In [46]:
price.index.name='city'
price

city
wh    15280.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
dtype: float64

In [3]:
# Month-end timestamps between the two bounds (freq='M' means month end).
# NOTE(review): newer pandas releases reportedly spell this alias 'ME' — confirm against the installed version.
dates=pd.date_range('2016-01-01','2016-6-01',freq='M')
dates

DatetimeIndex(['2016-01-31', '2016-02-29', '2016-03-31', '2016-04-30',
               '2016-05-31'],
              dtype='datetime64[ns]', freq='M')

In [1]:
pd.date_range?

Object `pd.date_range` not found.


In [4]:
tempature=pd.Series([13,15,20,27,29],index=dates)
tempature

2016-01-31    13
2016-02-29    15
2016-03-31    20
2016-04-30    27
2016-05-31    29
Freq: M, dtype: int64

In [42]:
# Index labels do not have to be unique: label 2 appears twice here.
temp = pd.Series(
    data=[13, 15, 20, 27, 29],
    index=[0, 2, 2, 3, 4],
)
temp

0    13
2    15
2    20
3    27
4    29
dtype: int64

#### index/slicing

- `iloc`：基于位置(position)的索引
- `loc`：基于标签(label)的索引
- `.`：以属性方式按标签取值(如 `temp.M`)

In [5]:
# Fresh Series with the default 0..4 integer index for the indexing examples.
temp = pd.Series(
    data=[13, 15, 20, 27, 29],
)
temp

0    13
1    15
2    20
3    27
4    29
dtype: int64

In [49]:
temp[0] # plain [] indexing; with the default integer index, label 0 is also position 0

13

In [51]:
temp[3]

27

In [6]:
# Deliberate error demo: the index only holds 0..4, so key 7 does not exist.
# The KeyError is caught and printed so that Restart & Run All is not aborted here.
try:
    temp[7]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 7

In [53]:
temp.loc[0]  # .loc looks the value up by label

13

In [55]:
# Deliberate error demo: .loc is label-based, and there is no label -1 in the index
# (negative numbers are not treated as "count from the end" here).
# The KeyError is caught and printed so that the notebook keeps running.
try:
    temp.loc[-1]
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'the label [-1] is not in the [index]'

In [56]:
temp.iloc[-1]  # .iloc selects by position, so -1 is the last element

29

In [57]:
temp.iloc[0:3]

0    13
1    15
2    20
dtype: int64

In [7]:
# String labels with a duplicate: 'T' occurs twice.
temp = pd.Series(
    data=[13, 15, 20, 27, 29],
    index=['M', 'T', 'W', 'T', 'F'],
)
temp

M    13
T    15
W    20
T    27
F    29
dtype: int64

In [59]:
temp['M'] #both position and label work

13

In [60]:
temp.iloc[0]  # positional lookup; .iloc is used because a bare integer key on a string index relies on a deprecated fallback

13

In [61]:
temp['T']

T    15
T    27
dtype: int64

In [62]:
temp.M

13

In [30]:
temp.T  # .T is the transpose attribute (see dir(temp)), so attribute access returns the whole Series, not the entries labelled 'T'

M    13
T    15
W    20
T    27
F    29
dtype: int64

#### 修改与删除Series中的值

In [11]:
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
dtype: float64

In [33]:
price['wh']=16000
price

wh    16000.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
dtype: float64

In [34]:
price.iloc[0]=15280
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
dtype: float64

In [14]:
# Add an entry by concatenation. Series.append was removed in pandas 2.0;
# pd.concat likewise returns a new Series and leaves `price` unchanged.
price1 = pd.concat([price, pd.Series([9500],index=['cd'])])
price1

wh    15280.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
cd     9500.0
dtype: float64

In [37]:
price['cd']=9500  # assigning to a new label adds it directly (modifies price in place)
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
cd     9500.0
dtype: float64

In [36]:
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    55689.0
gz    28410.0
nj    27566.3
dtype: float64

In [38]:
# Update one entry in place. Series.set_value was removed in pandas 1.0;
# the .at accessor is the scalar label-based setter that replaces it.
price.at['bj'] = 62000
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    62000.0
gz    28410.0
nj    27566.3
cd     9500.0
dtype: float64

In [71]:
price

city
wh    15280.0
sh    45888.0
hz    15692.0
bj    62000.0
gz    28410.0
nj    27566.3
dtype: float64

In [39]:
del price['nj']  # remove the entry labelled 'nj' from price in place

In [40]:
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    62000.0
gz    28410.0
cd     9500.0
dtype: float64

In [41]:
price<50000

wh     True
sh     True
hz     True
bj    False
gz     True
cd     True
dtype: bool

In [82]:
# filter: a boolean mask inside [] keeps only the entries where the condition is True
price[price<50000]  # entries with a value below 50000

city
wh    15280.0
sh    45888.0
hz    15692.0
gz    28410.0
dtype: float64

#### summary statistics 描述性统计

In [42]:
price.min()

9500.0

In [43]:
price.max()

62000.0

In [44]:
price.median()

22051.0

In [45]:
price.describe()

count        6.000000
mean     29461.666667
std      20585.747940
min       9500.000000
25%      15383.000000
50%      22051.000000
75%      41518.500000
max      62000.000000
dtype: float64

In [88]:
price.quantile(0.25)

15692.0

In [89]:
price.describe(percentiles=[0.25,0.5])

count        5.000000
mean     33454.000000
std      20252.798128
min      15280.000000
25%      15692.000000
50%      28410.000000
max      62000.000000
dtype: float64

In [46]:
# Categorical Series of repeated labels, counted with value_counts() further down.
temp = pd.Series(
    data=['hw', 'apple', 'vivo', 'mi', 'hw', 'oppo', 'samsung', 'vivo'],
    dtype='category',
)
temp

0         hw
1      apple
2       vivo
3         mi
4         hw
5       oppo
6    samsung
7       vivo
dtype: category
Categories (6, object): [apple, hw, mi, oppo, samsung, vivo]

In [48]:
print(dir(price))

['T', '_AXIS_ALIASES', '_AXIS_IALIASES', '_AXIS_LEN', '_AXIS_NAMES', '_AXIS_NUMBERS', '_AXIS_ORDERS', '_AXIS_REVERSED', '_AXIS_SLICEMAP', '__abs__', '__add__', '__and__', '__array__', '__array_prepare__', '__array_priority__', '__array_wrap__', '__bool__', '__bytes__', '__class__', '__contains__', '__delattr__', '__delitem__', '__dict__', '__dir__', '__div__', '__doc__', '__eq__', '__finalize__', '__float__', '__floordiv__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__iadd__', '__imul__', '__init__', '__int__', '__invert__', '__ipow__', '__isub__', '__iter__', '__itruediv__', '__le__', '__len__', '__long__', '__lt__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pow__', '__radd__', '__rand__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__ror__', '__round__', '__rpow__', '__rsub__', '__rtruediv__', '__rxor__', '__

In [47]:
temp.value_counts()

vivo       2
hw         2
samsung    1
oppo       1
mi         1
apple      1
dtype: int64

#### 向量化操作与广播

In [15]:
price * 2

wh     30560.0
sh     91776.0
hz     31384.0
bj    111378.0
gz     56820.0
nj     55132.6
dtype: float64

In [16]:
price+1000 # the scalar is broadcast to every element; -, / and * work the same way

wh    16280.0
sh    46888.0
hz    16692.0
bj    56689.0
gz    29410.0
nj    28566.3
dtype: float64

In [52]:
np.log(price) # element-wise natural logarithm; the result keeps the index of price

wh     9.634300
sh    10.733959
hz     9.660906
bj    11.034890
gz    10.254496
cd     9.159047
dtype: float64

In [53]:
# Deliberate error demo: a plain list has no broadcasting, so list + int is a TypeError
# (contrast with price+1000). Caught and printed so that the notebook keeps running.
li=[2000,3000,3000] #list
try:
    li+2
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: can only concatenate list (not "int") to list

In [54]:
li*2

[2000, 3000, 3000, 2000, 3000, 3000]

In [17]:
# First operand for the label-alignment example (default labels 0..3).
s = pd.Series(
    data=[10, 20, 30, 40],
)
s

0    10
1    20
2    30
3    40
dtype: int64

In [18]:
# Second operand: its labels 2..4 only partly overlap with the labels of s.
s2 = pd.Series(
    data=[10, 20, 30],
    index=[2, 3, 4],
)
s2

2    10
3    20
4    30
dtype: int64

In [20]:
s+s2  # operands are aligned on labels; a label present in only one of them yields NaN ("not a number")

0     NaN
1     NaN
2    40.0
3    60.0
4     NaN
dtype: float64

In [21]:
s.apply(float)

0    10.0
1    20.0
2    30.0
3    40.0
dtype: float64

In [22]:
s.astype(int)

0    10
1    20
2    30
3    40
dtype: int32

#### iteration 迭代

In [23]:
for num in s:
    print(num)

10
20
30
40


In [59]:
40 in s

False

In [104]:
40 in s.values # membership test against the values

True

In [106]:
0 in s # why True? a Series behaves like a key-value mapping, so `in` tests the index: this checks for label 0

True

In [61]:
price

wh    15280.0
sh    45888.0
hz    15692.0
bj    62000.0
gz    28410.0
cd     9500.0
dtype: float64

In [60]:
'bj' in price

True

In [62]:
li=[10,20,30,40]
40 in li

True

In [110]:
# looping over dictionary keys and values
for k,v in price.items():
    print(k,v)

wh 15280.0
sh 45888.0
hz 15692.0
bj 62000.0
gz 28410.0


# 参考资料

[pandas series](https://pandas.pydata.org/docs/user_guide/dsintro.html#series)