# Pandas

### Series

In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

# A Series built from a plain list gets a default integer RangeIndex.
obj = Series([4, 7, -5, 3])
print(obj)
print(obj.array)  # the underlying values
print(obj.index)  # the row labels

print("=====>>> 带有确定索引的Series")
# Same kind of object, this time with explicit string labels.
obj2 = pd.Series([1, 2, 3, 4], index=list("abcd"))
print(obj2)
# Label-based selection: one label yields a scalar, a list of labels yields a sub-Series.
print(obj2.loc["a"])
print(obj2.loc[["a", "c", "d"]])
# Arithmetic is applied element-wise; NumPy reductions accept a Series directly.
print(obj2.mul(2))
print(np.max(obj2))

0    4
1    7
2   -5
3    3
dtype: int64
<PandasArray>
[4, 7, -5, 3]
Length: 4, dtype: int64
RangeIndex(start=0, stop=4, step=1)
=====>>> 带有确定索引的Series
a    1
b    2
c    3
d    4
dtype: int64
1
a    1
c    3
d    4
dtype: int64
a    2
b    4
c    6
d    8
dtype: int64
4


In [2]:
# Build a Series from a dict: the dict keys become the index labels.
sdata = dict(Ohlc=3500, Texas=71000, Oregon=16000)
obj3 = pd.Series(sdata)
print(obj3)
# Convert the Series back into a plain dict.
print(obj3.to_dict())

Ohlc       3500
Texas     71000
Oregon    16000
dtype: int64
{'Ohlc': 3500, 'Texas': 71000, 'Oregon': 16000}


### DataFrame

In [3]:
# A DataFrame is a two-dimensional table with both row labels and column labels.
data = {
    "state": ["chens", "chenx", "chen", "Nevada", "Nevada", "Nevada"],
    "year": [2000, 1999, 1997, 1993, 2001, 2003],
    "salary": [0, 0, 13000, 18000, 9000, 8000],
}
# Row labels are supplied directly to the constructor.
frame = pd.DataFrame(data, index=list("abcdef"))
print(frame)
print(frame.head(5))  # first 5 rows
print(frame.tail(5))  # last 5 rows
# `columns` fixes the column order; a name that is not a key of `data`
# ("other") shows up as an all-NaN column.
print(DataFrame(data, columns=["state", "salary", "year", "other"]))

    state  year  salary
a   chens  2000       0
b   chenx  1999       0
c    chen  1997   13000
d  Nevada  1993   18000
e  Nevada  2001    9000
f  Nevada  2003    8000
    state  year  salary
a   chens  2000       0
b   chenx  1999       0
c    chen  1997   13000
d  Nevada  1993   18000
e  Nevada  2001    9000
    state  year  salary
b   chenx  1999       0
c    chen  1997   13000
d  Nevada  1993   18000
e  Nevada  2001    9000
f  Nevada  2003    8000
    state  salary  year other
0   chens       0  2000   NaN
1   chenx       0  1999   NaN
2    chen   13000  1997   NaN
3  Nevada   18000  1993   NaN
4  Nevada    9000  2001   NaN
5  Nevada    8000  2003   NaN


In [4]:
# A DataFrame column comes back as a Series, via attribute access or item access.
print(frame.salary)
print(frame["year"])
# Fetch a single row.
print("=====>>> 获取一行")
print(frame.loc["a"])  # by index label
print(frame.iloc[1])  # by integer position
# Assigning a scalar to a column name sets that value on every row.
frame["other"] = 10.3
# A column can be removed with `del frame['salary']` (left disabled here).
frame.transpose()

a        0
b        0
c    13000
d    18000
e     9000
f     8000
Name: salary, dtype: int64
a    2000
b    1999
c    1997
d    1993
e    2001
f    2003
Name: year, dtype: int64
=====>>> 获取一行
state     chens
year       2000
salary        0
Name: a, dtype: object
state     chenx
year       1999
salary        0
Name: b, dtype: object


Unnamed: 0,a,b,c,d,e,f
state,chens,chenx,chen,Nevada,Nevada,Nevada
year,2000,1999,1997,1993,2001,2003
salary,0,0,13000,18000,9000,8000
other,10.3,10.3,10.3,10.3,10.3,10.3


In [5]:
# Convert one column into a NumPy array.
frame["state"].to_numpy()

array(['chens', 'chenx', 'chen', 'Nevada', 'Nevada', 'Nevada'],
      dtype=object)

## 基本功能

In [6]:
obj = pd.Series([3.5, 7.12, 2.9, 6.26, 5.38], index=list("abcde"))
print(obj)
# Reindexing rearranges the data to follow a new list of labels.
# Without `method`, a label absent from the original index would get NaN;
# method="ffill" forward-fills instead, so "m" takes the value of "e".
print(obj.reindex(list("edcbam"), method="ffill"))


a    3.50
b    7.12
c    2.90
d    6.26
e    5.38
dtype: float64
e    5.38
d    6.26
c    2.90
b    7.12
a    3.50
m    5.38
dtype: float64


In [7]:
# Five floats 0.0 .. 4.0 on a default integer index; print them with label 1 removed.
obj2 = Series(np.arange(5, dtype=float))
print(obj2.drop(index=1))

0    0.0
2    2.0
3    3.0
4    4.0
dtype: float64


## 描述性统计和汇总计算

In [12]:
# Small frame containing missing values, including one all-NaN row ("c").
df = DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.6],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=list("abcd"),
    columns=["one", "two"],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.6
c,,
d,0.75,-1.3


In [18]:
# Reductions skip NaN by default; the `skipna` argument controls this.
print(df.sum(axis=0))  # per-column totals
print(df.sum(axis="columns"))  # per-row totals; the all-NaN row "c" sums to 0
# A row's mean is NaN only when the row has no non-NaN value at all (row "c").
print(df.mean(axis=1))

one    9.25
two   -5.90
dtype: float64
a    1.40
b    2.50
c    0.00
d   -0.55
dtype: float64
a    1.400
b    1.250
c      NaN
d   -0.275
dtype: float64


In [20]:
# Label of the row that holds each column's maximum.
print(df.idxmax())
# Label of the column that holds each row's maximum. Calling idxmax on an
# all-NaN row (row "c") is deprecated in pandas 2.1+ and slated to raise
# ValueError, so all-NaN rows are dropped first and then restored as NaN by
# reindexing back onto the original row labels.
print(df.dropna(how="all").idxmax(axis=1).reindex(df.index))

one    b
two    d
dtype: object
a    one
b    one
c    NaN
d    one
dtype: object


In [21]:
print(df.describe()) # summary statistics for each column

            one       two
count  3.000000  2.000000
mean   3.083333 -2.950000
std    3.493685  2.333452
min    0.750000 -4.600000
25%    1.075000 -3.775000
50%    1.400000 -2.950000
75%    4.250000 -2.125000
max    7.100000 -1.300000
