## Pandas 的 Series数据类型

In [35]:
import pandas as pd
# Load the spreadsheet into a DataFrame (the path is relative, so the file must sit
# in the notebook's working directory).
data = pd.read_excel("data_excel.xlsx")
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
id       10 non-null int64
age      10 non-null int64
place    10 non-null int64
dtypes: int64(3)
memory usage: 368.0 bytes


In [17]:
type(data)

pandas.core.frame.DataFrame

In [18]:
type(data["id"])

pandas.core.series.Series

In [19]:
data["id"]

0     1
1     2
2     3
3     4
4     5
5     6
6     7
7     8
8     9
9    10
Name: id, dtype: int64

In [20]:
data["id"][0]

1

In [None]:
# A plain Python list for comparison: [0] fetches the first element by position.
# my_list is reused further down to build a Series.
my_list=[1,2,3,4]
my_list[0]

## 自己声明一个Series

In [8]:
obj = pd.Series(my_list)
obj

0    1
1    2
2    3
3    4
dtype: int64

In [9]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [10]:
obj.values

array([1, 2, 3, 4])

In [11]:
# Build a Series of five letters labelled 1..5 instead of the default 0-based index.
obj = pd.Series(
    list("abcde"),
    index=list(range(1, 6)),
)
obj

1    a
2    b
3    c
4    d
5    e
dtype: object

In [13]:
# A dict maps straight onto a Series: the keys become the index labels and the
# values become the data. One value is a list and the others are scalars, so the
# resulting Series holds generic Python objects (dtype: object).
data = dict(a=[100, 1, 2, 3], b=200, c=300)
obj = pd.Series(data)
obj


a    [100, 1, 2, 3]
b               200
c               300
dtype: object

In [14]:
obj["a"]

[100, 1, 2, 3]

## 数据处理
- 处理数据时，我们通常按列进行：每次取出一列（即一个 Series）来处理

### 缺失值处理

In [27]:
# A None value in the input becomes a missing entry (NaN) in the Series,
# which is what the isnull / notnull checks below detect.
data = dict(a=None, b=200, c=300)
obj = pd.Series(data)
obj


a      NaN
b    200.0
c    300.0
dtype: float64

In [28]:
pd.isnull(obj)

a     True
b    False
c    False
dtype: bool

In [29]:
obj.isnull()

a     True
b    False
c    False
dtype: bool

In [30]:
obj.notnull()

a    False
b     True
c     True
dtype: bool

### 算术运算和数据对齐

In [33]:
# Two Series whose indexes only partly overlap: d2 carries an extra label 'e'
# that d1 does not have, which is what the alignment demo below relies on.
d1 = pd.Series(
    [1.3, 1.5, 2.6, -3.2],
    index=list("abcd"),
)
d2 = pd.Series(
    [-1.5, -1.2, -2.0, 3.2, 4.5],
    index=list("abcde"),
)

In [34]:
d1+d2

a   -0.2
b    0.3
c    0.6
d    0.0
e    NaN
dtype: float64

In [36]:
# Side note: how pandas can read large files.
pd.read_csv("executive.csv", nrows=5)
# Difference from head(): head() shows the first rows of a result that has already
# been loaded, whereas the nrows parameter makes read_csv read only the first few rows.

Unnamed: 0,name:ID,sex,age,:LABEL
0,谢永林,男,50,people
1,胡跃飞,男,56,people
2,刘勇,男,49,people
3,朱季成,男,37,people
4,张丕杰,男,57,people


In [44]:
# With chunksize set, read_csv returns a reader object rather than a DataFrame;
# iterating over it yields the file piece by piece, up to 100 rows at a time.
data = pd.read_csv("executive.csv", chunksize=100)
data

<pandas.io.parsers.TextFileReader at 0x11a05e320>

In [38]:
# Each iteration yields one chunk as a DataFrame. Printing every chunk writes the
# whole file to the cell output, so expect a very long output for big files.
# NOTE(review): the reader is presumably single-pass — re-run the read_csv cell
# before iterating a second time; confirm.
for d in data:
    print(d)

   name:ID sex  age  :LABEL
0      谢永林   男   50  people
1      胡跃飞   男   56  people
2       刘勇   男   49  people
3      朱季成   男   37  people
4      张丕杰   男   57  people
..     ...  ..  ...     ...
95      艾萍   女   39  people
96      张旸   男   39  people
97     于洪涛   男   51  people
98      梁旭   男   55  people
99     梁健锋   男   53  people

[100 rows x 4 columns]
    name:ID sex  age  :LABEL
100      林萌   男   52  people
101     雷菊芳   女   65  people
102     邓淑芬   女   54  people
103     陈劲松   男   54  people
104     黄伟中   男   50  people
..      ...  ..  ...     ...
195     钱震斌   男   59  people
196     杨振华   男   54  people
197      周中   男   51  people
198      张曦   男   44  people
199     符冠华   男   55  people

[100 rows x 4 columns]
    name:ID sex  age  :LABEL
200     黄国宏   男   49  people
201      高波   男   44  people
202     罗叶兰   女   41  people
203      张兵   男   47  people
204     王军华   男   57  people
..      ...  ..  ...     ...
295     梁永岑   男   50  people
296     王万强   男   41  people
297    

     name:ID sex  age  :LABEL
2600      宋勇   男   55  people
2601     吴学民   男   55  people
2602     邹支农   男   50  people
2603     林建伟   男   52  people
2604     徐德勇   男   41  people
...      ...  ..  ...     ...
2695     万国江   男   54  people
2696     周红卫   男   51  people
2697     罗建文   男   70  people
2698     杨震华   男   58  people
2699      樊志   男   56  people

[100 rows x 4 columns]
        name:ID sex  age  :LABEL
2700        邵秀英   女   46  people
2701        赵云文   男   63  people
2702  Hollis Li   男   55  people
2703        左海波   男   44  people
2704         姚宁   男   50  people
...         ...  ..  ...     ...
2795         胡霞   女   54  people
2796        严学文   男   38  people
2797        张昌楠   男   46  people
2798        孙伟龙   男   50  people
2799         鲍蕾   女   38  people

[100 rows x 4 columns]
     name:ID sex  age  :LABEL
2800      顾群   男   48  people
2801     胡东群   男   52  people
2802     章文藻   男   61  people
2803     朱汉平   男   55  people
2804      马焰   男   51  people
...      ...  ..

     name:ID sex  age  :LABEL
4800     周广林   男   56  people
4801     李邦良   男   72  people
4802     李春光   男   50  people
4803     陈锦石   男   56  people
4804     周武平   男   52  people
...      ...  ..  ...     ...
4895     揭小健   男   53  people
4896     王义栋   男   50  people
4897     丁建国   男   43  people
4898     吴岩松   男   47  people
4899     代云辉   女   54  people

[100 rows x 4 columns]
     name:ID sex  age  :LABEL
4900     杨华锋   男   44  people
4901     韩广荣   男   56  people
4902     刘玉新   男   50  people
4903      杨林   男   46  people
4904      李镇   男   48  people
...      ...  ..  ...     ...
4995     莫翊斌   男   50  people
4996     王永志   男   52  people
4997     周世荣   男   49  people
4998     吴智勇   男   41  people
4999     彭东升   男   56  people

[100 rows x 4 columns]
     name:ID sex  age  :LABEL
5000     蔡卫东   男   49  people
5001      贾浚   男   48  people
5002     许仕清   男   54  people
5003      张湧   男   51  people
5004     袁汉源   男   56  people
...      ...  ..  ...     ...
5095     林阿头   男   54 

### NumPy
- NumPy 是 Python 的数值计算库，其核心是多维数组对象 ndarray；它可以由列表（list）创建，例如 [1,2,3,4,5]

**如果你使用的是独立安装的 Python，则需要先安装 NumPy：**
- 命令行安装：`pip install numpy`
- Jupyter 内安装：`%pip install numpy`（也可以用 `!pip install numpy`，但 `%pip` 能确保安装到当前内核所用的环境）

**如果你使用的是 Anaconda，则无需另行安装即可使用。**

In [48]:
import numpy as np
# Repeat the four-element pattern ten times end to end, giving a 1-D array of
# 40 elements. (The pattern is repeated, not multiplied element by element.)
data = [1, 2, 3, 4]
n = np.tile(data, 10)
n

array([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
       3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4])

In [47]:
data = [1,2,3,4]
data

[1, 2, 3, 4]

In [49]:
# 观察n的形状
n.shape

(40,)

In [51]:
arr = [[1,2,3,4],[5,6,7,8]]
arr

[[1, 2, 3, 4], [5, 6, 7, 8]]

In [52]:
arr2 = np.array(arr)
arr2

array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [53]:
arr2.shape

(2, 4)

In [54]:
arr2.ndim

2

In [56]:
# Two 2x2 blocks nested inside one outer list: three levels of nesting
# produce a 3-dimensional array of shape (2, 2, 2).
arr = [
    [[1, 2], [3, 4]],
    [[5, 6], [7, 8]],
]
arr3 = np.array(arr)
arr3

array([[[1, 2],
        [3, 4]],

       [[5, 6],
        [7, 8]]])

In [57]:
arr3.ndim

3

In [58]:
arr3.shape

(2, 2, 2)

### NumPy对数据类型进行合理推断



In [61]:
arr = [["a","2",3,4],[5,6,7,8]]
arr2 = np.array(arr)
arr2

array([['a', '2', '3', '4'],
       ['5', '6', '7', '8']], dtype='<U1')

In [62]:
arr = [[1.2,3,4,5],[5,6,7,8]]
arr2 = np.array(arr)
arr2

array([[1.2, 3. , 4. , 5. ],
       [5. , 6. , 7. , 8. ]])

### numpy创建指定长度的数组

In [63]:
np.zeros(10)

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [64]:
np.ones((2,3))

array([[1., 1., 1.],
       [1., 1., 1.]])

In [66]:
np.empty((2,3,4))

array([[[0.00000000e+000, 0.00000000e+000, 2.41907520e-312,
         2.14321575e-312],
        [2.46151512e-312, 2.31297541e-312, 2.35541533e-312,
         2.05833592e-312],
        [2.22809558e-312, 2.56761491e-312, 2.48273508e-312,
         2.05833592e-312]],

       [[2.05833592e-312, 2.29175545e-312, 2.07955588e-312,
         2.14321575e-312],
        [0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
         0.00000000e+000],
        [2.47032823e-323, 2.14321575e-312, 0.00000000e+000,
         0.00000000e+000]]])

In [67]:
### range(10)
### arange

In [69]:
np.arange(1,10)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [70]:
# Type conversion: start from an array of floats (positive and negative values).
arr = np.array([1.2,1.4,1.8,-4.5,-7.5])
arr

array([ 1.2,  1.4,  1.8, -4.5, -7.5])

In [72]:
# Converting float -> int64 drops the fractional part instead of rounding:
# 1.8 becomes 1 and -4.5 becomes -4.
arr.astype(np.int64)


array([ 1,  1,  1, -4, -7])

## 矢量化
### 在不用编写循环的情况下就可以对整个数组进行批量运算，这就叫矢量化

In [73]:
# Counter-example with plain Python lists: + concatenates the two lists,
# it does not add them element by element.
arr1 = [1,2,3,4]
arr2 = [5,6,7,8]
arr1 + arr2

[1, 2, 3, 4, 5, 6, 7, 8]

In [81]:
arr1 = np.array([1,2,3,4])
arr2 = np.array([5,6,7,8])
arr1 + arr2

array([ 6,  8, 10, 12])

In [75]:
arr1 - arr2

array([-4, -4, -4, -4])

In [76]:
arr1 * arr2

array([ 5, 12, 21, 32])

In [77]:
arr1 * 5

array([ 5, 10, 15, 20])

In [82]:
arr1 / arr2

array([0.2       , 0.33333333, 0.42857143, 0.5       ])

## 切片操作

In [83]:
arr = np.arange(10)
arr

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [84]:
arr[4]

4

In [85]:
arr[3:]

array([3, 4, 5, 6, 7, 8, 9])

In [86]:
# Assigning a scalar to a slice writes it to every selected element,
# modifying arr in place (positions 1 and 2 become 11).
arr[1:3] = 11

In [87]:
arr

array([ 0, 11, 11,  3,  4,  5,  6,  7,  8,  9])

In [89]:
arr1 = np.array([[1,2,3,4],[5,6,7,8]])
arr1


array([[1, 2, 3, 4],
       [5, 6, 7, 8]])

In [90]:
# Row 1, column 1 in a single indexing step (no intermediate row array).
arr1[1, 1]

6

In [91]:
names = np.array(["HanMeiMei","LiLei","Tony","Jack"])
names == "HanMeiMei"

array([ True, False, False, False])

In [93]:
(names == "HanMeiMei")  &  (names == "Tony")

array([False, False, False, False])

In [None]:
#  &  is the element-wise counterpart of Python's `and` for boolean arrays
#  |  is the element-wise counterpart of Python's `or` for boolean arrays
#  Wrap each comparison in parentheses, because & and | bind more tightly than ==.

In [94]:
(names == "HanMeiMei")  |  (names == "Tony")

array([ True, False,  True, False])

### 花式索引  Fancy Indexing 
- 这是NumPy中的一个术语 ， 指的是利用整数数组进行索引

In [100]:
# Build an 8x4 float array in which every entry of row i equals i.
# np.empty only allocates the buffer (its initial contents are arbitrary), so every
# element must be overwritten. Assigning an (8, 1) column of row numbers lets NumPy
# stretch it across the 4 columns, replacing the explicit Python loop over rows.
arr = np.empty((8, 4))
arr[:] = np.arange(8).reshape(8, 1)
arr

array([[0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [2., 2., 2., 2.],
       [3., 3., 3., 3.],
       [4., 4., 4., 4.],
       [5., 5., 5., 5.],
       [6., 6., 6., 6.],
       [7., 7., 7., 7.]])

In [101]:
arr[[4,3,0,6]]

array([[4., 4., 4., 4.],
       [3., 3., 3., 3.],
       [0., 0., 0., 0.],
       [6., 6., 6., 6.]])

In [103]:
arr = np.arange(32).reshape((8,4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [104]:
arr[[1,5,7,2]]

array([[ 4,  5,  6,  7],
       [20, 21, 22, 23],
       [28, 29, 30, 31],
       [ 8,  9, 10, 11]])

In [105]:
# Two index lists are paired up position by position, selecting the elements at
# (1,0), (5,3), (7,1) and (2,2). The result is 1-D, not a sub-matrix.
arr[[1,5,7,2],[0,3,1,2]]

array([ 4, 23, 29, 10])

In [106]:
arr = np.arange(15).reshape((3,5))
arr

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [107]:
arr.T

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

In [108]:
arr.transpose()

array([[ 0,  5, 10],
       [ 1,  6, 11],
       [ 2,  7, 12],
       [ 3,  8, 13],
       [ 4,  9, 14]])

## NumPy降维操作

In [109]:
arr = np.array([[1,10,100],[2,20,200],[3,30,300]])
arr

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300]])

In [141]:
data = arr.ravel()

In [124]:
data

array([  1,  10, 100,   2,  20, 200,   3,  30, 300])

In [117]:
arr

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300]])

In [118]:
arr.reshape(-1)

array([  1,  10, 100,   2,  20, 200,   3,  30, 300])

In [119]:
arr

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300]])

In [120]:
arr.flatten()

array([  1,  10, 100,   2,  20, 200,   3,  30, 300])

In [121]:
arr

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300]])

In [127]:
# Flattening with a different element order: order="F" walks the array
# column by column instead of row by row.
arr.ravel(order="F")

array([  1,   2,   3,  10,  20,  30, 100, 200, 300])

In [128]:
arr.reshape(-1,order="F")

array([  1,   2,   3,  10,  20,  30, 100, 200, 300])

In [129]:
arr.flatten(order="F")

array([  1,   2,   3,  10,  20,  30, 100, 200, 300])

In [131]:
# flatten() hands back a copy, so this write lands on a temporary array and
# arr itself stays unchanged (see the next output).
arr.flatten()[0] = 2000

In [132]:
arr

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300]])

In [133]:
# Here ravel() gives a view onto arr's data, so the write goes through:
# arr[0, 1] becomes 1000 (see the next output).
arr.ravel()[1]=1000

In [134]:
arr

array([[   1, 1000,  100],
       [   2,   20,  200],
       [   3,   30,  300]])

In [135]:
# reshape(-1) behaves like ravel() here: the write reaches arr,
# and arr[0, 2] becomes 3000 (see the next output).
arr.reshape(-1)[2]=3000

In [136]:
arr

array([[   1, 1000, 3000],
       [   2,   20,  200],
       [   3,   30,  300]])

In [138]:
arr.ravel()

array([   1, 1000, 3000,    2,   20,  200,    3,   30,  300])

In [143]:
arr.ravel()[6]

3

In [140]:
arr

array([[   1, 1000, 3000],
       [   2,   20,  200],
       [   3,   30,  300]])

## NumPy堆叠的操作

In [153]:
arr1 = np.array([[1,10,100],[2,20,200],[3,30,300]])
arr2 = np.array([1,2,3])
arr1

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300]])

In [150]:
arr2

array([1, 2, 3])

In [147]:
# Vertical stacking: arr2 is appended to arr1 as a new bottom row.
np.vstack([arr1,arr2])

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300],
       [  1,   2,   3]])

In [151]:
# row_stack produces the same result as vstack for these inputs.
# NOTE(review): newer NumPy releases reportedly deprecate np.row_stack in favour of
# np.vstack — confirm against the installed NumPy version before relying on it.
np.row_stack([arr1,arr2])

array([[  1,  10, 100],
       [  2,  20, 200],
       [  3,  30, 300],
       [  1,   2,   3]])

In [156]:
# arr3 is a 3x1 column, one value per row of arr1.
arr3 = np.array([[5],[6],[7]])
# Horizontal stacking: arr3 is appended to arr1 as a new right-hand column.
np.hstack([arr1,arr3])

array([[  1,  10, 100,   5],
       [  2,  20, 200,   6],
       [  3,  30, 300,   7]])

In [157]:
np.column_stack([arr1,arr3])

array([[  1,  10, 100,   5],
       [  2,  20, 200,   6],
       [  3,  30, 300,   7]])

### 广播运算

In [158]:
arr1 = np.arange(12).reshape(3,4)
arr2 = np.arange(101,113).reshape(3,4)
arr1

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [159]:
arr2

array([[101, 102, 103, 104],
       [105, 106, 107, 108],
       [109, 110, 111, 112]])

In [160]:
arr1 + arr2

array([[101, 103, 105, 107],
       [109, 111, 113, 115],
       [117, 119, 121, 123]])

In [None]:
# Adding two arrays of identical shape element by element is NOT broadcasting.

In [161]:
#  从后边开始数，维度是一致的，都是4 * 3
arr1 = np.arange(60).reshape(5,4,3)
arr2 = np.arange(12).reshape(4,3)
arr1

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]],

       [[12, 13, 14],
        [15, 16, 17],
        [18, 19, 20],
        [21, 22, 23]],

       [[24, 25, 26],
        [27, 28, 29],
        [30, 31, 32],
        [33, 34, 35]],

       [[36, 37, 38],
        [39, 40, 41],
        [42, 43, 44],
        [45, 46, 47]],

       [[48, 49, 50],
        [51, 52, 53],
        [54, 55, 56],
        [57, 58, 59]]])

In [162]:
arr2

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [163]:
arr1 + arr2

array([[[ 0,  2,  4],
        [ 6,  8, 10],
        [12, 14, 16],
        [18, 20, 22]],

       [[12, 14, 16],
        [18, 20, 22],
        [24, 26, 28],
        [30, 32, 34]],

       [[24, 26, 28],
        [30, 32, 34],
        [36, 38, 40],
        [42, 44, 46]],

       [[36, 38, 40],
        [42, 44, 46],
        [48, 50, 52],
        [54, 56, 58]],

       [[48, 50, 52],
        [54, 56, 58],
        [60, 62, 64],
        [66, 68, 70]]])

In [164]:
arr1 = np.arange(60).reshape(5,4,3)
arr2 = np.arange(4).reshape(4,1)
arr1

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]],

       [[12, 13, 14],
        [15, 16, 17],
        [18, 19, 20],
        [21, 22, 23]],

       [[24, 25, 26],
        [27, 28, 29],
        [30, 31, 32],
        [33, 34, 35]],

       [[36, 37, 38],
        [39, 40, 41],
        [42, 43, 44],
        [45, 46, 47]],

       [[48, 49, 50],
        [51, 52, 53],
        [54, 55, 56],
        [57, 58, 59]]])

In [165]:
arr2

array([[0],
       [1],
       [2],
       [3]])

In [166]:
arr1 + arr2

array([[[ 0,  1,  2],
        [ 4,  5,  6],
        [ 8,  9, 10],
        [12, 13, 14]],

       [[12, 13, 14],
        [16, 17, 18],
        [20, 21, 22],
        [24, 25, 26]],

       [[24, 25, 26],
        [28, 29, 30],
        [32, 33, 34],
        [36, 37, 38]],

       [[36, 37, 38],
        [40, 41, 42],
        [44, 45, 46],
        [48, 49, 50]],

       [[48, 49, 50],
        [52, 53, 54],
        [56, 57, 58],
        [60, 61, 62]]])

In [167]:
arr1 = np.arange(12).reshape(4,3)
arr2 = np.array([1,2,3])
arr1

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [168]:
arr2

array([1, 2, 3])

In [169]:
arr1 + arr2

array([[ 1,  3,  5],
       [ 4,  6,  8],
       [ 7,  9, 11],
       [10, 12, 14]])