In [10]:
import pandas as pd

# Series

In [11]:
# Series is one of the data structures pandas provides.
# A Series pairs an index with a column of values.

# No index is passed here, so the default positional index 0-9 is used.
# An explicit index can be supplied instead, e.g.
#   pd.Series(1, index=list(range(3, 7)), dtype='float32')
ser_obj = pd.Series(data=range(10, 20))

# The printed representation ends with the dtype of the values.
print(ser_obj)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64


In [12]:
# .values exposes the data as a NumPy ndarray; .index exposes the labels
# (a RangeIndex here, because the Series was built without an explicit index).
print(ser_obj.values, type(ser_obj.values), sep='\n')
print(ser_obj.index, type(ser_obj.index), sep='\n')

# Last expression of the cell: the dtype of the stored values.
ser_obj.dtype


[10 11 12 13 14 15 16 17 18 19]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=10, step=1)
<class 'pandas.RangeIndex'>


dtype('int64')

In [13]:
# Fetch a single value by its index label
print(ser_obj[0])
ser_obj[9]  # last expression of the cell, so it is echoed as the cell output
# Accessing an index label that does not exist raises KeyError

10


np.int64(19)

In [14]:
# Arithmetic and comparisons on a Series are applied element-wise:
#   ser_obj * 2   -> every value doubled
#   ser_obj > 15  -> boolean Series marking the values greater than 15
print(ser_obj * 2, ser_obj > 15, sep='\n')

0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool


In [15]:
# Build a Series from a dict: the keys become the index, the values the data.
year_data = {2001: 17.8, 2005: 20.1, 2003: 16.5}
ser_obj2 = pd.Series(data=year_data)

# Print the Series, its index, and one value looked up by label,
# with a dashed rule between consecutive pieces of output.
print(ser_obj2, ser_obj2.index, ser_obj2[2001], sep='\n' + '-'*50 + '\n')

# Last expression of the cell: the underlying ndarray of values.
ser_obj2.values

2001    17.8
2005    20.1
2003    16.5
dtype: float64
--------------------------------------------------
Index([2001, 2005, 2003], dtype='int64')
--------------------------------------------------
17.8


array([17.8, 20.1, 16.5])

# DataFrame

In [16]:
import numpy as np

# Build a DataFrame from a 3x4 ndarray. A DataFrame has both a row index and
# a column index (both can be changed); with none given, the rows default to 0-2.
t = pd.DataFrame(data=np.arange(12).reshape(3, 4))

# head() shows the first 5 rows by default, which is the whole frame here.
print(t, t.head(), sep='\n')

# Selecting a single row by label yields a Series.
t.loc[0]

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


0    0
1    1
2    2
3    3
Name: 0, dtype: int64

In [17]:
# A list of dicts becomes a DataFrame: one row per dict, one column per key.
d2 = [
    {"name": "xiaohong", "age": 32, "tel": 10010},
    {"name": "xiaogang", "tel": 10000},
    {"name": "xiaowang", "age": 22},
]
df = pd.DataFrame(data=d2)

# Keys missing from a dict are filled with NaN; .values is a NumPy ndarray.
print(df, type(df.values), sep='\n')

       name   age      tel
0  xiaohong  32.0  10010.0
1  xiaogang   NaN  10000.0
2  xiaowang  22.0      NaN
<class 'numpy.ndarray'>


In [18]:
# Different columns of a DataFrame may have different dtypes,
# but every value within one column shares a single dtype.
import pandas as pd
import numpy as np

# Scalars ('A', 'B', 'F') are broadcast to the length of the longest column.
# A value's length must be either 1 or that longest length, otherwise
# constructing the DataFrame raises an error.
dict_data = {
    'A': 1,
    'B': pd.Timestamp('20190926'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.arange(1, 5, dtype='int32'),
    'E': ["Python", "Java", "C++", "C"],
    'F': 'yue',
}
df_obj2 = pd.DataFrame(data=dict_data)
print(df_obj2)


   A          B    C  D       E    F
0  1 2019-09-26  1.0  1  Python  yue
1  1 2019-09-26  1.0  2    Java  yue
2  1 2019-09-26  1.0  3     C++  yue
3  1 2019-09-26  1.0  4       C  yue


In [19]:
# Row index first, then column index.
# Note: a single index entry cannot be reassigned on its own,
# e.g. `df_obj2.index[0] = 2` is not allowed.
print(df_obj2.index, df_obj2.columns, sep='\n')

# dtype of every column -- useful before model training to spot columns
# whose type needs converting.
df_obj2.dtypes

Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='str')


A             int64
B    datetime64[us]
C           float32
D             int32
E               str
F               str
dtype: object

In [20]:
# Build a DataFrame with an explicit row index (dates) and column labels.
dates = pd.date_range('20130101', periods=6)  # freq defaults to 'D' (daily)

# Draw the 6x4 standard-normal sample from a seeded generator so that
# Restart Kernel -> Run All reproduces exactly the same numbers every time
# (an unseeded np.random.randn call gives different values on each run).
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=list('ABCD'),
)
print(df)
print('-'*50)
print(df.index)


                   A         B         C         D
2013-01-01 -1.145354 -0.750633 -0.455142 -0.168271
2013-01-02  1.664323 -1.230712 -2.220370 -0.469500
2013-01-03 -0.390525 -0.042975  0.042647  0.200869
2013-01-04  3.817346 -0.268133  2.105803 -0.738199
2013-01-05 -2.959976 -0.209535 -0.513207 -0.208429
2013-01-06 -1.404860  0.439047  0.685439 -0.008349
--------------------------------------------------
DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[us]', freq='D')


In [21]:
# Show, separated by dashed rules: the whole frame, its type, column 'B',
# and the type of that column. Indexing a DataFrame with a column label
# returns that column as a Series.
print(
    df_obj2,
    type(df_obj2),
    df_obj2['B'],
    type(df_obj2['B']),
    sep='\n' + '-'*50 + '\n',
)

   A          B    C  D       E    F
0  1 2019-09-26  1.0  1  Python  yue
1  1 2019-09-26  1.0  2    Java  yue
2  1 2019-09-26  1.0  3     C++  yue
3  1 2019-09-26  1.0  4       C  yue
--------------------------------------------------
<class 'pandas.DataFrame'>
--------------------------------------------------
0   2019-09-26
1   2019-09-26
2   2019-09-26
3   2019-09-26
Name: B, dtype: datetime64[us]
--------------------------------------------------
<class 'pandas.Series'>


In [22]:
# Add a column: assigning to a new column label creates it (the label is free to choose).
# Here G is column D plus 4, computed element-wise. This modifies df_obj2 in place.
df_obj2['G'] = df_obj2['D'] + 4
print(df_obj2.head())

   A          B    C  D       E    F  G
0  1 2019-09-26  1.0  1  Python  yue  5
1  1 2019-09-26  1.0  2    Java  yue  6
2  1 2019-09-26  1.0  3     C++  yue  7
3  1 2019-09-26  1.0  4       C  yue  8


In [23]:
# Remove column G in place. `del` is a statement rather than a function.
# Raises KeyError if the column is not present (e.g. if this cell is run twice).
del df_obj2['G']
print(df_obj2.head())

   A          B    C  D       E    F
0  1 2019-09-26  1.0  1  Python  yue
1  1 2019-09-26  1.0  2    Java  yue
2  1 2019-09-26  1.0  3     C++  yue
3  1 2019-09-26  1.0  4       C  yue
