In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

## Create Series

In [10]:
s = pd.Series(np.array([1,2,3,4,5,6]), name = 'test')
s

0    1
1    2
2    3
3    4
4    5
5    6
Name: test, dtype: int32

## Create Dataframe

In [16]:
# dataframe创建方式一
date = pd.date_range('2020-08-15', periods = 10)

date_df = pd.DataFrame(index = date, data = np.random.normal(10, 4), columns = ['A', 'B', 'C', 'D'])

date_df

Unnamed: 0,A,B,C,D
2020-08-15,4.87717,4.87717,4.87717,4.87717
2020-08-16,4.87717,4.87717,4.87717,4.87717
2020-08-17,4.87717,4.87717,4.87717,4.87717
2020-08-18,4.87717,4.87717,4.87717,4.87717
2020-08-19,4.87717,4.87717,4.87717,4.87717
2020-08-20,4.87717,4.87717,4.87717,4.87717
2020-08-21,4.87717,4.87717,4.87717,4.87717
2020-08-22,4.87717,4.87717,4.87717,4.87717
2020-08-23,4.87717,4.87717,4.87717,4.87717
2020-08-24,4.87717,4.87717,4.87717,4.87717


In [23]:
# dataframe创建方式二
df = pd.DataFrame(data={
    'A':pd.Timestamp('2020-09-01'),
    'B':pd.Categorical(['One', 'Two','Three', 'Four']),
    'C':pd.Series(list(range(4)), dtype = np.float16),
    'D':np.array([3] * 4, dtype = np.int16)
})

In [30]:
#查看dataframe详细信息
print(df.describe())

# 查看dataframe的列名
print(df.columns)

# 查看dataframe的索引
print(df.index)

              C    D
count  4.000000  4.0
mean   1.500000  3.0
std    1.291016  0.0
min    0.000000  3.0
25%    0.750000  3.0
50%    1.500000  3.0
75%    2.250000  3.0
max    3.000000  3.0
Index(['A', 'B', 'C', 'D'], dtype='object')
RangeIndex(start=0, stop=4, step=1)


In [32]:
#对索引进行排列，axis = 1表示对columns进行排序，axis表示对index进行排序，ascending默认为True表示升序，False为降序
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
0,3,0.0,One,2020-09-01
1,3,1.0,Two,2020-09-01
2,3,2.0,Three,2020-09-01
3,3,3.0,Four,2020-09-01


In [34]:
# 对某一列的数值进行排序
df.sort_values('C', ascending = False)

Unnamed: 0,A,B,C,D
3,2020-09-01,Four,3.0,3
2,2020-09-01,Three,2.0,3
1,2020-09-01,Two,1.0,3
0,2020-09-01,One,0.0,3


## 筛选数据

In [49]:
date = pd.date_range('20200801', periods = 6)
df = pd.DataFrame(np.array(range(24)).reshape(6,4), index = date, columns = ['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
2020-08-01,0,1,2,3
2020-08-02,4,5,6,7
2020-08-03,8,9,10,11
2020-08-04,12,13,14,15
2020-08-05,16,17,18,19
2020-08-06,20,21,22,23


In [60]:
# 选择某一列:两种方式
print(df.A)
print(df['A'])

2020-08-01     0
2020-08-02     4
2020-08-03     8
2020-08-04    12
2020-08-05    16
2020-08-06    20
Freq: D, Name: A, dtype: int32
2020-08-01     0
2020-08-02     4
2020-08-03     8
2020-08-04    12
2020-08-05    16
2020-08-06    20
Freq: D, Name: A, dtype: int32


In [78]:
# loc只能使用名称来选择
# 选择连续几行
print(df[1:3])
# 选择特定的某一行
print(df.loc['20200802'])

# 选择多列两种方法
print(df.loc[:'20200802', ['A', 'B']])
print(df.loc[:,'A':'B'])

# 特定行列
df.loc['20200802',['A', 'B']]

            A  B   C   D
2020-08-02  4  5   6   7
2020-08-03  8  9  10  11
A    4
B    5
C    6
D    7
Name: 2020-08-02 00:00:00, dtype: int32
            A  B
2020-08-01  0  1
2020-08-02  4  5
             A   B
2020-08-01   0   1
2020-08-02   4   5
2020-08-03   8   9
2020-08-04  12  13
2020-08-05  16  17
2020-08-06  20  21


A    4
B    5
Name: 2020-08-02 00:00:00, dtype: int32

In [73]:
# iloc只允许使用index来选择
# 使用iloc获取跨行跨列
print(df.iloc[1:3, :4])
print(df.iloc[[1,2,4], :])
print(df.iloc[2:, [1,3]])

            A  B   C   D
2020-08-02  4  5   6   7
2020-08-03  8  9  10  11
             A   B   C   D
2020-08-02   4   5   6   7
2020-08-03   8   9  10  11
2020-08-05  16  17  18  19


In [81]:
# 通过条件来选择
df[df['A']>3]

Unnamed: 0,A,B,C,D
2020-08-02,4,5,6,7
2020-08-03,8,9,10,11
2020-08-04,12,13,14,15
2020-08-05,16,17,18,19
2020-08-06,20,21,22,23


## 设置值

In [82]:
df

Unnamed: 0,A,B,C,D
2020-08-01,0,1,2,3
2020-08-02,4,5,6,7
2020-08-03,8,9,10,11
2020-08-04,12,13,14,15
2020-08-05,16,17,18,19
2020-08-06,20,21,22,23


In [88]:
# 使用iloc设置
df.iloc[2,3] = 7
# 使用loc设置
df.loc['20200801', 'A'] = 15
# 根据条件来设置
df[df.A == 8] = 7
df.B.loc[df.A>10] = 0
df

Unnamed: 0,A,B,C,D
2020-08-01,15,0,2,3
2020-08-02,4,5,6,7
2020-08-03,7,7,7,7
2020-08-04,12,0,14,15
2020-08-05,16,0,18,19
2020-08-06,20,0,22,23


## 处理丢失的值

In [89]:
df

Unnamed: 0,A,B,C,D
2020-08-01,15,0,2,3
2020-08-02,4,5,6,7
2020-08-03,7,7,7,7
2020-08-04,12,0,14,15
2020-08-05,16,0,18,19
2020-08-06,20,0,22,23


In [100]:
# 设置NAN
df.loc['20200801'] = np.NAN
df.loc['20200804', 'A'] = np.NAN

In [101]:
# 查看NAN
df.isna()

Unnamed: 0,A,B,C,D
2020-08-01,True,True,True,True
2020-08-02,False,False,False,False
2020-08-03,False,False,False,False
2020-08-04,True,False,False,False
2020-08-05,False,False,False,False
2020-08-06,False,False,False,False


In [102]:
# 默认去掉含有NAN的行和列
df.dropna()

Unnamed: 0,A,B,C,D
2020-08-02,4.0,5.0,6.0,7.0
2020-08-03,7.0,7.0,7.0,7.0
2020-08-05,16.0,0.0,18.0,19.0
2020-08-06,20.0,0.0,22.0,23.0


In [103]:
# 使用特定的值填充NAN
df.fillna(0)

Unnamed: 0,A,B,C,D
2020-08-01,0.0,0.0,0.0,0.0
2020-08-02,4.0,5.0,6.0,7.0
2020-08-03,7.0,7.0,7.0,7.0
2020-08-04,0.0,0.0,14.0,15.0
2020-08-05,16.0,0.0,18.0,19.0
2020-08-06,20.0,0.0,22.0,23.0


In [109]:
# 根据条件去掉含有NAN的行或列，axis = 0是对行进行操作， axis = 1是对列进行操作， how = any是表示只要含有NAN均去掉行或列
# how = all则为整行或列为NAN才去掉行或者列

df.dropna(axis = 1, how = 'all')
df.dropna(axis = 0, how = 'all')

Unnamed: 0,A,B,C,D
2020-08-02,4.0,5.0,6.0,7.0
2020-08-03,7.0,7.0,7.0,7.0
2020-08-04,,0.0,14.0,15.0
2020-08-05,16.0,0.0,18.0,19.0
2020-08-06,20.0,0.0,22.0,23.0
