In [1]:
import pandas as pd

# Series

In [2]:
# Build a Series from the integers 10..19; no index is passed, so pandas
# labels the rows with its default integer index.
s = pd.Series(data=range(10, 20))
print(s)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64


In [3]:
# Show, one per line: the raw values, the type of the container holding them,
# and the index object.
print(s.values, type(s.values), s.index, sep='\n')

[10 11 12 13 14 15 16 17 18 19]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=10, step=1)


In [4]:
# Three views separated by divider lines: the element labelled 0, the Series
# with every element doubled, and the element-wise result of "> 15".
print(s[0], '-' * 50, s * 2, '-' * 50, s > 15, sep='\n')

10
--------------------------------------------------
0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int64
--------------------------------------------------
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool


In [5]:
# Custom index: label the ten values with the letters a..j.
s = pd.Series(data=range(10, 20), index=list('abcdefghij'))
# Print the whole Series, a divider, then the element stored under label 'a'.
print(s, '-' * 50, s['a'], sep='\n')

a    10
b    11
c    12
d    13
e    14
f    15
g    16
h    17
i    18
j    19
dtype: int64
--------------------------------------------------
10


# DataFrame

In [6]:
import numpy as np

# Reshape the integers 0..11 into 3 rows by 4 columns and wrap the array in a
# DataFrame; no labels are given, so rows and columns get default integers.
t = pd.DataFrame(data=np.arange(12).reshape((3, 4)))
print(t)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [7]:
# Draw a 5x4 array of random floats, then wrap that array in a DataFrame.
a = np.random.rand(5, 4)
b = pd.DataFrame(data=a)
# Raw array first, a divider, then the frame; head() shows the first 5 rows
# by default.
print(a, '-' * 50, b.head(), sep='\n')

[[0.61937049 0.97630887 0.40615322 0.12410012]
 [0.56829062 0.9408697  0.34300544 0.41432685]
 [0.81240865 0.10972573 0.8962212  0.88595192]
 [0.40359854 0.96202629 0.47658494 0.22798008]
 [0.95105563 0.71021452 0.37959325 0.32207937]]
--------------------------------------------------
          0         1         2         3
0  0.619370  0.976309  0.406153  0.124100
1  0.568291  0.940870  0.343005  0.414327
2  0.812409  0.109726  0.896221  0.885952
3  0.403599  0.962026  0.476585  0.227980
4  0.951056  0.710215  0.379593  0.322079


In [8]:
# Column-oriented input: each key becomes a column, and the None in 'age'
# marks a missing entry.
d = dict(name=['Alice', 'Bob', 'Charlie'], age=[25, None, 35])
df = pd.DataFrame(data=d)
print(
    df,  # the missing value is displayed as NaN
    type(df.values),
    f'行索引：{df.index}',
    f'列索引：{df.columns}',
    sep='\n',
)

      name   age
0    Alice  25.0
1      Bob   NaN
2  Charlie  35.0
<class 'numpy.ndarray'>
行索引：RangeIndex(start=0, stop=3, step=1)
列索引：Index(['name', 'age'], dtype='object')


In [9]:
# Working with dates: build a frame of random values whose row index is a
# run of six dates starting at 2025-01-01.
dates = pd.date_range(start='20250101', periods=6)
df = pd.DataFrame(
    data=np.random.rand(6, 4),
    index=dates,
    columns=list('ABCD'),
)
# Print the frame, a divider, then the index on its own.
print(df, '-' * 50, df.index, sep='\n')

                   A         B         C         D
2025-01-01  0.696562  0.858803  0.833374  0.463414
2025-01-02  0.516410  0.371232  0.731640  0.300296
2025-01-03  0.120726  0.798443  0.961280  0.401154
2025-01-04  0.200468  0.313010  0.671822  0.961301
2025-01-05  0.392244  0.291116  0.954212  0.329472
2025-01-06  0.485116  0.600690  0.680065  0.443880
--------------------------------------------------
DatetimeIndex(['2025-01-01', '2025-01-02', '2025-01-03', '2025-01-04',
               '2025-01-05', '2025-01-06'],
              dtype='datetime64[ns]', freq='D')


# Series的索引操作

In [10]:
# Ways to select from a Series that carries string labels.
s = pd.Series(range(5), index=list('abcde'))
print(s)
print('-' * 50)
print(s['b'])  # single element by label
print(s.iloc[1])  # single element by position
print('-' * 50)
# Positional slice, written with .iloc so it cannot be mistaken for a label
# slice; the end position is excluded (half-open interval).
print(s.iloc[1:3])
print('-' * 50)
# Label slice, written with .loc; both end labels are included (closed
# interval).
print(s.loc['b':'d'])
print('-' * 50)
print(s[s > 2])  # boolean mask: keep only the elements greater than 2

a    0
b    1
c    2
d    3
e    4
dtype: int64
--------------------------------------------------
1
1
--------------------------------------------------
b    1
c    2
dtype: int64
--------------------------------------------------
b    1
c    2
d    3
dtype: int64
--------------------------------------------------
d    3
e    4
dtype: int64


# DataFrame的索引

In [11]:
import numpy as np

# Selecting one column two ways: a bare label yields a Series, while a
# one-element list of labels yields a DataFrame.
t = pd.DataFrame(np.random.rand(3, 5), columns=list('abcde'))
print(t)
print('-' * 50)
# The f-strings use double outer quotes so the inner 'a' does not reuse the
# string's own quote character, which is a SyntaxError before Python 3.12.
print(f"series类型：\n{t['a']}")  # prints a Series
print('-' * 50)
print(f"dataframe类型：\n{t[['a']]}")  # prints a DataFrame

          a         b         c         d         e
0  0.574349  0.140711  0.114021  0.056545  0.171142
1  0.909886  0.781379  0.588740  0.471729  0.192527
2  0.198274  0.585205  0.293764  0.261037  0.463744
--------------------------------------------------
series类型：
0    0.574349
1    0.909886
2    0.198274
Name: a, dtype: float64
--------------------------------------------------
dataframe类型：
          a
0  0.574349
1  0.909886
2  0.198274


# 对齐操作

In [12]:
# Two Series whose labels only partly overlap: 'b' and 'c' are shared,
# 'a' exists only in s1 and 'd' only in s2.
s1 = pd.Series({'a': 1, 'b': 2, 'c': 3})
s2 = pd.Series({'b': 4, 'c': 5, 'd': 6})
print(s1, s2, '-' * 50, sep='\n')
# "+" aligns the operands on their labels and the result carries the union of
# both indexes; a label missing from either side comes out as NaN.
print(s1 + s2)
print('-' * 50)
# add() with fill_value=0 aligns the same way but substitutes 0 for the
# missing side, so every label gets a number.
print(s1.add(s2, fill_value=0))

a    1
b    2
c    3
dtype: int64
b    4
c    5
d    6
dtype: int64
--------------------------------------------------
a    NaN
b    6.0
c    8.0
d    NaN
dtype: float64
--------------------------------------------------
a    1.0
b    6.0
c    8.0
d    6.0
dtype: float64


# 处理缺失数据

In [13]:
# Sample frame for the missing-data examples: one row of random numbers
# followed by three fixed rows, two of which contain NaN.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1., 2., np.nan],
    [np.nan, 4., np.nan],
    [1., 2., 3.],
])
# The frame itself, then a same-shaped boolean mask that is True where a
# value is missing.
print(df_data, '-' * 50, df_data.isna(), '-' * 50, sep='\n')
# Share of missing values per column: NaN count divided by the row count.
print(df_data.isna().sum().div(len(df_data)))

          0        1         2
0  0.407843  0.30905  1.410581
1  1.000000  2.00000       NaN
2       NaN  4.00000       NaN
3  1.000000  2.00000  3.000000
--------------------------------------------------
       0      1      2
0  False  False  False
1  False  False   True
2   True  False   True
3  False  False  False
--------------------------------------------------
0    0.25
1    0.00
2    0.50
dtype: float64


In [14]:
# Dropping missing data.
# Row-wise with how='any' (the defaults, spelled out): every row that holds
# at least one NaN is removed.
print(df_data.dropna(axis=0, how='any'))
print('-' * 50)
# Restricting the check to column 0 removes only the rows whose column-0
# value is NaN.
print(df_data.dropna(subset=[0]))

          0        1         2
0  0.407843  0.30905  1.410581
3  1.000000  2.00000  3.000000
--------------------------------------------------
          0        1         2
0  0.407843  0.30905  1.410581
1  1.000000  2.00000       NaN
3  1.000000  2.00000  3.000000


In [15]:
# Fill missing data with fillna(): each NaN is replaced by the mean of its
# own column. df_data.mean() yields one mean per column (computed from the
# non-missing entries), and fillna() matches those means to the columns by
# label, so a single call handles however many columns the frame has instead
# of repeating one assignment per column.
df_data = df_data.fillna(df_data.mean())
print(df_data)

          0        1         2
0  0.407843  0.30905  1.410581
1  1.000000  2.00000  2.205290
2  0.802614  4.00000  2.205290
3  1.000000  2.00000  3.000000


# 数据连接

In [25]:
# Two frames whose join keys live under different column names
# ('key1' in df1, 'key2' in df2).
df1 = pd.DataFrame({
    'key1': list('bbacaab'),
    'data1': np.random.randint(0, 10, size=7),
})
df2 = pd.DataFrame({
    'key2': ['a', 'b', 'd'],
    'data2': np.random.randint(0, 10, size=3),
})
print(df1, '-' * 50, df2, '-' * 50, sep='\n')
# A key-less pd.merge(df1, df2) is intentionally not run here: the two frames
# share no column name, so there is nothing to join on.
# NOTE(review): pandas is expected to raise MergeError for that call - confirm.
print('-' * 50)
# Outer join matching key1 against key2: keys found on only one side are
# kept, and the other side's columns are filled with NaN.
print(df1.merge(df2, left_on='key1', right_on='key2', how='outer'))

  key1  data1
0    b      3
1    b      4
2    a      3
3    c      1
4    a      0
5    a      4
6    b      2
--------------------------------------------------
  key2  data2
0    a      9
1    b      1
2    d      5
--------------------------------------------------
--------------------------------------------------
  key1  data1 key2  data2
0    a    3.0    a    9.0
1    a    0.0    a    9.0
2    a    4.0    a    9.0
3    b    3.0    b    1.0
4    b    4.0    b    1.0
5    b    2.0    b    1.0
6    c    1.0  NaN    NaN
7  NaN    NaN    d    5.0
