### pandas 基本操作


In [1]:
# 1 导入数据库
import pandas as pd
pd.__version__

'1.1.5'

In [2]:
# 2 创建Series
arr = [1,2,3,4,5]
df = pd.Series(arr) #如果不指定索引，则默认从 0 开始
df

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [3]:
# 3 从字典创建Series  不是df
d = {'a':1,'b':2,'c':3,'d':4,'e':5}
df = pd.Series(d)
df

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [5]:
# 从Numpy 数组创建DataFrame
import numpy as np

dates = pd.date_range('today',periods=6) # 定义时间序列作为 index
num_arr = np.random.randn(6,4) # 传入 numpy 随机数组
num_arr
columns = ['A','B','C','D'] # 将列表作为列名
df = pd.DataFrame(num_arr, index = dates, columns = columns)
df

Unnamed: 0,A,B,C,D
2021-05-28 11:01:46.117117,-0.685967,-1.335861,-0.254415,-2.672227
2021-05-29 11:01:46.117117,2.478526,-1.130746,-1.976778,0.106129
2021-05-30 11:01:46.117117,0.147527,0.127145,1.972152,1.100032
2021-05-31 11:01:46.117117,-0.504891,0.49486,0.889161,0.575903
2021-06-01 11:01:46.117117,-1.577483,-0.402318,0.413941,1.150724
2021-06-02 11:01:46.117117,0.014062,-1.880337,0.94524,-0.530013


In [6]:
import numpy as np

data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)
df.info


<bound method DataFrame.info of   animal  age  visits priority
a    cat  2.5       1      yes
b    cat  3.0       3      yes
c  snake  0.5       2       no
d    dog  NaN       3      yes
e    dog  5.0       2       no
f    cat  2.0       3       no
g  snake  4.5       1       no
h    cat  NaN       1      yes
i    dog  7.0       2       no
j    dog  3.0       1       no>

In [7]:
df.iloc[:3]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [8]:
# 取出其中两列 
df[['animal','age']]

Unnamed: 0,animal,age
a,cat,2.5
b,cat,3.0
c,snake,0.5
d,dog,
e,dog,5.0
f,cat,2.0
g,snake,4.5
h,cat,
i,dog,7.0
j,dog,3.0


In [9]:
2000*19*19

722000

In [11]:
### 取出索引为【3，4，8】行的animal和age列
df.loc[df.index[[3,4,8]],['animal','age']]



Unnamed: 0,animal,age
d,dog,
e,dog,5.0
i,dog,7.0


In [12]:
# 取出age大于3的行
df[df['age']>3]
df[df['visits']>1]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
i,dog,7.0,2,no


In [18]:
df[df['age'].isnull()]

Unnamed: 0,animal,age,visits,priority
d,dog,,3,yes
h,cat,,1,yes


In [19]:
# 13.取出age在2,4间的行（不含）
df[(df['age']>2) & (df['age']<4)]

# 方法二  between函数的使用
df[df['age'].between(2, 4)]

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
f,cat,2.0,3,no
j,dog,3.0,1,no


In [20]:
# 14. `f行的age改为1.5
df.loc['f','age'] = 1.6
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,1.6,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [21]:
# 15. 计算visits的总和
df['visits'].sum()

19

In [22]:
df.head()

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no


In [23]:
# 16. 计算每个不同种类animal的age的平均数
a = df.groupby('animal')['age'].mean()
a = pd.DataFrame(a)
a

Unnamed: 0_level_0,age
animal,Unnamed: 1_level_1
cat,2.366667
dog,5.0
snake,2.5


In [24]:
# 17. 计算df中每个种类animal的数量
df['animal'].value_counts()


dog      4
cat      4
snake    2
Name: animal, dtype: int64

In [25]:
# 18. 先按age降序排列，后按visits升序排列
df.sort_values(by=['age','visits'],ascending=[False,True])

Unnamed: 0,animal,age,visits,priority
i,dog,7.0,2,no
e,dog,5.0,2,no
g,snake,4.5,1,no
j,dog,3.0,1,no
b,cat,3.0,3,yes
a,cat,2.5,1,yes
f,cat,1.6,3,no
c,snake,0.5,2,no
h,cat,,1,yes
d,dog,,3,yes


In [30]:
data = {'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
        'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
        'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
        'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no']}

labels = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
df = pd.DataFrame(data, index=labels)
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [31]:
# 19. 将priority列中的yes, no替换为布尔值True, False map函数的使用
df['priority'] = df['priority'].map({'no':False,'yes':True})
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,True
b,cat,3.0,3,True
c,snake,0.5,2,False
d,dog,,3,True
e,dog,5.0,2,False
f,cat,2.0,3,False
g,snake,4.5,1,False
h,cat,,1,True
i,dog,7.0,2,False
j,dog,3.0,1,False


In [29]:
# 20. 将animal列中的snake替换为python
df['animal'] = df['animal'].replace('snake','python')
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,
b,cat,3.0,3,
c,python,0.5,2,
d,dog,,3,
e,dog,5.0,2,
f,cat,1.6,3,
g,python,4.5,1,
h,cat,,1,
i,dog,7.0,2,
j,dog,3.0,1,


In [23]:
# 21. 对每种animal的每种不同数量visits，计算平均age，即，返回一个表格，
# 行是aniaml种类，列是visits数量，表格值是行动物种类列访客数量的平均年龄

df = df.fillna(0)
df1 = pd.pivot_table(data=df,index='animal',columns='visits',values='age',aggfunc=np.sum)
df1


visits,1,2,3
animal,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cat,2.5,,5.0
dog,3.0,12.0,0.0
snake,4.5,0.5,


In [32]:
# 22. 在df中插入新行k，然后删除该行
df.loc['k'] = [5.5,'dog','no',2]
df = df.drop('k')
df

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,1
b,cat,3.0,3,1
c,snake,0.5,2,0
d,dog,,3,1
e,dog,5.0,2,0
f,cat,2.0,3,0
g,snake,4.5,1,0
h,cat,,1,1
i,dog,7.0,2,0
j,dog,3.0,1,0


In [54]:
# 透视表的使用
import os
os.listdir('./')
df = pd.read_csv('./James_Harden.csv')
df.head()   # 前五行
# 练习透视表
df.tail()   # 后5行
#pivot_table(data, values=None, index=None, columns=None,aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All')
# pivot_table有四个最重要的参数index、values、columns、aggfunc，本文以这四个参数为中心讲解pivot操作是如何进行。
df

# 需求1   想看哈登 对阵每个队伍的得分
pd.pivot_table(df,index='对手')


Unnamed: 0_level_0,Unnamed: 1_level_0,3分命中率,助攻,命中,得分,投篮命中率,投篮数,篮板
对手,主客场,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
76人,主,0.429,7.0,8.0,29.0,0.381,21.0,4.0
76人,客,0.25,13.0,10.0,27.0,0.5,20.0,3.0
勇士,客,0.444,11.0,10.0,27.0,0.435,23.0,6.0
国王,客,0.286,9.0,8.0,27.0,0.381,21.0,3.0
太阳,客,0.545,7.0,12.0,48.0,0.545,22.0,2.0
小牛,主,0.462,7.0,10.0,29.0,0.526,19.0,3.0
尼克斯,主,0.385,10.0,12.0,37.0,0.444,27.0,2.0
尼克斯,客,0.353,9.0,9.0,31.0,0.391,23.0,5.0
开拓者,客,0.571,3.0,16.0,48.0,0.552,29.0,8.0
掘金,主,0.143,9.0,6.0,21.0,0.375,16.0,8.0


In [55]:
# 需求2 对手成为了第一层索引，还想看看对阵同一对手在不同主客场下的数据，试着将对手与胜负与主客场都设置为index
#Index就是层次字段，要通过透视表获取什么信息就按照相应的顺序设置字段，所以在进行pivot之前你也需要足够了解你的数据。
pd.pivot_table(df,index=['对手','主客场'])

Unnamed: 0_level_0,Unnamed: 1_level_0,3分命中率,助攻,命中,得分,投篮命中率,投篮数,篮板
对手,主客场,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
76人,主,0.429,7.0,8.0,29.0,0.381,21.0,4.0
76人,客,0.25,13.0,10.0,27.0,0.5,20.0,3.0
勇士,客,0.444,11.0,10.0,27.0,0.435,23.0,6.0
国王,客,0.286,9.0,8.0,27.0,0.381,21.0,3.0
太阳,客,0.545,7.0,12.0,48.0,0.545,22.0,2.0
小牛,主,0.462,7.0,10.0,29.0,0.526,19.0,3.0
尼克斯,主,0.385,10.0,12.0,37.0,0.444,27.0,2.0
尼克斯,客,0.353,9.0,9.0,31.0,0.391,23.0,5.0
开拓者,客,0.571,3.0,16.0,48.0,0.552,29.0,8.0
掘金,主,0.143,9.0,6.0,21.0,0.375,16.0,8.0


In [59]:
#通过上面的操作，我们获取了james harden在对阵对手时的所有数据，而Values可以对需要的计算数据进行筛选，
# 如果我们只需要james harden在主客场和不同胜负情况下的得分、篮板与助攻三项数据：
df
pd.pivot_table(df,index=['主客场','胜负'],values=['篮板','助攻','得分'])


Unnamed: 0_level_0,Unnamed: 1_level_0,助攻,得分,篮板
主客场,胜负,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
主,胜,10.555556,34.222222,5.444444
主,负,8.666667,29.666667,5.0
客,胜,9.0,32.0,4.916667
客,负,8.0,20.0,4.0


In [63]:
# aggfunc  参数可以设置我们对数据聚合时进行的函数操作。
# 当我们未设置aggfunc时，它默认aggfunc='mean'计算均值。
# 我们还想要获得james harden在主客场和不同胜负情况下的总得分、总篮板、总助攻时：
import numpy as np
pd.pivot_table(df,index=['主客场','胜负'],values=['得分','助攻','篮板'],aggfunc=[np.sum,np.mean,np.max])


Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,mean,mean,mean,amax,amax,amax
Unnamed: 0_level_1,Unnamed: 1_level_1,助攻,得分,篮板,助攻,得分,篮板,助攻,得分,篮板
主客场,胜负,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
主,胜,95,308,49,10.555556,34.222222,5.444444,17,56,11
主,负,26,89,15,8.666667,29.666667,5.0,11,38,6
客,胜,108,384,59,9.0,32.0,4.916667,15,48,10
客,负,8,20,4,8.0,20.0,4.0,8,20,4


In [67]:
# columns
#Columns类似Index可以设置列层次字段，它不是一个必要参数，作为一种分割数据的可选方式。
#fill_value填充空值,margins=True进行汇总
pd.pivot_table(df,index=['主客场'],columns=['对手'],values=['得分'],aggfunc=[np.sum],fill_value=0,margins=1)



Unnamed: 0_level_0,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分,得分
对手,76人,勇士,国王,太阳,小牛,尼克斯,开拓者,掘金,步行者,湖人,灰熊,爵士,猛龙,篮网,老鹰,骑士,鹈鹕,黄蜂,All
主客场,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
主,29,0,0,0,29,37,0,21,29.0,0,30.0,56.0,38,37,0,35,26,0,33.083333
客,27,27,27,48,0,31,48,0,26.0,36,24.5,29.0,0,0,29,0,0,27,31.076923
All,28,27,27,48,29,34,48,21,27.5,36,27.25,42.5,38,37,29,35,26,27,32.04


In [69]:
# 不同columns
table=pd.pivot_table(df,index=[u'对手',u'胜负'],columns=[u'主客场'],values=[u'得分',u'助攻',u'篮板'],aggfunc=[np.mean],fill_value=0)
table
table.query('对手 == ["灰熊"]')

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,助攻,助攻,得分,得分,篮板,篮板
Unnamed: 0_level_2,主客场,主,客,主,客,主,客
对手,胜负,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
76人,胜,0,13,0,27,0,3
76人,负,7,0,29,0,4,0
勇士,胜,0,11,0,27,0,6
国王,胜,0,9,0,27,0,3
太阳,胜,0,7,0,48,0,2
小牛,胜,7,0,29,0,3,0
尼克斯,胜,10,9,37,31,2,5
开拓者,胜,0,3,0,48,0,8
掘金,胜,9,0,21,0,8,0
步行者,胜,10,15,29,26,8,5


In [76]:
# 统计哈登不同主客场与胜负下的场数以及投篮命中率（哈登在主场且胜利的场数以及在这种情况下的投篮命中率）
df1 = df[df['胜负']=='胜']
df1
pd.pivot_table(df,index=['主客场'],columns=['胜负','对手'],values=['投篮命中率'],aggfunc=[np.sum],fill_value=0)

Unnamed: 0_level_0,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率,投篮命中率
胜负,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,胜,负,负,负
对手,76人,勇士,国王,太阳,小牛,尼克斯,开拓者,掘金,步行者,湖人,灰熊,爵士,篮网,老鹰,骑士,鹈鹕,黄蜂,76人,灰熊,猛龙
主客场,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4
主,0.0,0.0,0.0,0.0,0.526,0.444,0.0,0.375,0.364,0.0,0.44,0.76,0.65,0.0,0.381,0.5,0.0,0.381,0.4,0.32
客,0.5,0.435,0.381,0.545,0.0,0.391,0.552,0.0,0.429,0.591,0.45,0.421,0.0,0.533,0.0,0.0,0.444,0.0,0.316,0.0


In [86]:
# 23. 有一列整数列A的DatraFrame，删除数值重复的行
df = pd.DataFrame({'A': [1, 2, 2, 3, 4, 5, 5, 5, 6, 7, 7]})
df
# df1 =df.drop_duplicates(subset='A')
df1 = df.loc[df['A'].shift() != df['A']]
df1
df.drop_duplicates(subset='A')

Unnamed: 0,A
0,1
1,2
3,3
4,4
5,5
8,6
9,7


In [27]:
import pandas as pd
import numpy as np
# 24. 一个全数值DatraFrame，每个数字减去该行的平均数  sub用法

df = pd.DataFrame(np.random.randint(low=1,high=10,size=(5, 3)))
print(df)
print(df.mean(axis=0))  # axis=0 为按照列求均值
print(df.mean(axis=1))  # axis=1 为按照行求均值
df1 = df.sub(df.mean(axis=1),axis=0)  #  axis=1 按照行取均值，然后按照列去减
df1

print(df1)

# importing pandas as pd 
import pandas as pd 
# Creating the dataframe  
df_2 = pd.DataFrame({"A":[1, 5, 3, 4, 2], 
                   "B":[3, 2, 4, 3, 4],  
                   "C":[2, 2, 7, 3, 4],  
                   "D":[4, 3, 6, 12, 7]},  
                   index =["A1", "A2", "A3", "A4", "A5"]) 


print('df_2',df_2)
sr = pd.Series([12, 25, 64, 18], index =["A", "B", "C", "D"])
df_sub = df_2.sub(sr, axis = 1)    
df_sub


   0  1  2
0  2  3  1
1  5  8  9
2  6  8  7
3  1  4  2
4  3  9  5
0    3.4
1    6.4
2    4.8
dtype: float64
0    2.000000
1    7.333333
2    7.000000
3    2.333333
4    5.666667
dtype: float64
df_2     A  B  C   D
A1  1  3  2   4
A2  5  2  2   3
A3  3  4  7   6
A4  4  3  3  12
A5  2  4  4   7


Unnamed: 0,A,B,C,D
A1,-11,-22,-62,-14
A2,-7,-23,-62,-15
A3,-9,-21,-57,-12
A4,-8,-22,-61,-6
A5,-10,-21,-60,-11


In [18]:
# 25. 一个有5列的DataFrame，求哪一列的和最小
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.random.randint(low=0,high=10,size=(5,4)),columns=list('abcd'))
df1['index'] = list('一二三四五')
df1 = df1.set_index('index',drop=True)
df1
df1.sum(axis=0).idxmin()  # 找出一列和最小
df1.sum(axis=1).idxmin()  # 找出哪一行和最小


'三'

In [63]:
# 26. 给定DataFrame，求代号列不同分组数据中  ‘数值’数值最大的三个值的和 
df = pd.DataFrame({'代号': list('aaabbcaabcccbbc'), 
                   '数值': [12,345,3,1,45,14,4,52,54,23,235,21,57,3,87]})

# 方法1
df1 = df.groupby('代号')['数值'].nlargest(3).sum(level=0)   # 主要是groupby的用法
print('方法1',df1)

# 方法2
def nlargest_sum(data,n):
    data = data.sort_values(ascending=False)
    return data[:n].sum()

df_nlagest = df.groupby('代号')['数值'].apply(lambda x : nlargest_sum(x,n=3))
print('方法2',df_nlagest)


方法1 代号
a    409
b    156
c    345
Name: 数值, dtype: int64
方法2 代号
a    409
b    156
c    345
Name: 数值, dtype: int64


In [64]:
# 27. 给定DataFrame，有列A, B，A的值在1-100（含），对A列数值 10步长，求对应的B的值和
# 练习 pd.cut()函数
df = pd.DataFrame({'A': [1,2,11,11,33,34,35,40,79,99], 
                   'B': [1,2,11,11,33,34,35,40,79,99]})
cut_data = pd.cut(df['A'],bins=np.arange(0,101,10))
bins = np.arange(0,101,10)          # np用法
df.groupby(cut_data)['B'].sum()     # 这个用法并不熟悉


A
(0, 10]        3
(10, 20]      22
(20, 30]       0
(30, 40]     142
(40, 50]       0
(50, 60]       0
(60, 70]       0
(70, 80]      79
(80, 90]       0
(90, 100]     99
Name: B, dtype: int64

In [78]:
# np.r_ 按照行   np.c_是按照列进行矩阵的拼接
import numpy as np
x = np.arange(9).reshape(3, 3)
y = np.ones([3, 3])
c = np.r_[x, y]
d = np.c_[x, y]

array([[0., 1., 2., 1., 1., 1.],
       [3., 4., 5., 1., 1., 1.],
       [6., 7., 8., 1., 1., 1.]])

In [None]:
np.searchsorted

In [90]:
import numpy as np
import pandas as pd

# 28. 给定DataFrame，计算每个元素至左边0的距离  生成新列y 
df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
# 左边 0 的位置，然后所有的数据的位置 
first_zero_index = df.loc[df['X']==0].index[0]
index_all = np.arange(0,df.shape[0],1)
df['Y'] = abs(index_all - first_zero_index)  
# print(df)

# 28. 给定DataFrame，计算每个元素至左边最近的0 如果左边没有0那就是到开头的距离的距离，生成新列y 
df = pd.DataFrame({'X': [7, 2, 0, 3, 4, 2, 5, 0, 3, 4]})
izero = np.r_[-1, (df['X'] == 0).to_numpy().nonzero()[0]] # 标记0的位置
print('izero',izero)
idx = np.arange(len(df))
print(idx)


df['Y'] = idx - izero[np.searchsorted(izero - 1, idx) - 1]
print(df)

izero [-1  2  7]
[0 1 2 3 4 5 6 7 8 9]
   X  Y
0  7  1
1  2  2
2  0  0
3  3  1
4  4  2
5  2  3
6  5  4
7  0  0
8  3  1
9  4  2
