In [1]:
import pandas as pd
import numpy as np

In [2]:
"""
case1: df中某列存在序列特征，想要将列中的内容进行合并。
"""

# Toy frame: three rows for session1 followed by two rows for session2,
# each carrying a single letter in `content`.
df = pd.DataFrame(
    dict(
        sid=['session1'] * 3 + ['session2'] * 2,
        content=list('ABCDE'),
    )
)
df

Unnamed: 0,sid,content
0,session1,A
1,session1,B
2,session1,C
3,session2,D
4,session2,E


In [3]:
# Use transform: each group's content values are joined with '|' and the
# joined string is written back onto every row of that group.
df['target'] = df.groupby('sid')['content'].transform('|'.join)
df

Unnamed: 0,sid,content,target
0,session1,A,A|B|C
1,session1,B,A|B|C
2,session1,C,A|B|C
3,session2,D,D|E
4,session2,E,D|E


In [4]:
# Use drop_duplicates() to de-duplicate: keep only the sid/target columns,
# drop repeated rows, then renumber the index from 0.
res = (
    df.loc[:, ['sid', 'target']]
    .drop_duplicates()
    .reset_index(drop=True)
)
res

Unnamed: 0,sid,target
0,session1,A|B|C
1,session2,D|E


In [17]:
"""
case2: 根据df中的两列对数据进行筛选（两列存在关联）
"""

# Goal: select the rows whose score is above the value that `limit`
# holds for their prdLine.
limit = {
    '1': 0.3,
    '2': 0.5,
}
df = pd.DataFrame(
    dict(
        prdLine=list('12211'),
        score=[0.5, 0.2, 0.6, 0.1, 0.9],
    )
)
df

Unnamed: 0,prdLine,score
0,1,0.5
1,2,0.2
2,2,0.6
3,1,0.1
4,1,0.9


In [18]:
"""
思路1: 使用apply函数生成列，再根据列筛选
"""

def judge(_df, limits=None):
    """Tell whether one row's score reaches the threshold of its prdLine.

    Args:
        _df: A single row, i.e. anything indexable by 'prdLine' and 'score'
            (for example the Series that DataFrame.apply passes with axis=1).
        limits: Optional mapping from a prdLine value to its minimum score.
            When omitted, the notebook-level `limit` dict is used, so
            existing `df.apply(judge, axis=1)` calls keep working unchanged.

    Returns:
        The result of `score >= threshold`; the comparison is inclusive.

    Raises:
        KeyError: If the row's prdLine has no entry in the mapping.
    """
    # Only fall back to the global when no mapping was supplied, so the
    # function can be used (and tested) without notebook state.
    thresholds = limit if limits is None else limits
    return _df['score'] >= thresholds[_df['prdLine']]

# Call judge once per row and store its result in the helper column `tmp`.
df['tmp'] = df.apply(judge, axis='columns')
df

Unnamed: 0,prdLine,score,tmp
0,1,0.5,True
1,2,0.2,False
2,2,0.6,True
3,1,0.1,False
4,1,0.9,True


In [19]:
# Use the flag column directly as the boolean mask (no `== True` needed) and
# select rows and columns in a single .loc call instead of chained indexing;
# then renumber the index from 0.
res = df.loc[df['tmp'], ['prdLine', 'score']].reset_index(drop=True)
res

Unnamed: 0,prdLine,score
0,1,0.5
1,2,0.6
2,1,0.9
