In [4]:
import pandas as pd
import numpy as np
import geopandas as gpd
import transbigdata as tbd
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Load the stay (activity) records and order them by 'rank'.
stay = pd.read_csv(r'staymove/sh_2311_lifepattern_activity.csv').sort_values(by='rank')
# NOTE(review): the result of this expression is not assigned, so `stay` is unchanged by it.
stay['reindex'].drop_duplicates()

# Load the move records, fill a missing end type with 'O_0', and order them by 'rank'.
move = (
    pd.read_csv(r'staymove/sh_2311_lifepattern_move.csv')
    .fillna({'etype': 'O_0'})
    .sort_values(by='rank')
)
# NOTE(review): the result of this expression is not assigned, so `move` is unchanged by it.
move['reindex'].drop_duplicates()


# Rename 'shour' to 'hour' so the move columns line up with the stay life pattern,
# and build 'type' as "<stype>.<etype>".
move = (
    move
    .rename(columns={'shour': 'hour'})
    .assign(type=lambda frame: frame['stype'] + '.' + frame['etype'])
)

# Keep only the columns needed for the life pattern.
move_lifepattern = move.loc[:, ['reindex', 'type', 'hour', 'count']]

# Helper that expands an hour span into the individual hours it covers
def expand_hours(row):
    """Return every hour from row['shour'] to row['ehour'], both inclusive.

    When the start hour is later than the end hour the span is treated as
    wrapping past midnight: start..23 followed by 0..end.
    """
    first, last = row['shour'], row['ehour']
    if first > last:
        # Span crosses midnight: finish the current day, then restart at hour 0.
        return [*range(first, 24), *range(0, last + 1)]
    return [*range(first, last + 1)]

# Expand each distinct (type, shour, ehour) stay window into one row per covered hour.
stay_tag = stay[['type', 'shour', 'ehour']].drop_duplicates()
expanded_hours = stay_tag.apply(expand_hours, axis=1)

# Repeat every window once per covered hour, attach the flattened hour list,
# and give the result a fresh 0..n-1 index.
df_expanded = (
    stay_tag
    .loc[stay_tag.index.repeat(expanded_hours.map(len))]
    .assign(hour=[hour for hours in expanded_hours for hour in hours])
    .reset_index(drop=True)
)

# Join the stay records with their hourly expansion (merge on the shared columns)
# and total 'count' per ('reindex', 'type', 'hour').
stay_lifepattern = (
    stay.merge(df_expanded)
    .groupby(['reindex', 'type', 'hour'], as_index=False)['count']
    .sum()
)

# A stay is encoded as a type pair "<type>.<type>", i.e. origin and destination are equal.
stay_lifepattern['type'] = stay_lifepattern['type'].str.cat(stay_lifepattern['type'], sep='.')

# Stack moves and stays, then split the type pair into origin ('otype') and destination ('dtype').
lifepattern = pd.concat([move_lifepattern, stay_lifepattern])
lifepattern['otype'] = [pair.split('.')[0] for pair in lifepattern['type']]
lifepattern['dtype'] = [pair.split('.')[1] for pair in lifepattern['type']]

0          100000
45          99999
94          99998
133         99997
193         99996
            ...  
4432785         5
4432816         4
4432839         3
4432875         2
4432914         1
Name: reindex, Length: 100000, dtype: int64

In [68]:
# pandarallel provides the `parallel_apply` method used on the per-user groupby further down.
from pandarallel import pandarallel
# Initialise pandarallel's workers once per kernel; progress_bar=True requests a progress display.
pandarallel.initialize(progress_bar=True)

def generate_seq(lifepattern_i, days=100, seq_type='matrix', starttime='2020-01-01', seed=None):
    """Simulate an hourly state sequence for one life pattern with a Markov chain.

    Parameters
    ----------
    lifepattern_i : pandas.DataFrame
        Rows of one life pattern. The columns 'hour', 'otype', 'dtype' and
        'count' are read: 'count' weights the transition from state 'otype'
        to state 'dtype' at hour 'hour'.
    days : int, default 100
        Number of simulated days; the sequence holds 24 * days hourly states.
    seq_type : {'matrix', 'df'}, default 'matrix'
        'matrix' returns a list of `days` lists with 24 states each.
        'df' returns a DataFrame with columns ['time', 'type'] that keeps only
        the rows where the state differs from the previous hour.
    starttime : str, default '2020-01-01'
        Timestamp of hour 0, used only when seq_type == 'df'.
    seed : int or None, default None
        When None the global numpy random state is used (previous behaviour).
        Otherwise a private numpy RandomState seeded with this value is used,
        which makes the result reproducible.

    Returns
    -------
    list[list] or pandas.DataFrame
        See `seq_type`.

    Raises
    ------
    ValueError
        If `seq_type` is neither 'matrix' nor 'df'.
    """
    # TODO: distinguish workdays from non-workdays
    if seq_type not in ('matrix', 'df'):
        # Fail before simulating instead of silently returning None afterwards.
        raise ValueError(f"seq_type must be 'matrix' or 'df', got {seq_type!r}")

    # np.random (module) and RandomState expose the same choice/uniform API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _transition_table(group):
        # Candidate next states and their probabilities (normalised counts).
        probs = (group['count'] / group['count'].sum()).to_numpy(dtype=float)
        return group['dtype'].to_numpy(), probs

    # Transition tables keyed by (hour, current state), plus an hour-only
    # fallback for states that have no recorded transition at that hour.
    table_by_hour_state = {
        key: _transition_table(group)
        for key, group in lifepattern_i.groupby(['hour', 'otype'])
    }
    table_by_hour = {
        key: _transition_table(group)
        for key, group in lifepattern_i.groupby('hour')
    }

    # NOTE(review): groupby sorts its keys, so this is the first 'otype' in sort
    # order, not the one with the largest total count - confirm that is intended.
    initstate = lifepattern_i.groupby(['otype'])['count'].sum().index[0]

    # Markov chain
    currentstate = initstate
    allstates = [currentstate]
    repeattimes = 0
    for step in range(24 * days - 1):
        currenthour = step % 24
        table = table_by_hour_state.get((currenthour, currentstate))
        if table is None:
            # No transition recorded for this state at this hour:
            # draw from all transitions observed at this hour instead.
            table = table_by_hour.get(currenthour)
        if table is None:
            nextstate = initstate
        else:
            choices, probs = table
            nextstate = rng.choice(choices, size=1, p=probs)[0]

        # Reset to the initial state once the same state has repeated 24 times in a row.
        if nextstate == currentstate:
            repeattimes += 1
            if repeattimes == 24:
                nextstate = initstate
                repeattimes = 0
        else:
            repeattimes = 0
        currentstate = nextstate
        allstates.append(currentstate)

    if seq_type == 'matrix':
        return np.array(allstates).reshape(-1, 24).tolist()

    # seq_type == 'df': stamp each hourly state with a random second inside its hour.
    seq = pd.DataFrame(allstates, columns=['type'])
    second_in_hour = rng.uniform(0 * 60, 60 * 60, len(seq)).astype(int)
    elapsed_seconds = np.arange(len(seq)) * 3600 + second_in_hour
    seq['time'] = pd.Timestamp(starttime) + pd.to_timedelta(elapsed_seconds, unit='s')
    # Keep only the rows where the state changes with respect to the previous hour.
    seq = seq[seq['type'].shift() != seq['type']]
    return seq[['time', 'type']]



INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [69]:
# Example run for a single 'reindex' value: 100 simulated days, returned as a
# DataFrame that lists only the hours at which the state changes.
uid = 1
allstates = generate_seq(
    lifepattern.loc[lifepattern['reindex'] == uid],
    days=100,
    seq_type='df',
)


In [None]:

# Run generate_seq (days=100, default seq_type) once per 'reindex' group via
# pandarallel's parallel_apply and collect the per-group results into a list.
generated_sequences_long_shaped = list(lifepattern.groupby(['reindex']).parallel_apply(lambda lifepattern_i:generate_seq(lifepattern_i,days = 100)))

# Save as a pkl file
import pickle
with open('generated_sequences.pkl', 'wb') as f:
    pickle.dump(generated_sequences_long_shaped, f)


In [308]:
# Number of generated sequences (one list entry per 'reindex' group).
# The former loop here only built 5000-item slices and discarded them, so it
# had no effect; without it the length is the cell's displayed value.
len(generated_sequences_long_shaped)

100000

In [310]:
# NOTE(review): `a` is not assigned in any cell above this one in the visible
# notebook; the only visible assignment is inside the chunk-export loop of a
# later cell - confirm this cell still runs on a fresh top-to-bottom execution.
len(a)

5000

In [314]:
# Write the generated sequences to disk in fixed-size chunks, one pickle file per chunk.
import os
import pickle

CHUNK_SIZE = 5000  # sequences per output file
os.makedirs('seq', exist_ok=True)  # open() does not create a missing directory

# Ceiling division: every sequence is exported even when the total is not a
# multiple of CHUNK_SIZE (100000 sequences give files 0..19).
n_chunks = -(-len(generated_sequences_long_shaped) // CHUNK_SIZE)
for i in range(n_chunks):
    # The names `a` and `i` are kept because the `len(a)` cell reads `a` from the kernel.
    a = generated_sequences_long_shaped[i*CHUNK_SIZE:(i+1)*CHUNK_SIZE]
    # Save as a pkl file
    with open(f'seq/generated_sequences_{i}.pkl', 'wb') as f:
        pickle.dump(a, f)