# 模块导入

In [3]:
import pandas as pd
import os

from os import path

In [4]:
import pickle

# 定义函数

In [5]:
def get_data(file_name):
    '''
    Load a raw record file and clean it.

    The reader is chosen from the file extension (compared
    case-insensitively): ``.csv`` files are read with GBK encoding and
    ``.xlsx`` files with ``pd.read_excel``.  The ``imei_``, ``area_`` and
    ``msisdn_`` columns are then dropped and tab characters are removed from
    every text column.

    Parameters
    ----------
    file_name : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned table.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.xlsx``.
    KeyError
        If any of the three columns to drop is missing from the file.
    '''
    file_extension = path.splitext(file_name)[1].lower()
    if file_extension == '.csv':
        data = pd.read_csv(file_name, encoding='gbk')
    elif file_extension == '.xlsx':
        # No ``encoding`` argument here: recent pandas releases reject it
        # for read_excel with a TypeError.
        data = pd.read_excel(file_name)
    else:
        # Fail with a clear message instead of hitting an unbound ``data``.
        raise ValueError(
            'unsupported file extension %r (expected .csv or .xlsx): %s'
            % (file_extension, file_name)
        )

    data = data.drop(columns=['imei_', 'area_', 'msisdn_'])

    for each in data.columns:
        column = data[each]
        # Cover both plain object columns and dedicated string dtypes.
        is_text = (pd.api.types.is_object_dtype(column.dtype)
                   or pd.api.types.is_string_dtype(column.dtype))
        if is_text:
            # Literal (non-regex) removal of tab characters.
            data[each] = column.str.replace('\t', '', regex=False)
    print('data processed successfully')
    return data

In [6]:
def convert_to_time(data):
    '''
    Use the parsed ``update_time_`` column as the frame's index.

    The frame is modified in place (only its index is replaced; the
    ``update_time_`` column itself stays as it was) and the same object is
    returned.
    '''
    parsed_times = pd.to_datetime(data['update_time_'])
    data.index = parsed_times
    print('successfully')
    return data

In [66]:
def trace(data, treshold = 1):
    '''
    Build the per-user trajectory table.

    Rows are kept only for users (``imsi_`` values) that occur at least
    ``treshold`` times in ``data``.  The kept rows are grouped by
    ``imsi_`` and ``update_time_`` and the ``device_id_`` values are counted,
    giving one column per device with 0 where a device was not seen.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``imsi_``, ``update_time_`` and ``device_id_``
        columns.  It is not modified.
    treshold : int, default 1
        Minimum number of records a user needs in order to be traced.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``imsi_``, ``update_time_``) with one count column per
        ``device_id_`` value.
    '''
    # Per-row occurrence count of that row's imsi_, kept in a local Series
    # so the caller's frame does not gain a 'Frequency' column.
    frequency = data['imsi_'].map(data['imsi_'].value_counts())
    print('trace users successfully')
    frequent_users = data[frequency >= treshold]
    device_counts = (
        frequent_users
        .groupby(['imsi_', 'update_time_'])['device_id_']
        .value_counts()
    )
    return device_counts.unstack(fill_value=0)

In [72]:
#数据转化部分

def data_transform(data):
    '''
    Convert a two-level-indexed table into a nested dict.

    Index level 1 is moved into a regular column, then for every distinct
    value of the remaining index the matching rows are exported as a
    ``{column name: list of values}`` dict.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a two-level index, e.g. the output of ``trace``.

    Returns
    -------
    dict
        ``{index value: {column name: [values, ...]}}``.
    '''
    group_data = data.reset_index(level=1)
    data_storage = {}
    for each in group_data.index.unique():
        # Selecting with a list of labels always yields a DataFrame, even
        # when only one row matches, so the single-row case needs no
        # special handling and no exception has to be caught.
        data_storage[each] = group_data.loc[[each], :].to_dict('list')
    print('data transform successfully')
    return data_storage

In [61]:
def save_to_pickle(data, file_name='trace_data.txt'):
    '''
    Serialize ``data`` to a file with pickle.

    Parameters
    ----------
    data : object
        Any picklable object.
    file_name : str, default 'trace_data.txt'
        Destination path.  The default keeps the original output file name;
        note that the content is binary pickle data despite the extension.
    '''
    with open(file_name, 'wb') as f:
        pickle.dump(data, f)
    print('save successfully')

In [62]:
def main(file_name='../Project Data/20190524-001.csv', sample_imsi=460069007035670):
    '''
    Run the whole pipeline: load, trace, transform, save.

    Parameters
    ----------
    file_name : str
        Input file passed to ``get_data``.  Defaults to the original
        hard-coded path.
    sample_imsi : int
        Key whose transformed record is printed as a quick sanity check.
        ``None`` is printed when the key is not in the result, so a data
        file without this user no longer aborts the run before saving.
    '''
    # Load the data
    data1 = get_data(file_name)

    # Build the trajectory table
    data_trace = trace(data1, treshold=1)

    # Turn the DataFrame into a nested dict, then store it as binary pickle data
    transform = data_transform(data_trace)
    print(transform.get(sample_imsi))
    print(len(transform.keys()))
    save_to_pickle(transform)

# 函数调用

In [63]:
# Run the full pipeline defined in main().
# NOTE(review): the output recorded for this cell ends in an IndexError;
# re-run on a fresh kernel to confirm whether it still reproduces.
main()

data processed successfully


IndexError: Too many levels: Index has only 2 levels, not 3

# 代码调试部分

## 数据存储 

In [64]:
# Load and clean the raw CSV with get_data.
data1 = get_data('../Project Data/20190524-001.csv')

data processed successfully


In [65]:
# Build the trajectory table, keeping only users with at least 2 records.
data_trace = trace(data1, treshold=2)

In [73]:
data_stor = data_transform(data_trace) # some records turn out to be missing from the result
# Check why these records are missing

data transform successfully


In [35]:
# Write the transformed dict to disk as binary pickle data.
with open('trace_data.txt', 'wb') as f:
    pickle.dump(data_stor, f)

In [70]:
# Inspect the stored entry of one user; the recorded output shows three records.
data_stor[460079768010752]

{'update_time_': ['2019-05-23 12:37:07',
  '2019-05-23 12:46:31',
  '2019-05-23 13:39:46'],
 'DX-SZSC001': [0, 0, 0],
 'LT-SZSC001': [0, 0, 0],
 'YD-SZSC001': [1, 1, 1]}

In [20]:
# Rebuild a DataFrame from the stored dict of lists to view it as a table.
pd.DataFrame(data_stor[460079768010752])

Unnamed: 0,update_time_,DX-SZSC001,LT-SZSC001,YD-SZSC001
0,2019-05-23 12:37:07,0,0,1
1,2019-05-23 12:46:31,0,0,1
2,2019-05-23 13:39:46,0,0,1


In [21]:
# Move index level 1 into a regular column, the same first step data_transform performs.
group_data = data_trace.reset_index(level=1)

In [22]:
def func(x):
    '''
    Record one row's device counts in the module-level dict ``d``.

    The entry is stored as
    ``d[x.name][update_time_] = (DX-SZSC001, LT-SZSC001, YD-SZSC001)``.

    Parameters
    ----------
    x : pandas.Series
        One row holding ``update_time_`` and the three device columns.
        NOTE(review): ``x.name`` is assumed to be the user id (the row's
        index label, as when used with ``group_data.apply(func, axis=1)``)
        -- confirm against the intended caller.
    '''
    print(x)
    # Column names containing '-' cannot be read with attribute access
    # (``x.DX-SZSC001`` is parsed as a subtraction), so index by label.
    temp = {x['update_time_']: (x['DX-SZSC001'], x['LT-SZSC001'], x['YD-SZSC001'])}
    # Key by the row label (a Series itself is unhashable) and create the
    # per-user dict on first use.
    d.setdefault(x.name, {}).update(temp)

In [55]:
group_data.loc[460069007035670, :]# Compared with the data above: ids that raise an error like this one may be caused by sensor detection errors

device_id_
update_time_    2019-05-13 00:55:10
DX-SZSC001                        0
LT-SZSC001                        2
YD-SZSC001                        0
Name: 460069007035670, dtype: object

In [56]:
group_data.loc[460069007035670, :].to_frame().T.to_dict('list') # transpose so the single row's fields become columns

{'update_time_': ['2019-05-13 00:55:10'],
 'DX-SZSC001': [0],
 'LT-SZSC001': [2],
 'YD-SZSC001': [0]}

In [53]:
# Reproduce by hand, for one user, the single-row conversion used in data_transform.
d = {}
d[460069007035670] = {}
d[460069007035670].update(group_data.loc[460069007035670, :].to_frame().T.to_dict('list'))
d

{460069007035670: {'update_time_': ['2019-05-13 00:55:10'],
  'DX-SZSC001': [0],
  'LT-SZSC001': [2],
  'YD-SZSC001': [0]}}

In [74]:
# Check the stored entry of the single-record user after re-running data_transform.
data_stor[460069007035670]

{'update_time_': ['2019-05-13 00:55:10'],
 'DX-SZSC001': [0],
 'LT-SZSC001': [2],
 'YD-SZSC001': [0]}

# TODO

1. 给轨迹追踪增加阈值：只有记录次数达到该阈值的用户才进行轨迹追踪。