# 模块导入

In [3]:
import pandas as pd
import os

from os import path

In [4]:
import pickle

# 定义函数

In [5]:
def get_data(file_name):
    '''
    Load a raw record file and apply the basic clean-up steps.

    Parameters
    ----------
    file_name : str
        Path to a ``.csv`` file (decoded as GBK) or an ``.xlsx`` workbook.
        The extension is matched case-insensitively.

    Returns
    -------
    pandas.DataFrame
        The loaded table without the ``imei_``, ``area_`` and ``msisdn_``
        columns, and with tab characters removed from every text column.

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.xlsx``.
    KeyError
        If one of the columns to drop is absent from the file.
    '''
    file_extension = path.splitext(file_name)[1].lower()
    if file_extension == '.csv':
        data = pd.read_csv(file_name, encoding='gbk')
    elif file_extension == '.xlsx':
        # A workbook stores its own text encoding, so no ``encoding`` argument
        # is passed; newer pandas releases reject that keyword outright.
        data = pd.read_excel(file_name)
    else:
        # Without this branch ``data`` stayed unbound and the function failed
        # later with a confusing UnboundLocalError.
        raise ValueError(
            'unsupported file type {!r}: expected .csv or .xlsx'.format(file_extension)
        )

    drop_col = ['imei_', 'area_', 'msisdn_']
    data = data.drop(columns=drop_col)

    for each in data.columns:
        column = data[each]
        # Text may be stored as ``object`` or as a dedicated string dtype,
        # depending on the pandas version; handle both.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # regex=False: the tab is a literal character, not a pattern.
            data[each] = column.str.replace('\t', '', regex=False)
    print('data processed successfully')
    return data

In [6]:
def convert_to_time(data):
    '''
    Re-index the frame by its parsed ``update_time_`` column.

    The index of ``data`` is replaced in place with the datetime-parsed values
    of ``data['update_time_']``; the column itself is left untouched. The same
    frame object that was passed in is returned.
    '''
    timestamps = pd.to_datetime(data['update_time_'])
    data.index = timestamps
    print('successfully')
    return data

In [7]:
def trace(data, treshold = 1):
    '''
    Build the per-user trajectory table.

    A ``Frequency`` column holding the number of rows recorded for each
    ``imsi_`` is written onto ``data`` itself (the caller's frame is modified).
    Rows whose frequency is at least ``treshold`` (default 1) are kept, and for
    every (``imsi_``, ``update_time_``) pair the occurrences of each
    ``device_id_`` are counted.

    Returns a frame indexed by (``imsi_``, ``update_time_``) with one column
    per ``device_id_`` and 0 where a device did not occur.
    '''
    rows_per_imsi = data['imsi_'].value_counts()
    data['Frequency'] = data['imsi_'].map(rows_per_imsi)

    frequent_rows = data[data['Frequency'] >= treshold]
    device_counts = (
        frequent_rows
        .groupby(['imsi_', 'update_time_'])['device_id_']
        .value_counts()
    )
    return device_counts.unstack(fill_value=0)

In [14]:
# Data conversion

def data_transform(data, keep_single=False):
    '''
    Convert the trajectory table into a plain nested dictionary.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a two-level index; level 1 is moved into a regular column
        and level 0 becomes the dictionary key.
    keep_single : bool, default False
        Controls keys that own exactly one row. With the default they map to
        an empty dict, which is what this function has always produced for
        them. Pass True to store their single row as one-element lists.

    Returns
    -------
    dict
        ``{key: {column: [values, ...]}}`` with one entry per distinct key.

    Notes
    -----
    The original author's remark on the skipped single-row keys: those records
    looked faulty, e.g. a sensor counting the same timestamp twice.
    NOTE(review): with ``treshold=1`` upstream, every user seen only once also
    ends up empty here - confirm that is intended, or pass keep_single=True.
    '''
    group_data = data.reset_index(level=1)
    data_storage = {}
    for each in group_data.index.unique():
        # A list selector always yields a DataFrame, so the single-row case is
        # detected by an explicit length check instead of a caught TypeError.
        rows = group_data.loc[[each], :]
        if len(rows) == 1 and not keep_single:
            data_storage[each] = {}
        else:
            data_storage[each] = rows.to_dict('list')
    print('data transform successfully')
    return data_storage

In [9]:
def save_to_pickle(data, file_name='trace_data.txt'):
    '''
    Serialise ``data`` to disk with pickle.

    Parameters
    ----------
    data : object
        Any picklable object.
    file_name : str, default 'trace_data.txt'
        Destination path. The file is written in binary mode and overwritten
        if it already exists. The default keeps the historical file name.
    '''
    with open(file_name, 'wb') as f:
        pickle.dump(data, f)
    print('save successfully')

In [12]:
def main(file_name='../Project Data/20190524-001.csv', treshold=1):
    '''
    Run the whole pipeline: load, trace, transform and pickle.

    Parameters
    ----------
    file_name : str
        Input file handed to ``get_data``. Defaults to the data set this
        notebook was written against.
    treshold : int, default 1
        Minimum number of records per IMSI, forwarded to ``trace``.
    '''
    # Load and clean the raw records.
    data1 = get_data(file_name)

    # Build the trajectory table.
    data_trace = trace(data1, treshold=treshold)

    # Turn the table into a nested dict, report its size, then pickle it.
    transform = data_transform(data_trace)
    print(len(transform))
    save_to_pickle(transform)

# 函数调用

In [13]:
# Run the full pipeline with its defaults.
main()

data processed successfully
92509
save successfully


# 代码调试部分

## 数据存储 

In [8]:
# Debugging: reload the raw data outside of main() so each step can be inspected.
data1 = get_data('../Project Data/20190524-001.csv')

data processed successfully


In [9]:
# Debugging: same trace step as main(), but with a threshold of 2 instead of 1.
data_trace = trace(data1, treshold=2)

In [21]:
data_stor = data_transform(data_trace) # some entries turn out to be missing
# Check why these entries are missing.

460110417649620 is missing..
460069007035670 is missing..
460015766518368 is missing..
460016661002199 is missing..


In [35]:
# Reuse the helper instead of repeating its open/dump body; it writes the same
# 'trace_data.txt' file and additionally prints a confirmation line.
save_to_pickle(data_stor)

In [31]:
# Inspect the stored entry of a single IMSI.
data_stor[460079768010752]

{'update_time_': ['2019-05-23 12:37:07',
  '2019-05-23 12:46:31',
  '2019-05-23 13:39:46'],
 'DX-SZSC001': [0, 0, 0],
 'LT-SZSC001': [0, 0, 0],
 'YD-SZSC001': [1, 1, 1]}

In [32]:
# Rebuild a DataFrame from the stored dict of lists for the same IMSI.
pd.DataFrame(data_stor[460079768010752])

Unnamed: 0,update_time_,DX-SZSC001,LT-SZSC001,YD-SZSC001
0,2019-05-23 12:37:07,0,0,1
1,2019-05-23 12:46:31,0,0,1
2,2019-05-23 13:39:46,0,0,1


In [24]:
# Same first step as data_transform: move index level 1 into a regular column.
group_data = data_trace.reset_index(level=1)

In [52]:
def func(x):
    '''
    Record one row of ``group_data`` in the module-level dict ``d``.

    ``x`` is a single row (a Series). Its ``update_time_`` value becomes the
    key of a tuple holding the DX / LT / YD device counts, stored under the
    row's label.
    NOTE(review): assumes the row label (``x.name``) is the IMSI, as it is
    when rows come from ``group_data`` - confirm against the caller.
    '''
    print(x)
    # Hyphenated column names cannot be reached with attribute access:
    # ``x.DX-SZSC001`` parses as ``x.DX - SZSC001``. Use item access instead.
    temp = {x['update_time_']: (x['DX-SZSC001'], x['LT-SZSC001'], x['YD-SZSC001'])}
    # A Series is unhashable, so key by the row label, creating the entry on
    # first use.
    d.setdefault(x.name, {}).update(temp)

In [29]:
# Scratch dict used while experimenting with the per-IMSI conversion.
d = {}
# d[222992312514644] = {}
# An IMSI with several rows: .loc returns a DataFrame here.
group_data.loc[222992312514644, :]
# d[222992312514644].update(group_data.loc[222992312514644, :].to_dict('list'))

device_id_,update_time_,DX-SZSC001,LT-SZSC001,YD-SZSC001
imsi_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
222992312514644,2019-05-08 21:30:45,0,1,0
222992312514644,2019-05-09 08:54:47,0,1,0
222992312514644,2019-05-09 21:27:52,0,1,0
222992312514644,2019-05-10 08:48:07,0,1,0
222992312514644,2019-05-10 19:28:31,0,1,0
222992312514644,2019-05-11 13:45:16,0,1,0
222992312514644,2019-05-11 18:25:41,0,1,0
222992312514644,2019-05-13 08:53:11,0,1,0
222992312514644,2019-05-13 17:00:19,0,1,0
222992312514644,2019-05-13 18:46:10,0,1,0


In [28]:
group_data.loc[460069007035670, :] # compared with the rows above: the ids that raise an error may be device detection errors

device_id_
update_time_    2019-05-13 00:55:10
DX-SZSC001                        0
LT-SZSC001                        2
YD-SZSC001                        0
Name: 460069007035670, dtype: object

In [55]:
# Leftover exploration: look up the ``orient`` options of DataFrame.to_dict.
help(pd.DataFrame.to_dict)

Help on function to_dict in module pandas.core.frame:

to_dict(self, orient='dict', into=<class 'dict'>)
    Convert the DataFrame to a dictionary.
    
    The type of the key-value pairs can be customized with the parameters
    (see below).
    
    Parameters
    ----------
    orient : str {'dict', 'list', 'series', 'split', 'records', 'index'}
        Determines the type of the values of the dictionary.
    
        - 'dict' (default) : dict like {column -> {index -> value}}
        - 'list' : dict like {column -> [values]}
        - 'series' : dict like {column -> Series(values)}
        - 'split' : dict like
          {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
        - 'records' : list like
          [{column -> value}, ... , {column -> value}]
        - 'index' : dict like {index -> {column -> value}}
    
        Abbreviations are allowed. `s` indicates `series` and `sp`
        indicates `split`.
    
    into : class, default dict
        The collecti

# TODO

1. 给轨迹追踪增加阈值。记录达到多少次后才追踪