# 导入基本模块

In [1]:
import pandas as pd

import os
from os import path

In [2]:
import warnings

warnings.filterwarnings(action='ignore')

In [3]:
import pickle

# 定义函数

In [13]:
def get_frequency(data1, treshold = 2):
    """Count per-device sightings of each IMSI, bucketed by day, week and month.

    Rows whose ``imsi_`` occurs fewer than ``treshold`` times in ``data1`` are
    discarded first. The remaining rows are tagged with the day of month, the
    ISO week of year and the month taken from ``update_time_``, and the
    ``device_id_`` values are counted per ``(imsi_, period)`` pair.

    Note that ``day`` is the day *of the month* (1-31), not a calendar date, so
    the same day number in two different months lands in the same bucket.

    Args:
        data1: DataFrame with at least the columns ``imsi_``, ``update_time_``
            (anything ``pd.to_datetime`` can parse) and ``device_id_``. It is
            not modified.
        treshold: Minimum number of rows an ``imsi_`` must have to be kept.
            (The misspelling is kept so existing keyword callers keep working.)

    Returns:
        A ``(day, week, month)`` tuple of DataFrames. Each one is indexed by
        ``(imsi_, <period>)`` and has one column per ``device_id_`` holding the
        number of rows seen, with 0 where a device never saw that IMSI.
    """
    # How many rows each IMSI has overall; used only as the filter criterion.
    occurrences = data1.imsi_.map(data1.imsi_.value_counts())
    # Work on an explicit copy so the caller's frame is left untouched and the
    # column assignments below never write into a view of it.
    frequent = data1[occurrences >= treshold].copy()

    timestamps = pd.to_datetime(frequent.update_time_)
    frequent['day'] = timestamps.dt.day
    frequent['week'] = _week_of_year(timestamps)
    frequent['month'] = timestamps.dt.month

    day, week, month = (
        frequent.groupby(['imsi_', period])['device_id_']
        .value_counts()
        .unstack(fill_value=0)
        for period in ('day', 'week', 'month')
    )
    print('data processed successfully')
    return day, week, month


def _week_of_year(timestamps):
    """Return the ISO week number of every timestamp in a datetime Series."""
    try:
        weeks = timestamps.dt.isocalendar().week
    except AttributeError:
        # pandas < 1.1 has no isocalendar(); use the legacy accessor there.
        # (That accessor was removed in pandas 2.0, hence the preferred path.)
        return timestamps.dt.week
    # isocalendar() yields a nullable UInt32 column; convert to plain int64
    # when nothing is missing so the grouping keys match the legacy dtype.
    return weeks if weeks.isna().any() else weeks.astype('int64')

In [14]:
def get_data(file_name):
    """Load a capture file into a DataFrame and tidy it up.

    The reader is chosen from the file extension (case-insensitive): ``.csv``
    is read as GBK-encoded text, ``.xlsx``/``.xls`` through ``pd.read_excel``.
    The columns ``imei_``, ``area_`` and ``msisdn_`` are dropped, and tab
    characters are removed from every text column.

    Args:
        file_name: Path of the file to read.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: If the extension is not one of the supported types.
        KeyError: If any of the columns to drop is missing from the file.
    """
    file_extension = path.splitext(file_name)[1].lower()
    if file_extension == '.csv':
        data = pd.read_csv(file_name, encoding='gbk')
    elif file_extension in ('.xlsx', '.xls'):
        # Workbooks carry their own encoding; read_excel no longer accepts an
        # ``encoding`` argument on pandas >= 1.0.
        data = pd.read_excel(file_name)
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f'unsupported file type {file_extension!r}: {file_name}'
        )

    data = data.drop(columns=['imei_', 'area_', 'msisdn_'])

    for each in data.columns:
        column = data[each]
        # Cover both classic object columns and the dedicated string dtype.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # regex=False: strip the literal tab regardless of the pandas
            # version's default for ``regex``.
            data[each] = column.str.replace('\t', '', regex=False)
    print('data load successfully')
    return data

In [38]:
def transform(day, week, month):
    """Turn the three frequency tables into nested per-IMSI dictionaries.

    Each input is expected to be a DataFrame with a two-level index whose first
    level is the IMSI and whose second level is the period (day, week or
    month), as produced by ``get_frequency``.

    Args:
        day: Per-day count table.
        week: Per-week count table.
        month: Per-month count table.

    Returns:
        A ``(day_d, week_d, month_d)`` tuple. Each element maps an IMSI to
        ``{column: [values...]}``, where the columns are the period column
        followed by one column per device, and the lists hold one entry per
        row of that IMSI.
    """
    day_d, week_d, month_d = (
        _nest_by_first_level(table) for table in (day, week, month)
    )
    print('data transform successfully')
    return day_d, week_d, month_d


def _nest_by_first_level(table):
    """Map each first-level index value to its rows as a dict of column lists."""
    # Move the period level out of the MultiIndex so it becomes a column.
    flat = table.reset_index(level=1)
    # Grouping always hands back a DataFrame, even for an id with one row, so
    # no Series/TypeError fallback is needed, and the table is split in one
    # pass rather than one .loc lookup per id.
    return {
        key: rows.to_dict('list')
        for key, rows in flat.groupby(level=0, sort=False)
    }

In [24]:
def main():
    """Run the whole pipeline on the sample capture file and print the result.

    Switches the working directory to the data folder, loads the CSV, builds
    the day/week/month frequency tables (keeping every IMSI, ``treshold=1``),
    converts them to nested dictionaries, then prints how many IMSIs the
    per-day dictionary holds followed by the dictionary itself.
    """
    data_dir = '../Project Data/'
    os.listdir(data_dir)
    os.chdir(data_dir)  # later relative paths resolve inside the data folder

    records = get_data('20190524-001.csv')
    tables = get_frequency(records, treshold=1)
    by_day, by_week, by_month = transform(*tables)

    print(len(by_day))
    print(by_day)

# 运行代码

In [25]:
main()

data load successfully
data processed successfully


KeyboardInterrupt: 

# 代码调试

In [26]:
os.listdir('../Project Data/')

os.chdir('../Project Data/') # 切换到数据所在目录

In [27]:
data = get_data('20190524-001.csv')

data load successfully


In [28]:
data.head()

Unnamed: 0,imsi_,update_time_,lac_,cellid_,status_,ap_type_,device_id_,eventid_
0,460110413044304,2019-05-08 20:09:29,0,0,拒绝,FDD-LTE,DX-SZSC001,Normal Lau
1,460110413042577,2019-05-08 20:09:41,0,0,拒绝,FDD-LTE,DX-SZSC001,Normal Lau
2,460110425023300,2019-05-08 20:10:14,0,0,拒绝,FDD-LTE,DX-SZSC001,Normal Lau
3,460110714141865,2019-05-08 20:10:40,0,0,拒绝,FDD-LTE,DX-SZSC001,Normal Lau
4,460016692088409,2019-05-08 20:08:55,0,0,拒绝,FDD-LTE,LT-SZSC001,Normal Lau


In [29]:
day, week, month = get_frequency(data, treshold=2)

data processed successfully


In [30]:
day = day.reset_index(level=1)  # https://www.cnblogs.com/beyondChan/p/10926788.html

# 处理 MultiIndex（多级索引）的方法

In [31]:
day.head()

device_id_,day,DX-SZSC001,LT-SZSC001,YD-SZSC001
imsi_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
222992312514644,8,0,1,0
222992312514644,9,0,2,0
222992312514644,10,0,2,0
222992312514644,11,0,2,0
222992312514644,13,0,4,0


In [158]:
set(week.index) == set(month.index)

False

In [159]:
day.loc[460079768010752,:].to_dict('list')
# 直接这样会报错：该 imsi 只对应一行，.loc 返回的是 Series 而不是 DataFrame，
# Series.to_dict 不接受 'list' 这个参数；先用 to_frame().T 转成单行 DataFrame 即可

TypeError: unsupported type: <class 'str'>

In [165]:
day.loc[460079768010752,:].to_frame()

Unnamed: 0_level_0,460079768010752
device_id_,Unnamed: 1_level_1
day,23
DX-SZSC001,0
LT-SZSC001,0
YD-SZSC001,3


In [167]:
d = {}
d[460079768010752] = {}
d[460079768010752].update(day.loc[460079768010752,:].to_frame().T.to_dict('list'))
d

{460079768010752: {'day': [23],
  'DX-SZSC001': [0],
  'LT-SZSC001': [0],
  'YD-SZSC001': [3]}}

In [131]:
# Check that one handler can catch several exception types and still tell
# them apart afterwards: prints 1 for TypeError, 2 for anything else caught.
try:
    raise AttributeError
except (TypeError, AttributeError) as reason:
    print(1 if isinstance(reason, TypeError) else 2)

2


In [126]:
day.head()

device_id_,day,DX-SZSC001,LT-SZSC001,YD-SZSC001
imsi_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
222992312514644,8,0,1,0
222992312514644,9,0,2,0
222992312514644,10,0,2,0
222992312514644,11,0,2,0
222992312514644,13,0,4,0


In [127]:
day.columns

Index(['day', 'DX-SZSC001', 'LT-SZSC001', 'YD-SZSC001'], dtype='object', name='device_id_')

In [168]:
# Build {imsi: {column: [values...]}} from the flattened `day` frame.
userid = list(set(day.index))
d = {}
for each in userid:
    # A list indexer makes .loc return a DataFrame even when the id matches a
    # single row, so to_dict('list') works directly. This removes the need for
    # the TypeError fallback and the bare except that hid every other failure.
    d[each] = day.loc[[each], :].to_dict('list')

In [145]:
d[460001092568480].update(day.loc[460001092568480,:].to_dict('list'))

TypeError: unsupported type: <class 'str'>

In [147]:
d = {}
d[460001092568480] = {}
pd.DataFrame(day.loc[460001092568480,:]).to_dict('list')

{460001092568480: [11, 0, 0, 3]}

In [150]:
pd.DataFrame(day.loc[460001092568480,:]).to_dict('list')

{460001092568480: [11, 0, 0, 3]}

In [171]:
list(set(day.index)) == list(set(week.index))

False

In [32]:
os.listdir('../Project Data/')
os.chdir('../Project Data/') # 切换到数据所在目录
data = get_data('20190524-001.csv')

data load successfully


In [47]:
day, week, month = get_frequency(data, treshold=2)

data processed successfully


In [48]:
day, week, month = transform(day, week, month)

data transform successfully


In [49]:
day

{460079768010752: {'day': [23],
  'DX-SZSC001': [0],
  'LT-SZSC001': [0],
  'YD-SZSC001': [3]},
 460110771912705: {'day': [15, 16, 17, 18, 20, 21, 24],
  'DX-SZSC001': [5, 2, 9, 3, 5, 11, 2],
  'LT-SZSC001': [0, 0, 0, 0, 0, 0, 0],
  'YD-SZSC001': [0, 0, 0, 0, 0, 0, 0]},
 460017747623943: {'day': [8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24],
  'DX-SZSC001': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
  'LT-SZSC001': [1, 5, 4, 5, 6, 3, 6, 5, 3, 1, 4, 4, 3, 5, 2, 7, 5],
  'YD-SZSC001': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]},
 460111109423112: {'day': [18, 21],
  'DX-SZSC001': [2, 1],
  'LT-SZSC001': [0, 0],
  'YD-SZSC001': [0, 0]},
 460000280182793: {'day': [15],
  'DX-SZSC001': [0],
  'LT-SZSC001': [0],
  'YD-SZSC001': [2]},
 460091092107274: {'day': [13, 14, 16, 18],
  'DX-SZSC001': [0, 0, 0, 0],
  'LT-SZSC001': [2, 1, 2, 1],
  'YD-SZSC001': [0, 0, 0, 0]},
 460110411595787: {'day': [8],
  'DX-SZ