In [1]:
# Notebook-wide imports and display configuration.
import pandas as pd
pd.set_option('display.max_columns', 25)
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', lambda x: '%.6f' % x) # show floats as fixed 6-decimal numbers instead of scientific notation, for readability
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # font used so that Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False # so that the minus sign renders correctly
import numpy as np
np.set_printoptions(suppress = True) # print numpy floats without scientific notation
import scipy
import os
import math
import time
import random
from joblib import Parallel, delayed
import warnings
from tqdm.notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
# Directory containing the raw training-data exports.
folder = '../compdata/4G5G_Data/Train_Data/'
# Keep only the directory entries whose name ends with the three characters 'csv'.
files = [entry for entry in os.listdir(folder) if entry[-3:] == 'csv']
files

['4g_pm_20210604_20210609_AFE97F546A10368F.csv',
 '5g_pm_20210514_20210519_AFE97F546A10368F.csv',
 '5g_pm_20210611_20210616_C48FDFBFC4072E0E.csv',
 '5g_pm_20210611_20210616_EA5EAA705108BDA0.csv',
 '4g_pm_20210312_20210318_EA5EAA705108BDA0.csv',
 '4g_pm_20210305_20210311_AFE97F546A10368F.csv',
 '5g_pm_20210122_20210128_F37F452354AC87C9.csv',
 '5g_pm_20210205_20210211_AFE97F546A10368F.csv',
 '5g_pm_20210212_20210218_C48FDFBFC4072E0E.csv',
 '5g_pm_20210305_20210311_AFE97F546A10368F.csv',
 '5g_pm_20210305_20210311_C48FDFBFC4072E0E.csv',
 '5g_pm_20210409_20210415_C48FDFBFC4072E0E.csv',
 '5g_pm_20210409_20210415_F37F452354AC87C9.csv',
 '5g_pm_20210416_20210422_AFE97F546A10368F.csv',
 '4g_pm_20210617-20210617_EA5EAA705108BDA0.csv',
 '4g_pm_20210402_20210408_C48FDFBFC4072E0E.csv',
 '4g_pm_20210402_20210408_EA5EAA705108BDA0.csv',
 '4g_pm_20210416_20210422_EA5EAA705108BDA0.csv',
 '4g_pm_20210423_20210429_EA5EAA705108BDA0.csv',
 '4g_pm_20210423_20210429_F37F452354AC87C9.csv',
 '4g_pm_20210430_202

In [3]:
# Maps base-station type -> short KPI code -> column name in the raw CSV files.
# The 4g and 5g tables are identical except for the column used for 'RRC',
# so the table is generated from that single difference.
kpi_dict = {
    base_type: {
        'PUSCH': '上行利用率PUSCH',   # uplink utilisation (PUSCH)
        'PDSCH': '下行利用率PDSCH',   # downlink utilisation (PDSCH)
        'PDCCH': '下行利用率PDCCH',   # downlink utilisation (PDCCH)
        'RRC': rrc_column,
        'PDCPUL': '上行流量',         # uplink traffic
        'PDCPDL': '下行流量',         # downlink traffic
    }
    for base_type, rrc_column in (
        ('4g', '有效RRC连接平均数'),   # average number of effective RRC connections
        ('5g', '有数据传输的RRC数'),   # number of RRC connections with data transfer
    )
}

def retrieve_useful_parts(file, min_timestamp='2021-03-01'):
    """Extract per-UserLabel-list KPI slices from one raw CSV and pickle them.

    The CSV is read from the module-level ``folder`` with GBK encoding and rows
    older than ``min_timestamp`` are dropped. For every entry in
    ``preprocessed_data/UserLabel_lists/`` whose name matches this file's base
    type and city, the rows whose ``UserLabel`` is in that list are written to
    ``./preprocessed_data/tmp/<list name>/<period>.pkl``. Each output holds the
    columns ``UserLabel``, ``TimeStamp`` and the KPI column looked up in
    ``kpi_dict`` (cast to float32).

    Naming conventions relied on:
      * raw file:  ``<base_type>_pm_<period>_<city>.csv``
      * list file: ``<base_type>_<kpi>_<city>_<suffix>.<ext>``

    Parameters
    ----------
    file : str
        Name of a CSV file inside ``folder``.
    min_timestamp : str or pandas.Timestamp, default '2021-03-01'
        Rows with ``TimeStamp`` earlier than this are discarded.

    Returns
    -------
    None
        Results are written to disk; nothing is written when no row survives
        the timestamp filter.

    Raises
    ------
    KeyError
        If a matching list file names a base type / KPI missing from ``kpi_dict``.
    """
    filename_split = os.path.splitext(file)[0].split('_')
    base_type, city = filename_split[0], filename_split[-1]
    # Period tag: everything between the '<base_type>_pm_' prefix and the city,
    # taken from the split parts rather than from fixed character offsets.
    period = '_'.join(filename_split[2:-1])

    # os.path.join works whether or not `folder` ends with a path separator.
    data = pd.read_csv(os.path.join(folder, file), encoding='GBK', parse_dates=['TimeStamp'])
    data = data[data['TimeStamp'] >= pd.Timestamp(min_timestamp)]
    if data.empty:
        return

    ullist_folder = 'preprocessed_data/UserLabel_lists/'
    # Sorted so the processing (and log) order is reproducible across runs.
    for ullist_file in sorted(os.listdir(ullist_folder)):
        ullist_name = os.path.splitext(ullist_file)[0]
        ullist_split = ullist_name.split('_')
        # Skip stray entries (checkpoint folders, README, ...) that do not
        # follow the list-file naming convention instead of crashing on them.
        if len(ullist_split) < 4:
            continue
        ullist_base_type, kpi, ullist_city = ullist_split[0], ullist_split[1], ullist_split[-2]
        if ullist_base_type != base_type or ullist_city != city:
            continue

        kpi_column = kpi_dict[base_type][kpi]
        # presumably a DataFrame with a 'UserLabel' column -- verify against
        # whatever produced these pickles.
        userlabel_list = pd.read_pickle(os.path.join(ullist_folder, ullist_file))
        # Chained astype builds a new frame, so no column is assigned back
        # onto a slice of `data`.
        useful_part = data.loc[
            data['UserLabel'].isin(userlabel_list.UserLabel),
            ['UserLabel', 'TimeStamp', kpi_column],
        ].astype({kpi_column: np.float32})

        output_folder = './preprocessed_data/tmp/%s/' % ullist_name
        output_file = '%s.pkl' % period
        print('outputting %s file with dataframe shape %s' % (output_folder + output_file, useful_part.shape))
        os.makedirs(output_folder, exist_ok=True)
        useful_part.to_pickle(output_folder + output_file)

In [4]:
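# Optional smoke test (kept disabled): process a single file and inspect one of the resulting pickles.
# retrieve_useful_parts(files[0])

# sample_output = pd.read_pickle('preprocessed_data/tmp/4g_PDCPUL_AFE97F546A10368F_2/20210604_20210609.pkl')
# sample_output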

In [None]:
# Run the extraction for every raw CSV, showing a progress bar over the file list.
for file in tqdm(files, total=len(files)):
    retrieve_useful_parts(file)

HBox(children=(FloatProgress(value=0.0, max=248.0), HTML(value='')))

outputting ./preprocessed_data/tmp/4g_RRC_AFE97F546A10368F_0/20210604_20210609.pkl file with dataframe shape (1124083, 3)
outputting ./preprocessed_data/tmp/4g_PDCCH_AFE97F546A10368F_0/20210604_20210609.pkl file with dataframe shape (1111242, 3)
outputting ./preprocessed_data/tmp/4g_PDSCH_AFE97F546A10368F_0/20210604_20210609.pkl file with dataframe shape (1117743, 3)
outputting ./preprocessed_data/tmp/4g_PDCCH_AFE97F546A10368F_2/20210604_20210609.pkl file with dataframe shape (1104859, 3)
outputting ./preprocessed_data/tmp/4g_PUSCH_AFE97F546A10368F_1/20210604_20210609.pkl file with dataframe shape (1117245, 3)
outputting ./preprocessed_data/tmp/4g_PDCPUL_AFE97F546A10368F_1/20210604_20210609.pkl file with dataframe shape (1122964, 3)
outputting ./preprocessed_data/tmp/4g_PDCPDL_AFE97F546A10368F_0/20210604_20210609.pkl file with dataframe shape (1127154, 3)
outputting ./preprocessed_data/tmp/4g_PDCPDL_AFE97F546A10368F_1/20210604_20210609.pkl file with dataframe shape (1124161, 3)
outputt