In [1]:
# Notebook-wide imports and display configuration.
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', lambda x: '%.6f' % x) # show floats as plain decimals instead of scientific notation, for readability
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # so that Chinese labels render correctly
plt.rcParams['axes.unicode_minus'] = False # so that the minus sign renders correctly
import numpy as np
np.set_printoptions(suppress = True)
import scipy
import os
import math
import time
import random
from joblib import Parallel, delayed
import warnings
from tqdm.notebook import tqdm

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

from sklearn.preprocessing import StandardScaler, OneHotEncoder
import gc

In [2]:
# Length of the input window and the matching column names
# 'input_0' ... 'input_<input_periods - 1>'.
# NOTE(review): 24 * 21 presumably means 21 days of hourly samples - confirm.
input_periods = 24 * 21
input_cols = list(map('input_{}'.format, range(input_periods)))

In [3]:
# Build folder_dict[base_type][kpi] -> list of staging folders found under
# data_folder. Folder names are expected to look like
# '<base_type>_<kpi>_<city>_<base_group>'.
data_folder = 'preprocessed_data/tmp/'
# os.listdir() returns entries in arbitrary order; sorting keeps the order in
# which folders are later concatenated reproducible across runs and machines.
folders = sorted(os.listdir(data_folder))
base_types = ['4g', '5g']
kpis = ['PDCCH', 'PDCPDL', 'PDCPUL', 'PDSCH', 'PUSCH', 'RRC']
folder_dict = {base_type: {kpi: [] for kpi in kpis} for base_type in base_types}
for folder in folders:
    name_parts = folder.split('_')
    # Stray entries (e.g. hidden files or checkpoint directories) would break
    # the 4-way unpacking below, so they are reported and skipped.
    if len(name_parts) != 4 or not os.path.isdir(os.path.join(data_folder, folder)):
        warnings.warn('Skipping unexpected entry in %s: %r' % (data_folder, folder))
        continue
    base_type, kpi, city, base_group = name_parts
    # An unknown base_type / kpi still raises KeyError on purpose, so that
    # unrecognised data is never silently left out.
    folder_dict[base_type][kpi].append(data_folder + folder + '/')
folder_dict

{'4g': {'PDCCH': ['preprocessed_data/tmp/4g_PDCCH_AFE97F546A10368F_0/',
   'preprocessed_data/tmp/4g_PDCCH_EA5EAA705108BDA0_0/',
   'preprocessed_data/tmp/4g_PDCCH_EA5EAA705108BDA0_1/',
   'preprocessed_data/tmp/4g_PDCCH_C48FDFBFC4072E0E_2/',
   'preprocessed_data/tmp/4g_PDCCH_F37F452354AC87C9_0/',
   'preprocessed_data/tmp/4g_PDCCH_AFE97F546A10368F_2/',
   'preprocessed_data/tmp/4g_PDCCH_AFE97F546A10368F_1/',
   'preprocessed_data/tmp/4g_PDCCH_C48FDFBFC4072E0E_1/',
   'preprocessed_data/tmp/4g_PDCCH_C48FDFBFC4072E0E_0/'],
  'PDCPDL': ['preprocessed_data/tmp/4g_PDCPDL_AFE97F546A10368F_2/',
   'preprocessed_data/tmp/4g_PDCPDL_F37F452354AC87C9_0/',
   'preprocessed_data/tmp/4g_PDCPDL_C48FDFBFC4072E0E_2/',
   'preprocessed_data/tmp/4g_PDCPDL_C48FDFBFC4072E0E_0/',
   'preprocessed_data/tmp/4g_PDCPDL_AFE97F546A10368F_1/',
   'preprocessed_data/tmp/4g_PDCPDL_AFE97F546A10368F_0/',
   'preprocessed_data/tmp/4g_PDCPDL_EA5EAA705108BDA0_0/',
   'preprocessed_data/tmp/4g_PDCPDL_EA5EAA705108BDA0_1/

In [4]:
# def preprocess(data, fit_preprocessors):
#     data.dropna(subset=input_cols, inplace=True)
#     if fit_preprocessors:
#         data[['经度', '纬度']] = scaler.fit_transform(data[['经度', '纬度']])
#     else:
#         data[['经度', '纬度']] = scaler.transform(data[['经度', '纬度']])
#     data[['经度', '纬度']] = data[['经度', '纬度']].fillna(0.)
#     data.rename({'经度': 'lon_feature', '纬度': 'lat_feature'}, axis=1, inplace=True)
#     if fit_preprocessors:
#         base_features = pd.DataFrame(encoder.fit_transform(data[['覆盖类型', '覆盖场景']].fillna('空')), index=data.index)
#     else:
#         base_features = pd.DataFrame(encoder.transform(data[['覆盖类型', '覆盖场景']].fillna('空')), index=data.index)
#     base_features.rename(lambda x: 'base_feature_%s' % x, axis=1, inplace=True)
#     data.drop(['覆盖类型', '覆盖场景'], axis=1, inplace=True)
#     return pd.concat([base_features, data], axis=1).astype(np.float32)

def preprocess(data):
    """Remove every row that has a missing value in any of ``input_cols``.

    The frame is modified in place; the very same object is returned so the
    call can be used in an assignment.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing all columns listed in the module-level ``input_cols``.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, after the incomplete rows have been dropped.
    """
    data.dropna(axis=0, how='any', subset=input_cols, inplace=True)
    return data

In [5]:
def concat_preprocess(base_type, kpi):
    """Merge the per-folder pickles of one (base_type, kpi) pair into final files.

    For each of ``regression_train_data.pkl`` and
    ``regression_predict_inputs.pkl`` the frames stored in every folder of
    ``folder_dict[base_type][kpi]`` are concatenated, passed through
    ``preprocess`` and pickled to
    ``preprocessed_data/final/<base_type>_<kpi>/<file_name>``. Once the merged
    file is in place, the source pickles are deleted.

    Parameters
    ----------
    base_type : str
        Key into ``folder_dict``.
    kpi : str
        Key into ``folder_dict[base_type]``.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``base_type`` / ``kpi`` is not present in ``folder_dict``.
    FileNotFoundError
        If only some of the source pickles for a file are missing.

    Notes
    -----
    * Re-running is safe: a file whose merged output already exists and whose
      sources have all been removed is treated as done and skipped.
    * The merged pickle is written to a temporary name and renamed afterwards,
      so an interrupted run never leaves a truncated final file behind.
    * The module-level ``scaler`` and ``encoder`` are re-created on every call;
      nothing in the currently active ``preprocess`` reads them.
    """
    folders = folder_dict[base_type][kpi]
    global scaler, encoder
    scaler = StandardScaler()
    try:
        # scikit-learn >= 1.2 spelling; the old ``sparse`` keyword is gone in 1.4.
        encoder = OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn releases only know ``sparse``.
        encoder = OneHotEncoder(sparse=False)

    if not folders:
        # pd.concat([]) would raise ValueError and abort the whole driver loop.
        warnings.warn('No folders found for %s_%s; nothing to merge' % (base_type, kpi))
        return

    out_dir = 'preprocessed_data/final/%s_%s/' % (base_type, kpi)

    for file_name in ['regression_train_data.pkl', 'regression_predict_inputs.pkl']:
        source_paths = [os.path.join(folder, file_name) for folder in folders]
        out_path = out_dir + file_name

        # Already merged by an earlier run (output present, every source removed).
        if os.path.exists(out_path) and not any(os.path.exists(p) for p in source_paths):
            continue

        merged = preprocess(pd.concat([pd.read_pickle(p) for p in source_paths]))

        os.makedirs(out_dir, exist_ok=True)
        # Write-then-rename keeps the final path either absent or complete.
        tmp_path = out_path + '.tmp'
        merged.to_pickle(tmp_path)
        os.replace(tmp_path, out_path)
        del merged; gc.collect()

        # Sources are removed only after the merged file is safely in place.
        for source_path in source_paths:
            os.remove(source_path)

In [6]:
# Merge and clean every (base_type, kpi) combination, base type by base type.
type_kpi_pairs = [(bt, k) for bt in base_types for k in kpis]
for base_type, kpi in type_kpi_pairs:
    concat_preprocess(base_type, kpi)