In [1]:
# 引入相关包
import zipfile
import pandas as pd
import time

In [2]:
# Column groups of the per-file CSV tables (13 signal columns in total).
# Group 1: summarised per file with mean / min / max / std / median / range.
columnsNames_1 = [
    '发动机转速',
    '油泵转速',
    '泵送压力',
    '液压油温',
    '流量档位',
    '分配压力',
    '排量电流',
]
# Group 2: represented by the per-file maximum only.
columnsNames_2 = [
    '活塞工作时长',
    '低压开关',
    '高压开关',
    '正泵',
    '反泵',
]
# Group 3: the per-file maximum is turned into two indicator columns.
columnsNames_3 = ['搅拌超压信号']

# Input archives: data_train and data_test (train_label is not read in this cell).
# The handles are left open because the feature-extraction cells read from them.
trainZip = zipfile.ZipFile('data/data_train.zip')
testZip = zipfile.ZipFile('data/data_test.zip')

# Member paths stored inside each archive.
trainfileNameList, testfileNameList = trainZip.namelist(), testZip.namelist()

In [4]:
# Two independent accumulators that collect the extracted feature rows
# (one dict per CSV file) for the training and the test set.
trainDFList, testDFList = [], []

### 构造训练集特征

In [5]:
# Walk every member of the training archive, parse each CSV file and condense
# it into one dict of summary features (one row of the final feature table).
#
# The accumulator is rebuilt here so that re-running this cell does not append
# a second copy of every row.
trainDFList = []
start = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
for fname in trainfileNameList:
    # Only CSV members carry data; everything else in the archive is skipped.
    if not fname.endswith('.csv'):
        continue

    dict_f = {}
    # Keep only the base name of the member path as the row identifier.
    dict_f['file_name'] = fname.split('/')[-1]

    # The context manager closes the archive member as soon as it is parsed.
    with trainZip.open(fname) as fo:
        fDF = pd.read_csv(fo)
    # Drop the device-type column before the reductions; no feature uses it.
    del fDF['设备类型']

    # Column-wise reductions over the whole file, computed once and reused.
    dfmean = fDF.mean()
    dfmin = fDF.min()
    dfmax = fDF.max()
    dfstd = fDF.std()
    dfmedian = fDF.median()

    # Group 1: full set of summary statistics per column.
    for col in columnsNames_1:
        dict_f[col+'_mean'] = dfmean[col]
        dict_f[col+'_min'] = dfmin[col]
        dict_f[col+'_max'] = dfmax[col]
        dict_f[col+'_std'] = dfstd[col]
        dict_f[col+'_median'] = dfmedian[col]
        # NOTE: this key has no underscore, unlike its siblings; it is kept
        # as-is because it defines the column name in the saved feature file.
        dict_f[col+'range'] = dfmax[col]-dfmin[col]

    # Group 2: the per-file maximum is the feature.
    for col in columnsNames_2:
        dict_f[col] = dfmax[col]

    # Group 3: encode the per-file maximum as two mutually exclusive flags
    # ('diya' / 'gaoya' presumably mean low / high pressure -- TODO confirm).
    # Any maximum other than 0 or 1 leaves both keys unset for this row.
    for col in columnsNames_3:
        if dfmax[col] == 0:
            dict_f[col+'diya'] = 1
            dict_f[col+'gaoya'] = 0
        elif dfmax[col] == 1:
            dict_f[col+'diya'] = 0
            dict_f[col+'gaoya'] = 1

    # Number of rows in the file.
    dict_f['样本量'] = len(fDF)
    trainDFList.append(dict_f)
end = time.perf_counter()
print('total consuming for feature extraction:', end-start)

total consuming for feature extraction: 201.7180564403534


### 构建测试集特征

In [6]:
# Walk every member of the test archive, parse each CSV file and condense it
# into one dict of summary features (one row of the final feature table).
#
# The accumulator is rebuilt here so that re-running this cell does not append
# a second copy of every row.
testDFList = []
start = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
for fname in testfileNameList:
    # Only CSV members carry data; everything else in the archive is skipped.
    if not fname.endswith('.csv'):
        continue

    dict_f = {}
    # Keep only the base name of the member path as the row identifier.
    dict_f['file_name'] = fname.split('/')[-1]

    # The context manager closes the archive member as soon as it is parsed.
    with testZip.open(fname) as fo:
        fDF = pd.read_csv(fo)
    # Drop the device-type column before the reductions; no feature uses it.
    del fDF['设备类型']

    # Column-wise reductions over the whole file, computed once and reused.
    dfmean = fDF.mean()
    dfmin = fDF.min()
    dfmax = fDF.max()
    dfstd = fDF.std()
    dfmedian = fDF.median()

    # Group 1: full set of summary statistics per column.
    for col in columnsNames_1:
        dict_f[col+'_mean'] = dfmean[col]
        dict_f[col+'_min'] = dfmin[col]
        dict_f[col+'_max'] = dfmax[col]
        dict_f[col+'_std'] = dfstd[col]
        dict_f[col+'_median'] = dfmedian[col]
        # NOTE: this key has no underscore, unlike its siblings; it is kept
        # as-is because it defines the column name in the saved feature file.
        dict_f[col+'range'] = dfmax[col]-dfmin[col]

    # Group 2: the per-file maximum is the feature.
    for col in columnsNames_2:
        dict_f[col] = dfmax[col]

    # Group 3: encode the per-file maximum as two mutually exclusive flags
    # ('diya' / 'gaoya' presumably mean low / high pressure -- TODO confirm).
    # Any maximum other than 0 or 1 leaves both keys unset for this row.
    for col in columnsNames_3:
        if dfmax[col] == 0:
            dict_f[col+'diya'] = 1
            dict_f[col+'gaoya'] = 0
        elif dfmax[col] == 1:
            dict_f[col+'diya'] = 0
            dict_f[col+'gaoya'] = 1

    # Number of rows in the file.
    dict_f['样本量'] = len(fDF)
    testDFList.append(dict_f)
end = time.perf_counter()
print('total consuming for feature extraction:', end-start)

total consuming for feature extraction: 163.43720126152039


### 转化特征 list 为 DataFrame，然后保存为 CSV

In [7]:
# Turn each list of per-file feature dicts into a table (one row per CSV file).
trainDF, testDF = (
    pd.DataFrame(feature_rows)
    for feature_rows in (trainDFList, testDFList)
)

In [8]:
# Inspect the names of the feature columns produced for the training set.
trainDF.columns

Index(['file_name', '低压开关', '分配压力_max', '分配压力_mean', '分配压力_median', '分配压力_min',
       '分配压力_std', '分配压力range', '反泵', '发动机转速_max', '发动机转速_mean',
       '发动机转速_median', '发动机转速_min', '发动机转速_std', '发动机转速range', '排量电流_max',
       '排量电流_mean', '排量电流_median', '排量电流_min', '排量电流_std', '排量电流range',
       '搅拌超压信号diya', '搅拌超压信号gaoya', '样本量', '正泵', '油泵转速_max', '油泵转速_mean',
       '油泵转速_median', '油泵转速_min', '油泵转速_std', '油泵转速range', '泵送压力_max',
       '泵送压力_mean', '泵送压力_median', '泵送压力_min', '泵送压力_std', '泵送压力range',
       '活塞工作时长', '流量档位_max', '流量档位_mean', '流量档位_median', '流量档位_min',
       '流量档位_std', '流量档位range', '液压油温_max', '液压油温_mean', '液压油温_median',
       '液压油温_min', '液压油温_std', '液压油温range', '高压开关'],
      dtype='object')

In [11]:
# Spot-check the second and third rows (positions 1 and 2) of one feature column.
trainDF['低压开关'].iloc[1:3]

1    0.0
2    0.0
Name: 低压开关, dtype: float64

In [12]:
# Write both feature tables to CSV (header row, no index column) and report
# how long the export took.
start = time.time()
for frame, target in (
    (trainDF, 'data/train_features_3.csv'),
    (testDF, 'data/test_features_3.csv'),
):
    frame.to_csv(target, header=True, index=False)
end = time.time()
print('total consuming:', end-start)

total consuming: 6.2866411209106445
