# 将原始的txt数据处理，保存为csv格式，以4个变量值命名

In [128]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd
import glob
import tqdm
import os
import shutil
import csv

In [129]:
# Extract the measured diameters from a raw txt data file
def get_diameter(file_path):
    """Return the diameter column of a raw measurement file as strings.

    The second tab-separated field of the line at index 3 is read as the
    number of measurements.  That many lines, starting at line index 8, are
    then taken as data rows and the second tab-separated field of each one
    is collected.

    Args:
        file_path: path of the txt file to read.

    Returns:
        A list of str, one per data row.  Values are not converted to
        numbers and are kept exactly as they appear between the tabs.
    """
    with open(file_path, 'r') as handle:
        rows = handle.readlines()
    # Header line 3 carries the row count in its second column.
    count = int(rows[3].split('\t')[1])
    return [row.split('\t')[1] for row in rows[8: 8 + count]]

# Build the dataset rows from the 4 variable values and the measured diameters
def construct_dataset(wt, kv, mlh, cm, diameters):
    """Pair every measured diameter with the four variable values.

    Args:
        wt, kv, mlh, cm: the four variable values shared by all rows; they
            are copied into each row unchanged.
        diameters: iterable of measured diameter values.

    Returns:
        A list of ``(wt, kv, mlh, cm, diameter)`` tuples, one per diameter,
        in the order of *diameters*.
    """
    return [(wt, kv, mlh, cm, value) for value in diameters]

# Batch-process every txt file in the given list
def batch_precess(data_paths_list):
    """Convert each raw txt measurement file into a csv file under ./data.

    For every path the four variable values are sliced out of the file name,
    the measured diameters are read with ``get_diameter`` and the rows built
    by ``construct_dataset`` are written to ``./data/<wt>_<kv>_<mlh>_<cm>.csv``
    below a fixed header row.  An existing csv of the same name is
    overwritten.

    The file-name layout is selected from the directory of each individual
    path, so partial, re-ordered or mixed lists are handled and no directory
    has to be re-scanned while processing.

    Args:
        data_paths_list: iterable of txt file paths, each located directly
            inside one of the known data directories.

    Raises:
        ValueError: if a path lies in a directory with no known file-name
            layout.
    """
    # Character positions of (wt, kv, mlh, cm) inside the file name (without
    # extension), keyed by the directory the file comes from.
    wt_at = slice(None, 2)
    last_two = slice(-2, None)
    layouts = {
        '../电镜数据02_43+5+1/00_3': (wt_at, slice(3, 5), slice(6, 9), slice(-3, -1)),
        '../电镜数据02_43+5+1/01_16': (wt_at, slice(4, 6), slice(7, 10), last_two),
        # In this directory the injection rate precedes the voltage in the name.
        '../电镜数据02_43+5+1/02_9': (wt_at, slice(8, 10), slice(4, 7), last_two),
        '../电镜数据02_43+5+1/03_4': (wt_at, slice(3, 5), slice(6, 9), slice(-3, -1)),
        '../电镜数据02_43+5+1/04_11': (wt_at, slice(4, 6), slice(9, 12), last_two),
        '../电镜数据03_36+4': (wt_at, slice(4, 6), slice(7, 10), last_two),
        '../电镜数据04_8+1': (wt_at, slice(4, 6), slice(7, 10), last_two),
    }
    # Normalise the keys so that separator differences between platforms
    # (glob joins the file name with os.sep) do not prevent a match.
    layout_by_dir = {os.path.normpath(folder): fields for folder, fields in layouts.items()}

    output_dir = os.path.join('.', 'data')
    features_names = ['Concentration', 'Voltage', 'Injection Rate', 'Receiving Distance', 'Fiber Diameter']

    for data_path in tqdm.tqdm(data_paths_list):
        source_dir = os.path.normpath(os.path.dirname(data_path))
        if source_dir not in layout_by_dir:
            raise ValueError(
                'No file-name layout known for directory %r (file %r)' % (source_dir, data_path)
            )
        # Derive the 4 variable values from the file name.
        file_name = os.path.splitext(os.path.basename(data_path))[0]
        wt, kv, mlh, cm = (file_name[field] for field in layout_by_dir[source_dir])

        diameters = get_diameter(data_path)
        # Build the rows
        datas = construct_dataset(wt, kv, mlh, cm, diameters)

        # Write header and rows; mode 'w' creates the file or replaces an old one.
        new_name = '_'.join((wt, kv, mlh, cm))
        new_file_path = os.path.join(output_dir, new_name + '.csv')
        os.makedirs(output_dir, exist_ok=True)
        with open(new_file_path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(features_names)
            writer.writerows(datas)

In [130]:
# Locate the txt data files of every measurement directory
(
    data_paths_list0,    # 00_3
    data_paths_list1,    # 01_16
    data_paths_list2,    # 02_9
    data_paths_list3,    # 03_4
    data_paths_list4,    # 04_11
    data_paths_list5,    # 03_36+4
    data_paths_list6,    # 04_8+1
) = [
    glob.glob(pattern)
    for pattern in (
        '../电镜数据02_43+5+1/00_3/*.txt',
        '../电镜数据02_43+5+1/01_16/*.txt',
        '../电镜数据02_43+5+1/02_9/*.txt',
        '../电镜数据02_43+5+1/03_4/*.txt',
        '../电镜数据02_43+5+1/04_11/*.txt',
        '../电镜数据03_36+4/*.txt',
        '../电镜数据04_8+1/*.txt',
    )
]
# Batch processing; only the 04_8+1 list is selected in this run
for data_paths_list in (data_paths_list6,):
    batch_precess(data_paths_list)

100%|██████████| 8/8 [00:00<00:00, 475.65it/s]


In [131]:
# Read one of the generated csv files back in to verify the result
data = pd.read_csv('./data/24_20_0.6_19.csv')


In [132]:
data

Unnamed: 0,Concentration,Voltage,Injection Rate,Receiving Distance,Fiber Diameter
0,24,20,0.6,19,308.00
1,24,20,0.6,19,72.09
2,24,20,0.6,19,198.26
3,24,20,0.6,19,235.00
4,24,20,0.6,19,205.51
...,...,...,...,...,...
95,24,20,0.6,19,205.50
96,24,20,0.6,19,378.52
97,24,20,0.6,19,217.03
98,24,20,0.6,19,210.19
