## 项目配置

In [1]:
import os
import re
from loguru import logger
from multiprocessing import Pool
import json
import contextlib
import subprocess
import wave
import sys

In [15]:
# Root directory of the project; every other path below is derived from it.
project_root = '/home/projectmanager2/chenranfang/KS'
# Directory that is walked for .wav files when the duration map is built.
project_wavs_path = os.path.join(project_root,'wav')
# Key-value text file, one "wav_name<TAB>text" entry per line.
project_txts_path = os.path.join(project_root,'config.ini')
# JSON file that stores the wav-name -> duration mapping.
time_map = os.path.join(project_root,'time_map.txt')
# Output file: named after the base name of project_root with a .txt suffix.
save_upl_path = os.path.join(project_root,os.path.basename(project_root)+'.txt')

# Outcome of the text validation; stays None until check_valid() has been run.
Txt_is_ok = None
# Register log.txt as an additional log sink.
logger.add('log.txt')

2

## 设置文本
> key-value 格式文本

In [17]:
def check_valid():
    """
    Validate that every line of the key-value text file is ``key<TAB>text``.

    The file at the module-level ``project_txts_path`` is read as UTF-8.  A
    line is valid when it starts with a key made of ASCII letters, digits or
    underscores, immediately followed by a tab.  The final line is accepted
    whether or not it ends with a newline.

    :return: True when every line is valid (also for an empty file),
             None as soon as an invalid line is found
    """
    # Anchoring only on "key + tab" is equivalent to the previous pattern for
    # newline-terminated lines, but no longer rejects a last line that lacks
    # the trailing newline.
    line_pattern = re.compile(r'[a-zA-Z0-9_]+\t')
    with open(project_txts_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line_pattern.match(line):
                return None
    return True

# Run the validation once and keep the outcome (True when valid, None otherwise).
Txt_is_ok = check_valid()

## 设置分包条数（每个包包含的音频条数；为 0 时按 SPK 分包）

In [9]:
# Number of records per package; a falsy value (0 / None) makes CreatPTFile
# group the records by SPK instead.
classify = 5

## 获取音频时长

In [5]:

def acquire_time(wav_path):
    """
    Return the duration of an audio file in seconds.

    The duration is first requested from the ``sox`` command line tool
    (``sox --i -D <path>``).  When sox cannot be started, writes anything to
    stderr, or prints something that is not a plain number, the file is
    opened with the standard ``wave`` module instead and the duration is
    computed as ``frames / frame rate``.

    :param wav_path: path of the audio file
    :return: duration in seconds as a float, or None when neither method works
    """
    # Arguments are passed as a list without a shell, so paths containing
    # spaces or shell metacharacters are handed to sox unchanged.
    cmd = ['sox', '--i', '-D', str(wav_path)]
    try:
        proc = subprocess.run(cmd,
                              stdin=subprocess.DEVNULL,
                              stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
        out = proc.stdout.decode(errors='replace').strip()
        err = proc.stderr.decode(errors='replace')
    except OSError as exc:  # e.g. sox is not installed
        out, err = '', str(exc)

    # sox counts as successful only with numeric stdout and empty stderr.
    if out and not err and re.fullmatch(r'[0-9.]+', out):
        try:
            wav_time = float(out)
        except ValueError:  # digits and dots that are still no number, e.g. "1.2.3"
            logger.debug('[err] %s' % out)
        else:
            logger.debug('[%s] %s' % (wav_path, out))
            return wav_time
    else:
        logger.debug('[err] %s' % err)

    logger.warning('[%s] 文件未能通过sox统计时长 ' % wav_path)
    try:
        with contextlib.closing(wave.open(wav_path, 'r')) as f:
            return f.getnframes() / float(f.getframerate())
    except Exception as exc:
        # Best effort: an unreadable file yields None, but the reason is
        # logged instead of being dropped silently.
        logger.warning('[%s] 未能获取音频时长，请检查音频格式: %s' % (wav_path, exc))
    return None


def creat_mapping(file, root):
    """
    Build a one-entry mapping from an audio file's base name to its duration.

    :param file: file name of the audio, e.g. ``'123.wav'``
    :param root: directory that contains ``file``
    :return: ``{name_without_wav_suffix: duration}`` where duration is
             whatever ``acquire_time`` returns for the joined path
    """
    file_path = os.path.join(root, file)
    duration = acquire_time(file_path)

    # str.rstrip('.wav') removes any trailing run of the characters '.', 'w',
    # 'a' and 'v' (e.g. 'nova.wav' -> 'no'); cut off the literal suffix instead.
    suffix = '.wav'
    wav_name = file[:-len(suffix)] if file.endswith(suffix) else file
    return {wav_name: duration}

# Compute the duration of every .wav file below project_wavs_path in a pool
# of 20 worker processes and store the resulting name -> duration map as JSON.
logger.info(f'构建音频中！')
pool = Pool(processes=20)
result = []

for root, dirs, files in os.walk(project_wavs_path):
    for file in files:
        # Only files whose extension is exactly '.wav' are submitted.
        if os.path.splitext(file)[1] == '.wav':
            result.append(pool.apply_async(creat_mapping, args=(file, root)))

# No more tasks will be submitted; wait until all workers are done.
pool.close()
pool.join()

# Merge the one-entry dicts returned by the workers.
dic_map = {}
for pending in result:
    dic_map.update(pending.get())

with open(time_map, 'w', encoding='utf-8') as f:
    json.dump(dic_map, f)

logger.info('保存timemap成功！ 共%s条'%len(dic_map))

2019-06-03 16:56:59.815 | INFO     | __main__:<module>:46 - 构建音频中！
2019-06-03 16:56:59.953 | DEBUG    | __main__:acquire_time:18 - [/home/projectmanager2/chenranfang/KS/wav/13657600/136576001365769.wav] 86.145625

2019-06-03 16:56:59.955 | DEBUG    | __main__:acquire_time:18 - [/home/projectmanager2/chenranfang/KS/wav/13657860/136578601365782.wav] 87.998625

2019-06-03 16:56:59.955 | DEBUG    | __main__:acquire_time:18 - [/home/projectmanager2/chenranfang/KS/wav/13657600/1365760013657610.wav] 89.396438

2019-06-03 16:56:59.961 | DEBUG    | __main__:acquire_time:18 - [/home/projectmanager2/chenranfang/KS/wav/13657860/136578601365789.wav] 82.198625

2019-06-03 16:56:59.958 | DEBUG    | __main__:acquire_time:18 - [/home/projectmanager2/chenranfang/KS/wav/13657600/136576001365766.wav] 80.343563

2019-06-03 16:56:59.962 | DEBUG    | __main__:acquire_time:18 - [/home/projectmanager2/chenranfang/KS/wav/13657860/136578601365786.wav] 4.958625

2019-06-03 16:56:59.963 | DEBUG    | __main__:acqui

## 程序运行

In [18]:
# Abort unless both prerequisites hold: the duration map file exists and the
# text file passed validation.  Failures are logged at error level with the
# concrete reason and the process exits with a non-zero status.
if not os.path.exists(time_map):
    logger.error('ERROR!!! 未找到音频时长映射文件 [%s]' % time_map)
    sys.exit(1)
if not Txt_is_ok:
    logger.error('ERROR!!! 文本格式校验未通过 [%s]' % project_txts_path)
    sys.exit(1)

with open(time_map, 'r', encoding='utf-8') as f:
    dic_map = json.load(f)

# Log the first six entries as a sample of the loaded map.
logger.info('构建音频map完成！ \n--> %s' % {key: dic_map[key] for key in list(dic_map)[:6]})

class CreatPTFile():
    """
    Turn a key-value text file plus a duration map into packages of records.

    Every text line ``wav_name<TAB>text`` becomes one record.  Records are
    grouped either into packages of a fixed size or, when no size is given,
    by the speaker id (SPK) derived from the wav name.
    """

    def __init__(self, txtfile, wavs_map, custom_classfy):
        """
        :param txtfile: path of the key-value text file
        :param wavs_map: mapping of wav name -> duration
        :param custom_classfy: number of records per package; a falsy value
                               selects grouping by SPK
        """
        self.wav_time_map = wavs_map
        self.custom_classfy = custom_classfy
        self.txtfile = txtfile

    def mklines(self):
        """
        Yield ``(SPK, wav_name, text)`` for every line of the text file.

        SPK is the part of the wav name before the first non-alphanumeric
        character when records are grouped by speaker, otherwise None.
        """
        with open(self.txtfile, 'r', encoding='utf-8') as f:
            for line in f:
                # Split on the first tab only, so a text that itself contains
                # tabs stays intact instead of breaking the unpacking.
                parts = line.rstrip('\n').split('\t', 1)
                if len(parts) != 2:
                    logger.warning('文本行格式错误，已跳过 [%s]' % line.rstrip('\n'))
                    continue
                wav_name, text = parts

                SPK = None
                # Use this instance's setting, not a module-level global.
                if not self.custom_classfy:
                    SPK, *_ = re.split('[^a-zA-Z0-9]+', wav_name)

                yield SPK, wav_name, text

    @staticmethod
    def mkdir(path):
        """Create directory ``path`` if it does not exist and return ``path``."""
        if not os.path.exists(path):
            os.mkdir(path)
        return path

    def run(self):
        """
        Build the package map from the text file and the duration map.

        Lines whose wav name is missing from the duration map, or whose
        duration is None, are skipped with a warning.

        :return: dict mapping a package key to a list of records.  The key is
                 a running counter as a string ('0', '1', ...) in fixed-size
                 mode, or the SPK in speaker mode.  Each record is a
                 one-element list holding the record dict.
        """
        logger.debug('遍历wav_name,text信息')
        wav_suf = 'wav'
        counter = 0
        wavs_info_map = {}
        # Parenthesised so that the message is formatted in both modes.
        logger.info('按照 【%s】 进行分包!' % (self.custom_classfy or 'SPK'))

        for SPK, wav_name, content in self.mklines():
            if wav_name not in self.wav_time_map:
                logger.warning('未获取到音频 [%s]' % wav_name)
                continue

            wav_time = self.wav_time_map[wav_name]
            if wav_time is None:
                logger.warning('未获取到音频时间 [%s]' % wav_name)
                continue

            wav_info = [  # record in the target platform's text format
                {
                    "Wav_name": wav_name,
                    "Length_time": wav_time,
                    "Data": [
                        {
                            "text": content,
                            "start_time": 0,
                            "end_time": wav_time
                        }
                    ],
                    "Wav_suf": wav_suf
                }
            ]

            if self.custom_classfy:  # fixed number of records per package
                package = wavs_info_map.setdefault(str(counter), [])
                package.append(wav_info)
                # Advance once the package is full; checking after the append
                # also works for a package size of 1.
                if len(package) >= self.custom_classfy:
                    counter += 1
            else:  # default mode: one package per SPK
                wavs_info_map.setdefault(SPK, []).append(wav_info)

        return wavs_info_map


def save_to_file(_save_path, wavs_info_map):
    """
    Write every package of ``wavs_info_map`` as one JSON line.

    :param _save_path: path of the output file (overwritten, UTF-8)
    :param wavs_info_map: mapping of package key -> package; only the values
                          are written, in the mapping's iteration order
    """
    json_lines = (
        json.dumps(package, ensure_ascii=False) + '\n'
        for package in wavs_info_map.values()
    )
    with open(_save_path, 'w', encoding='utf-8') as out:
        out.writelines(json_lines)
# Build the packages from the text file and the duration map, then write them
# to the output file, one JSON line per package.
result_map = CreatPTFile(project_txts_path, dic_map,classify).run()

save_to_file(save_upl_path, result_map)

logger.info('Success!!!')

2019-06-03 17:08:02.080 | INFO     | __main__:<module>:8 - 构建音频map完成！ 
--> {'136578601365786': 4.958625, '136578601365782': 87.998625, '136578601365788': 85.038625, '136578601365789': 82.198625, '136578601365787': 77.038625, '136578601365784': 75.678625}
2019-06-03 17:08:02.082 | DEBUG    | __main__:run:42 - 遍历wav_name,text信息
2019-06-03 17:08:02.084 | INFO     | __main__:run:46 - 按照 【5】 进行分包!
2019-06-03 17:08:02.137 | INFO     | __main__:<module>:105 - Success!!!


In [20]:
len([[{"Wav_name": "137296211372969", "Length_time": 73.351438, "Data": [{"text": "Your body contains more water than anything else about sixty percent of our total body weight. water helps regulate your body temperature, transports nutrients, and helps remove waste. the big question is how much water you need to drink every day. although that's a simple question, it doesn't have an easy answer . it depends on some environmental and physical factors that can change every day. also, it's not just the water you drink about twenty percent of your water intake comes from the foods you eat. the remaining eighty percent comes from beverages , including water , coffee , tea , milk , and anything liquid The health authorities commonly recommend eight glasses, which equals about 2 liters, or half a gallon.", "start_time": 0, "end_time": 73.351438}], "Wav_suf": "wav"}], [{"Wav_name": "137296211372968", "Length_time": 76.860563, "Data": [{"text": "In pre-industrial times, deaths among the young and middle-aged were more common than they are today. this is not due to genetics, but because of environmental factors such as disease, accidents, and malnutrition, especially since the former were not generally treatable with pre-20th century medicine. Deaths from childbirth were common in women . and many children did not live past infancy. in addition, most people who did reach old age were likely to die quickly from the above-mentioned untreatable health problems. despite this, we do find many examples of pre-20th century individuals attaining life-spans of 75 years or greater, including benjamin franklin, thomas Jefferson, and john adams. this was also true for poorer people like peasants or laborers.", "start_time": 0, "end_time": 76.860563}], "Wav_suf": "wav"}], [{"Wav_name": "1372962113729611", "Length_time": 78.651938, "Data": [{"text": "The focus on increasing participation in the fields of science, technology, engineering and maths, or stem, has attracted much criticism. 
in his article \"The myth of the science and engineering shortage\" in the atlantic, the author criticized the efforts of the u.s. government to increase the number of stem graduates. he said that among studies on the subject,\" no one has been able to find any evidence indicating current widespread labor market shortages Nor is there any evidence indicating hiring difficulties in science and engineering occupations. most studies report that real wages in many science and engineering occupations have been flat or slow-growing, and unemployment is as high as or higher than in many other occupations requiring comparable skills.\"", "start_time": 0, "end_time": 78.651938}], "Wav_suf": "wav"}], [{"Wav_name": "1372962113729610", "Length_time": 69.984562, "Data": [{"text": "as globalization is bringing the world ever closer together. the need for global citizens to be competent in other languages is becoming increasingly important. the united states is the only developed country that routinely graduates students from high school who lack knowledge of a foreign language. whereas 52.7 % of europeans are fluent in both their native tongue and at least one other language, only 9.3 % of americans are fluent in both their native tongue and another language. this statistic does not project a good future for america in a global society. the upward trend in language learning must accelerate if the u.s. is to continue to be a major participant on the international stage.", "start_time": 0, "end_time": 69.984562}], "Wav_suf": "wav"}], [{"Wav_name": "137296211372965", "Length_time": 72.004688, "Data": [{"text": "A personal budget is a finance plan that allocates future personal income towards expenses , savings and debt repayment . there are several methods and tools available for creating , using and adjusting a personal budget . for example , jobs are an income source , while bills and rent payments are expenses. a variety of tools are helpful for constructing a personal budget . 
A simple budget can be written on a piece of paper with a pencil and , optionally , a calculator . regardless of the tool used , a budget's usefulness relies on the accuracy and currency of the data. computer generated budgets have become commonly used as they replace the need to rewrite and recalculate the budget every time there is a change.", "start_time": 0, "end_time": 72.004688}], "Wav_suf": "wav"}]]

)

5