Skip to content
Permalink
Browse files

Add files via upload

  • Loading branch information...
HowardNTUST committed Oct 3, 2018
1 parent 3e42066 commit 2a0743eb6a83d805af7d3dd9ab7f26ce4c88e0ae
Binary file not shown.
@@ -0,0 +1,3 @@
,文章段句,機器認字信心水準,start,end,改善順序
0,台灣第一個行銷資料科學知識部落格跟粉絲專頁在探討資料科學之基礎概念曲是新工作和數作讓粉絲們了解資料科學的行銷運用並開啟後質數據分析能力之契機,0.91,0:00:00,0:00:18,1
1,自然語言處理應用和弦演技術回顧半衰的今年機器學習的熱炒自然語言處理成為了目前是否可樂的研究方向同時也是Google Microsoft Facebook百度阿里巴巴等大公司投入具有資金和高端人力努力爭奪下的一個互聯網流量路口的如智能助手自然音鄉的今日我們邀請的恐小卷來介紹自然語言處理的一些基本知識行業發展的現狀基於深度學習的通用的ML理流程,0.92,0:00:20,0:00:57,2
@@ -0,0 +1,12 @@
{
"type": "service_account",
"project_id": "speech2text-206407",
"private_key_id": "3de4444fd46ad13c3dc79a330645ed8cb4fae277",
"private_key": "-----BEGIN PRIVATE KEY-----\nMIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCYLdHPAiCEbWxd\nG+vre5Il2jTyrEW/lip26MNvyXdPnDNu4bD2w91YoacsdkXuSTH15g+T6WRrKU9j\nogiMtWgx7CCMc9g5dyhCmMB9hdhCNQJmZlDF887mm6WyaROJlzHcBDdyJ6hxYLpB\nYInkESHnXZJCd4T993Nj/xdiOuXBiXfme2QlsLTY5OMBOS0dzwrU1GAfIZMRosAT\nm8m9XEZB744kkxC4hRJfmdVnbRJujGSoLq2CEBCYsFNzSduMWPL+4lSX5HDwLhk1\ng1WIf92wJdqo1wI9c/Jib3lYbVe9iThdSl7Ms8QmSVkwOGgN13JrcjB4t9ve0Vkm\nnhgm9hlZAgMBAAECggEAC28r69I018ZyBTf3HMk65Lm1z0MW9YuGsssP/jsEyjhK\nHAt8tLnoTIPO8BYN8JbS2G9aHcrEVxHyS2HuIwZxZwjfNDZzGXRE0+Yu1m+jXWwm\nu0N2T71fPzAUPejLxUOVRTsS4HKu4x/5trZWfgq0gbIj7ncVCNEaPDn7S4/x2f4I\nwB1VExC14ngPffRon6Hwm79Vfoqv+gSNXQGSPE+u5u/2TnmYQZIPcDsM+vuuLzGC\nunmiJ7/lt4lLiROb5Q4f3SJWmTZ5ZF4YpDkRhaL08Oy0Bu1ugXUeeEUcmYpSf4hW\nYXjWQprtuT11XpT7B25ekyG0c/QGW443tPI3xDoZYQKBgQDWHpT33BlRpa/26qQU\njpv/U9G1XQpOn0HrJdkale0LLF59yu0+ynRr4R2du7TcCiOrCxvlKfuD+R3+H8fB\nEWGTMDp+ZmiqEDgawT6Q8/to24E61ECVzRcI2uv7wAJOBeQcvj3PnUOOAIzMDCZr\njdl6uqEf2OZCzy6RqriqGQLocQKBgQC18b7tt0iNBxkcYkgSfNgvHmSWDC1B1jS5\n+Fs6seUMtHyiD83xRRa+1cyKVE6amtsgu1MIUiakioQZFEilR1rHXbpfBaQjyPGq\nX6UF8aQtgoIUwbsx/2Mq4jBUh79E1bcgSLmt0m+IFYhtlRK4rHkFnD2zn4wFtgSv\nK9bWGPpzaQKBgGOGJ3vl5CbdOqMsgUPE3aGaVjlUm7vbroocN4Nx+JJe9zTFI+0x\nsVeLdXahjvlQBTA1o6urDetT2asLHpARAiemHSovc6rIiyt0cx5xKYSdsr2jdrmq\ncHB/QXzszKVze4oYqyZbVNEmVEtrwlfvwQyYvtfMu4qGPoEat2bo5T8hAoGAAO7M\nW8ukw5RVvJ+EXxPkx9kQCvTlBoFio9FyJkrjMb17JhmOpL4F79SIx+zGumQw22oa\ncuB48j8Djl1cQfyrqnoAipwsy0IsewMihXVHsjQJFrInk4oLqBygws4vXNr7pV66\nFPoPOFNj76vj/K5lsHGsn08iGKVmQL+oylA2eIECgYEAspwkH3FX8X8XNG6aOVtm\nK5Kgrb4aOpm6xiWZaG6hJhjDh4kkUstoVnpMk4+NrP8+doij4m0Jocqu2b27Xmaw\nNS9fp8+YnhYM4MmJpX/L9m3Ly8dRDQdzDQuS2Cf3YSzKh+LyNNAX09HiVj1871eZ\nC1KxDl/ujJmw9EtdyFovkQU=\n-----END PRIVATE KEY-----\n",
"client_email": "speech2text@speech2text-206407.iam.gserviceaccount.com",
"client_id": "104434911873872023279",
"auth_uri": "https://accounts.google.com/o/oauth2/auth",
"token_uri": "https://accounts.google.com/o/oauth2/token",
"auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
"client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/speech2text%40speech2text-206407.iam.gserviceaccount.com"
}
@@ -0,0 +1,136 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 3 16:04:58 2018
@author: Howard Chung
"""


# Imports the Google Cloud client library
from google.cloud import speech
from google.cloud.speech import enums
from google.cloud.speech import types
from google.cloud import storage
import datetime
import io
import os
import time
import numpy as np
import pandas as pd

'''
# 語音基礎串接
一分鐘內語音辨識
'''

# Switch the process into the project directory before anything else runs.
# The relative paths used further down (the 're' audio folder and the
# credential JSON file name) are resolved against this directory.
# NOTE(review): 'your working directory' looks like a placeholder that has to
# be replaced with a real path before the script is run -- confirm.
os.chdir('your working directory')
# Example of a concrete path used during development:
#os.chdir('/home/slave1/git/speech2text_1min')

def _offset_to_clock(offset):
    """Format a word time offset as an 'H:MM:SS' string.

    ``offset`` must expose ``seconds`` and ``nanos`` attributes. The value is
    rounded to the nearest whole second before it is formatted.
    """
    whole_seconds = round(offset.seconds + offset.nanos * 1e-9)
    return str(datetime.timedelta(seconds=whole_seconds))


def _summarise_response(response):
    """Build the per-sentence confidence table for one recognition response."""
    rows = []
    # Each result covers a consecutive portion of the audio.
    for result in response.results:
        if not result.alternatives:
            # Nothing was recognised for this portion; there is no row to add.
            continue

        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print('Transcript: {}'.format(best.transcript))
        print('Confidence: {}'.format(best.confidence))

        # Sentence boundaries come from the first and last word offsets.
        # Leave them blank instead of raising IndexError when no word
        # offsets were returned for this alternative.
        if best.words:
            start = _offset_to_clock(best.words[0].start_time)
            end = _offset_to_clock(best.words[-1].end_time)
        else:
            start = end = None

        rows.append((best.transcript, best.confidence, start, end))

    summary = pd.DataFrame(
        rows, columns=['文章段句', '機器認字信心水準', 'start', 'end'])
    summary['機器認字信心水準'] = summary['機器認字信心水準'].round(2)

    # Lowest confidence first: those sentences should be reviewed first, and
    # the '改善順序' column numbers them in that order.
    summary = summary.sort_values(['機器認字信心水準'])
    summary['改善順序'] = range(1, len(summary) + 1)
    return summary


def speech_to_text_in_a_min(title_pattern='nlpno',
                            wd='re',
                            json_os='speech2text-3de4444fd46a.json',
                            sample_rate_hertz=48000):
    """Transcribe short recordings and write a confidence table to CSV.

    Every file in ``wd`` whose name contains ``title_pattern`` is sent to the
    synchronous Speech-to-Text ``recognize`` call as LINEAR16 audio with
    language code 'cmn-Hant-TW'. For each file the recognised sentences are
    ranked from lowest to highest confidence, and the tables of all files are
    written to '文章認字信心矩陣.csv' inside ``wd``.

    Parameters
    ----------
    title_pattern : str
        Substring that a file name must contain to be processed.
    wd : str
        Directory holding the recordings. The process changes into this
        directory and stays there after the call.
    json_os : str
        Path to the service-account credential file. It is resolved against
        the directory that is current when the function is called.
    sample_rate_hertz : int
        Sample rate of the recordings, passed on to the recognition config.

    Returns
    -------
    The result of ``DataFrame.to_csv`` for the output file. The table itself
    is not returned.

    Side effects
    ------------
    Sets the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable, changes
    the working directory to ``wd`` and writes the CSV file there.
    """
    # Authenticate the Python client against the cloud speech service. The
    # path is stored as an absolute path because the working directory
    # changes below; a relative value would no longer point at the file.
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.abspath(json_os)
    client = speech.SpeechClient()

    os.chdir(wd)

    # Pick the recordings that match the pattern. Sorting makes the row order
    # of the CSV reproducible (os.listdir order is arbitrary), and directories
    # are skipped because they cannot be opened as audio files.
    recordings = sorted(
        name for name in os.listdir()
        if title_pattern in name and os.path.isfile(name)
    )

    # The recognition settings are the same for every file, so build them once.
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        language_code='cmn-Hant-TW',
        enable_word_time_offsets=True)

    summaries = []
    for recording in recordings:
        # Read the raw audio bytes of the recording.
        with io.open(recording, 'rb') as audio_file:
            content = audio_file.read()

        # Wrap the bytes in the request type the API expects.
        audio = types.RecognitionAudio(content=content)

        # Speech-to-text recognition.
        print('')
        response = client.recognize(config, audio)

        summaries.append(_summarise_response(response))

    # Concatenate once at the end; with no matching files an empty table is
    # written, exactly as before.
    combined = pd.concat(summaries) if summaries else pd.DataFrame()
    return combined.to_csv('文章認字信心矩陣.csv')


# Script entry: transcribe every recording in the 're' folder whose file name
# contains 'nlpno', using the given credential file and a 48 kHz sample rate.
matr = speech_to_text_in_a_min(
    title_pattern='nlpno',
    wd='re',
    json_os='speech2text-3de4444fd46a.json',
    sample_rate_hertz=48000,
)

0 comments on commit 2a0743e

Please sign in to comment.
You can’t perform that action at this time.