# Basic

In [None]:
# basic library
import os
import pandas as pd
import numpy as np
import re
import wave
import librosa
import noisereduce as nr
import pysrt
import pandas as pd
import os
import re
import json
import pandas as pd
from kiwipiepy import Kiwi

In [None]:
# Read the project settings (the rest of the notebook uses envs['DATA_PATH']).
with open("env.json") as env_file:  # point this at your own env file
    envs = json.load(env_file)

# Audio

In [None]:
speaker = 'counselor'  # or customer

# Naming convention the data files are expected to follow:
# <YYYY-MM-DD>_<HHMMSS>__P<1-2 digit number>.txt (upper- or lower-case P).
pattern = re.compile(r"\d{4}-\d{2}-\d{2}_\d{6}__[Pp]\d{1,2}\.txt")
origin_path = os.path.join(envs['DATA_PATH'], '1_raw', 'CAL_AUDIO', str(speaker))

# Walk the raw-data tree and remember every .txt file whose name does not
# start with the expected pattern.
non_matching_files = []
for root, _, files in os.walk(origin_path):
    offenders = [
        os.path.join(root, file)
        for file in files
        if file.endswith('.txt') and not pattern.match(file)
    ]
    non_matching_files.extend(offenders)

# Print the offending paths so that they can be renamed by hand.
for offending_path in non_matching_files:
    print(offending_path)

### Feature Extraction

#### LLDs

In [16]:
def extract_features(file_path,file_name,audio_path):
    """Extract frame-level low-level descriptors from one audio file and save them as CSV.

    The audio is loaded at 8 kHz and passed through ``noisereduce`` before
    the descriptors are computed on 25 ms frames with a 10 ms hop.

    Args:
        file_path: Path of the audio file to read.
        file_name: Bare file name; its extension is swapped for ``.csv`` to
            name the output file.
        audio_path: Directory the CSV is written to (created if missing).

    Returns:
        dict mapping each output column name to its per-frame values, all
        truncated to the length of the shortest feature.
    """
    y, sr = librosa.load(file_path, sr=8000)  # load the audio, resampled to 8 kHz
    y = nr.reduce_noise(y=y, sr=sr)  # noise reduction

    # Framing parameters
    frame = 0.025  # 25 ms
    sliding = 0.01  # 10 ms
    n_fft = int(round(frame * sr))
    hop_length = int(round(sliding * sr))

    # pYIN yields both the f0 track and the voiced flag; run it once and
    # reuse both outputs instead of evaluating it twice with the same arguments.
    # fmin/fmax are chosen for speech.
    pyin_result = librosa.pyin(y, fmin=85, fmax=400, frame_length=n_fft, hop_length=hop_length)

    features = {
        'f0': pyin_result[0],
        'energy': librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop_length).flatten(),
        'mfccs': librosa.feature.mfcc(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mfcc=13, window='hamming'),
        'spec_bw': librosa.feature.spectral_bandwidth(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length).flatten(),
        'zcr': librosa.feature.zero_crossing_rate(y=y, frame_length=n_fft, hop_length=hop_length).flatten(),
        'cent': librosa.feature.spectral_centroid(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, window='hamming').flatten(),
        'voiced_flag': pyin_result[1],
        'rolloff': librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, roll_percent=0.99).flatten(),
        'rolloff_min': librosa.feature.spectral_rolloff(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, roll_percent=0.01).flatten(),
        'contrast': librosa.feature.spectral_contrast(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, fmin=85),
    }

    # Put frames on the first axis for the two multi-coefficient features.
    features['mfccs'] = features['mfccs'].T
    features['contrast'] = features['contrast'].T

    # The extractors may not all return the same number of frames; truncate
    # everything to the shortest one so the columns line up.
    min_length = min(len(values) for values in features.values())
    for key in features:
        features[key] = features[key][:min_length]

    # Replace the two multi-coefficient features with one column per coefficient
    # (mfcc_1.., contrast_1..), appended after the scalar features.
    mfccs = features.pop('mfccs')
    contrast = features.pop('contrast')
    for j in range(mfccs.shape[1]):
        features[f'mfcc_{j + 1}'] = mfccs[:, j]
    for j in range(contrast.shape[1]):
        features[f'contrast_{j + 1}'] = contrast[:, j]

    df = pd.DataFrame(features)

    # Swap only the file extension (a plain str.replace on the joined path
    # would also rewrite ".wav" occurring elsewhere in the path).
    os.makedirs(audio_path, exist_ok=True)
    output_file = os.path.join(audio_path, os.path.splitext(file_name)[0] + ".csv")
    df.to_csv(output_file,index=False)

    return features

In [None]:
# Root directory of the raw audio files
speaker = 'counselor' # or customer
audio_root = os.path.join(envs['DATA_PATH'], '1_raw', 'CAL_AUDIO', str(speaker))

# Every .wav file below the root, stored as [full path, bare file name].
file_list = [
    [os.path.join(root, file), file]
    for root, dirs, files in os.walk(audio_root)
    for file in files
    if file.endswith('.wav')
]

# The extracted features are written under the 'worker' folder.
speaker = 'worker'
audio_dest_root = os.path.join(envs['DATA_PATH'], '3_feature_extraction', 'CAL_AUDIO', str(speaker))
for wav_path, wav_name in file_list:
    extract_features(wav_path, wav_name, audio_dest_root)

#### LLD Statistics

In [None]:
speaker = 'worker'
folder_path = os.path.join(envs['DATA_PATH'], '3_feature_extraction', 'CAL_AUDIO', str(speaker))

# Build one (feature x statistic) summary table per LLD CSV in the folder.
summary_data = []
for filename in os.listdir(folder_path):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(folder_path, filename)
    df = pd.read_csv(file_path)

    # Rows = feature columns, columns = mean / std / min / max.
    summary = df.describe().loc[['mean', 'std', 'min', 'max']].T
    summary['file'] = filename
    summary_data.append(summary)

# Stack the per-file summaries; the feature name ends up in the 'index' column.
new_df = pd.concat(summary_data).reset_index()

# Aggregate per (file, feature) and spread the result into one wide row per
# file, with columns named "<statistic>_<feature>".
statistic_names = ['mean', 'std', 'min', 'max']
grouped_new_df = (
    new_df.groupby(['file', 'index'])
    .agg(
        mean=('mean', 'mean'),
        std=('std', 'mean'),
        min=('min', 'mean'),
        max=('max', 'mean'),
    )
    .reset_index()
)
new_df_pivoted = grouped_new_df.pivot_table(index='file', columns='index', values=statistic_names)
new_df_pivoted.columns = ['_'.join(col).strip() for col in new_df_pivoted.columns.values]

# Bring 'file' back as a regular column.
new_df_pivoted = new_df_pivoted.reset_index()


# Prepare for saving: strip the '.csv' extension from the file names.
import re
new_df_pivoted['file'] = new_df_pivoted['file'].apply(lambda name: str(name).replace('.csv', ''))
filename_pattern = re.compile(r'^\d{4}-\d{2}-\d{2}_\d{6}__P\d{2}$')
def filename_matching(file_name):
    """Normalize a recording name towards ``YYYY-MM-DD_HHMMSS__PNN``.

    Names that already match ``filename_pattern`` are returned unchanged.
    Otherwise two repairs are applied:

    * a single underscore in front of the participant tag (``_P01`` / ``_p01``)
      becomes a double underscore;
    * every lower-case ``p`` is upper-cased.

    Args:
        file_name: File name without extension.

    Returns:
        The repaired name. It is not guaranteed to match ``filename_pattern``
        if the name deviates in some other way.
    """
    if filename_pattern.match(file_name):
        return file_name

    # Use a real regex substitution here: str.replace() treats its argument as
    # a literal string, so a regex passed to it never matches. The lookbehind
    # leaves an already-doubled underscore alone.
    new_filename = re.sub(r'(?<!_)_(?=[Pp]\d)', '__', file_name)

    # Upper-case any lower-case 'p'.
    new_filename = new_filename.replace('p', 'P')

    return new_filename
# Repair the file names, then derive the participant number (last two
# characters) and the call start timestamp (first 17 characters,
# i.e. 'YYYY-MM-DD_HHMMSS') from them.
repaired_names = new_df_pivoted['file'].apply(filename_matching)
new_df_pivoted['file'] = repaired_names
new_df_pivoted['pnum'] = repaired_names.apply(lambda name: str(name)[-2:])
new_df_pivoted['start_second'] = repaired_names.apply(lambda name: str(name)[:17])

# Files whose name ends in "남희" instead of a number are assigned pnum 15.
ends_with_name = new_df_pivoted['pnum'] == "남희"
new_df_pivoted.loc[ends_with_name, 'pnum'] = 15

# Convert both derived columns to their final dtypes.
new_df_pivoted['pnum'] = new_df_pivoted['pnum'].astype('int')
new_df_pivoted['start_second'] = pd.to_datetime(
    new_df_pivoted['start_second'],
    format='%Y-%m-%d_%H%M%S',
)

# The raw file name is no longer needed.
new_df_pivoted.drop(columns=['file'], inplace=True)
# new_df_pivoted.to_csv(f'/home/iclab/23EmoWorkerField/3_feature_extraction/CALL_AUDO_statistics/{speaker}_audio.csv',index=False)

#### Utterance feature

In [None]:
# Audio duration per call for the customer recordings
file_list = []
speaker = "customer"

# Collect [full path, bare file name] for every .wav file in the main and
# the additional audio folders (main folder first).
audio_root = os.path.join(envs['DATA_PATH'],'1_raw','CAL_AUDIO',str(speaker))
audio_additional_root = os.path.join(envs['DATA_PATH'],'1_raw','CAL_AUDIO','audio_additional',str(speaker))
for search_root in (audio_root, audio_additional_root):
    for root, dirs, files in os.walk(search_root):
        for file in files:
            if file.endswith('.wav'):
                file_list.append([os.path.join(root, file), file])

# Patterns used to pull the call timestamp and participant number out of a file name
dt_pattern = r'\d{4}-\d{2}-\d{2}_\d{6}'
pnum_pattern = r'[Pp](\d{1,2})'

# One [pnum, start_second, duration] entry per file
data = []

for file_path, file_name in file_list:
    pnum_match = re.search(pnum_pattern, file_name)
    start_match = re.search(dt_pattern, file_name)
    if pnum_match is None or start_match is None:
        # The name carries no participant tag or timestamp; report it
        # instead of failing on the missing match object.
        print(f"Skipping file with unexpected name: {file_path}")
        continue

    # Take the captured digits only. Slicing the last two characters of the
    # whole match breaks on single-digit ids ('P1' is not an integer).
    pnum = int(pnum_match.group(1))
    start_second = start_match.group()

    # Duration in seconds = number of frames / sampling rate
    with wave.open(file_path, 'rb') as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
        duration = frames / float(rate)

    data.append([pnum, start_second, duration])


customer_df = pd.DataFrame(data, columns=['pnum', 'start_second', 'customer_audio_duration'])

In [None]:
# Audio duration per call for the worker recordings
file_list = []
speaker = "worker"

# Collect [full path, bare file name] for every .wav file in the main and
# the additional audio folders (main folder first).
audio_root = os.path.join(envs['DATA_PATH'],'1_raw','CAL_AUDIO',str(speaker))
audio_additional_root = os.path.join(envs['DATA_PATH'],'1_raw','CAL_AUDIO','audio_additional',str(speaker))
for search_root in (audio_root, audio_additional_root):
    for root, dirs, files in os.walk(search_root):
        for file in files:
            if file.endswith('.wav'):
                file_list.append([os.path.join(root, file), file])

# Patterns used to pull the call timestamp and participant number out of a file name
dt_pattern = r'\d{4}-\d{2}-\d{2}_\d{6}'
pnum_pattern = r'[Pp](\d{1,2})'

# One [pnum, start_second, duration] entry per file
data = []

for file_path, file_name in file_list:
    pnum_match = re.search(pnum_pattern, file_name)
    start_match = re.search(dt_pattern, file_name)
    if pnum_match is None or start_match is None:
        # The name carries no participant tag or timestamp; report it
        # instead of failing on the missing match object.
        print(f"Skipping file with unexpected name: {file_path}")
        continue

    # Take the captured digits only. Slicing the last two characters of the
    # whole match breaks on single-digit ids ('P1' is not an integer).
    pnum = int(pnum_match.group(1))
    start_second = start_match.group()

    # Duration in seconds = number of frames / sampling rate
    with wave.open(file_path, 'rb') as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
        duration = frames / float(rate)

    data.append([pnum, start_second, duration])


worker_df = pd.DataFrame(data, columns=['pnum', 'start_second', 'worker_audio_duration'])

In [None]:
# Align customer and worker durations on (pnum, start_second); the inner join
# keeps only the calls that are present in both tables.
duration_keys = ['pnum', 'start_second']
customer_by_call = customer_df.set_index(duration_keys)
worker_by_call = worker_df.set_index(duration_keys)
final_df = pd.concat([customer_by_call, worker_by_call], axis=1, join='inner').reset_index()

# Parse the timestamp string taken from the file name.
final_df['start_second'] = pd.to_datetime(final_df['start_second'], format='%Y-%m-%d_%H%M%S')

dest_path = os.path.join(
    envs['DATA_PATH'],
    '3_feature_extraction',
    'CAL_AUDIO_statistics',
    'audio_all_duration.csv',
)
final_df.to_csv(dest_path, index=False)


In [None]:
# Function 
def attatch_prefix_condition(df, prefix, exclude_col=('pnum', 'start_second', 'end', 'date', 'matching')):
    """Prefix every column name of ``df`` with ``"<prefix>_"``, except the excluded ones.

    The rename is done in place on ``df``; the same object is returned for
    convenience.

    Args:
        df: DataFrame whose columns are renamed.
        prefix: Text put in front of each renamed column, followed by ``_``.
        exclude_col: Column names that must keep their original name. The
            default is an immutable tuple so it cannot be altered between
            calls; any container supporting ``in`` may be passed.

    Returns:
        The same DataFrame, with its columns renamed.
    """
    df.columns = [col if col in exclude_col else f"{prefix}_{col}" for col in df.columns]
    return df

#### Rename

In [None]:
# Audio
def load_audio(path):
    """Load the customer/worker audio statistics and durations into one table.

    Reads 'customer_audio.csv', 'worker_audio.csv' and
    'audio_all_duration.csv' from ``path``, prefixes the feature columns with
    their source, and outer-joins the three tables on (pnum, start_second).

    Args:
        path: Directory containing the three CSV files.

    Returns:
        DataFrame with one row per (pnum, start_second) found in any of the tables.
    """
    join_keys = ['pnum', 'start_second']

    def read_table(csv_name):
        # All three files carry a 'start_second' timestamp column.
        return pd.read_csv(os.path.join(path, csv_name), parse_dates=['start_second'])

    customer_audio = read_table('customer_audio.csv').drop(columns=['min_rolloff_min'])
    customer_audio = attatch_prefix_condition(customer_audio, '(S)_customer_audio')

    worker_audio = read_table('worker_audio.csv').drop(columns=['min_energy', 'min_rolloff_min'])
    worker_audio = attatch_prefix_condition(worker_audio, '(R)_worker_audio')

    # The duration file is renamed positionally: keys first, then the two durations.
    audio_duration = read_table("audio_all_duration.csv")
    audio_duration.columns = ['pnum', 'start_second', '(S)_customer_audio_duration', '(R)_worker_audio_duration']

    indexed_tables = [
        table.set_index(join_keys)
        for table in (customer_audio, worker_audio, audio_duration)
    ]
    final_df = pd.concat(indexed_tables, axis=1, join='outer').reset_index()

    return final_df

# Combine the audio statistics and write them next to their inputs.
path = os.path.join(
    envs['DATA_PATH'],
    '3_feature_extraction',
    'CAL_AUDIO_statistics',
)
audio = load_audio(path)
audio.to_csv(os.path.join(path, 'audio.csv'), index=False)



# Transcript

### Data Cleaning

In [None]:
audio_root = os.path.join(envs['DATA_PATH'], '1_raw', 'CAL_AUDIO', 'customer')
audio_additional_root = os.path.join(envs['DATA_PATH'], '1_raw', 'CAL_AUDIO', 'audio_additional', 'customer')

# List every .srt file in the main folder, then in the additional folder,
# as [containing directory, full path, bare file name].
file_list = []
for search_root in (audio_root, audio_additional_root):
    for root, dirs, files in os.walk(search_root):
        srt_entries = [
            [root, os.path.join(root, file), file]
            for file in files
            if file.endswith('.srt')
        ]
        file_list.extend(srt_entries)

# Start from an empty DataFrame
features_df = pd.DataFrame()

# 정규 표현식 패턴으로 추출하기

def open_srt_with_encoding(file_path):
    """Open an SRT file, trying several text encodings in turn.

    Args:
        file_path: Path of the .srt file.

    Returns:
        The object returned by ``pysrt.open`` for the first encoding that works.

    Raises:
        ValueError: If the file could not be opened with any of the encodings.
    """
    # Order matters: latin1 can decode any byte sequence, so it must be tried
    # last. Placed earlier it would always "succeed" and the Korean encodings
    # after it would never be reached.
    encodings = ['utf-8', 'cp949', 'euc-kr', 'latin1']
    for encoding in encodings:
        try:
            print(f"Trying to open the file with encoding: {encoding}")
            subtitles = pysrt.open(file_path, encoding=encoding)
        except UnicodeDecodeError:
            print(f"Failed to open the file with encoding: {encoding}")
        except Exception as e:
            # Best effort: report the problem and move on to the next encoding.
            print(f"An unexpected error occurred: {e}")
        else:
            print("File opened successfully!")
            return subtitles
    raise ValueError("Unable to open the file with the provided encodings.")


text_df = pd.DataFrame(columns=['start_second', 'pnum','transcript_worker','transcript_customer','worker_total_turn','customer_total_turn'])
check_file_path = []  # [file path, number of distinct speakers] for files that were not processed
dt_pattern = r'\d{4}-\d{2}-\d{2}_\d{6}'
pnum_pattern = r'[Pp](\d{1,2})'

# Parse every transcript file and append one summary row per file to text_df.
for root, file_path, file_name in file_list:
    # NOTE(review): pysrt.open is called without an explicit encoding here;
    # open_srt_with_encoding() is defined in this file but not used - confirm
    # whether it was meant to be used for non-UTF-8 files.
    subtitles = pysrt.open(file_path)
    dt_name = re.search(dt_pattern, file_name).group()
    dt_target = pd.to_datetime(dt_name, format='%Y-%m-%d_%H%M%S')
    pnum_name = re.search(pnum_pattern, file_name).group()
    pnum_int = int(pnum_name[1:])  # drop the leading 'P'/'p'

    rows = []
    for subtitle in subtitles:
        # Subtitle text is expected to look like "Speaker <n>: <utterance>".
        match = re.match(r'(Speaker\s\d+):\s*(.*)', subtitle.text)
        if match:
            speaker_label = match.group(1)
            text = match.group(2)
        else:
            # No speaker tag: keep the whole text as the utterance.
            speaker_label = None
            text = subtitle.text

        rows.append({
            'Index': subtitle.index,
            'Start Time': subtitle.start.to_time(),
            'End Time': subtitle.end.to_time(),
            'Speaker': speaker_label,
            'Text': text,
        })

    # Naming the columns keeps the frame usable when a file has no subtitles.
    df = pd.DataFrame(rows, columns=['Index', 'Start Time', 'End Time', 'Speaker', 'Text'])
    speaker_count = len(df['Speaker'].unique())

    if speaker_count not in (2, 3):
        # Only two- and three-speaker calls are handled; record the rest for a manual check.
        check_file_path.append([file_path, speaker_count])
        continue

    # The speaker of the subtitle with Index 1 is taken to be the worker.
    worker_speaker = df.loc[df['Index'] == 1, 'Speaker'].values[0]

    if speaker_count == 2:
        # Everyone who is not the worker is the customer.
        df['Speaker'] = df['Speaker'].apply(lambda x: 'worker' if x == worker_speaker else 'customer')
        roles = ['worker', 'customer']
    else:
        # The first non-worker speaker to appear is the customer; the
        # remaining speaker is labelled co-worker.
        other_speakers = df[df['Speaker'] != worker_speaker]['Speaker'].unique()
        customer_speaker = other_speakers[0]
        df['Speaker'] = df['Speaker'].apply(
            lambda x: 'worker' if x == worker_speaker else
                    'customer' if x == customer_speaker else
                    'co-worker'
        )
        roles = ['worker', 'customer', 'co-worker']

    # Save the relabelled per-utterance table next to the source file.
    df.to_csv(os.path.join(root, file_name.split('.')[0] + ".csv"))

    # Utterances per role: joined they form the transcript, counted they give the turns.
    utterances = {role: df.loc[df['Speaker'] == role, 'Text'].tolist() for role in roles}

    text_row = {
        'start_second': [dt_target],
        'pnum': [pnum_int],
        'transcript_worker': [' '.join(utterances['worker'])],
        'transcript_customer': [' '.join(utterances['customer'])],
        'worker_total_turn': [len(utterances['worker'])],
        'customer_total_turn': [len(utterances['customer'])],
    }
    if speaker_count == 3:
        text_row['transcript_coworker'] = [' '.join(utterances['co-worker'])]
        text_row['coworker_total_turn'] = [len(utterances['co-worker'])]

    # Each file is appended exactly once, here; nothing is appended after the loop.
    text_df = pd.concat([text_df, pd.DataFrame(text_row)], ignore_index=True)


In [None]:
kiwi = Kiwi()

# Fill the missing co-worker values: a blank transcript and zero turns.
coworker_defaults = (
    ('transcript_coworker', " "),
    ('coworker_total_turn', 0),
)
for column_name, filler in coworker_defaults:
    text_df[column_name] = text_df[column_name].fillna(filler)

# Normalization
def normalize_to_base_form(text):
    """Rebuild ``text`` from the base forms of its Kiwi tokens.

    Each token contributes its ``base_form`` (or ``form`` when the base form
    is empty). A single space is emitted wherever a token starts after the
    end of the previous one.

    Args:
        text: Text to normalize.

    Returns:
        The normalized string.
    """
    pieces = []
    cursor = 0  # end offset of the previously emitted token
    for morpheme in kiwi.tokenize(text):
        # A gap between two tokens means there was whitespace in the source.
        if morpheme.start > cursor:
            pieces.append(' ')
        pieces.append(morpheme.base_form or morpheme.form)
        cursor = morpheme.end

    return ''.join(pieces)

# Add a normalized copy of each speaker's transcript.
normalized_columns = {
    'transcript_worker': 'transcript_worker_normalized',
    'transcript_customer': 'transcript_customer_normalized',
    'transcript_coworker': 'transcript_coworker_normalized',
}
for source_column, target_column in normalized_columns.items():
    text_df[target_column] = text_df[source_column].apply(normalize_to_base_form)

### Feature Extraction

#### Emotional Lexicon-based feature
    * Total Emotion word Count (TEC): 문서 내 감정과 연관된 단어의 개수를 카운트하는 feature.
    * Total Emotion word Intensity (TEI): 문서에 포함된 단어의 감정 강도 점수를 합산한 feature.

* github link: https://github.com/park1200656/KnuSentiLex?tab=readme-ov-file


In [None]:
# Initialise Kiwi
kiwi = Kiwi()

# Load the sentiment lexicon
with open('/SentiWord_info.json', encoding='utf-8-sig', mode='r') as lexicon_file:
    SentiWord_info = json.load(lexicon_file)
sentiword_df = pd.DataFrame(SentiWord_info)

# word -> polarity lookup, with the polarity converted to int
sentiword_dict = dict(zip(sentiword_df['word'], map(int, sentiword_df['polarity'])))

# Positive/negative score and word count for a piece of text
def calculate_sentence_emotions(text):
    """Score ``text`` against the sentiment lexicon.

    Every token whose form is in ``sentiword_dict`` with a non-zero polarity
    counts once; its absolute polarity is added to the positive or negative
    score depending on its sign.

    Args:
        text: Text to tokenize and score.

    Returns:
        Tuple ``(positive_score, negative_score, positive_count, negative_count)``.
    """
    positive_hits = []
    negative_hits = []

    for token in kiwi.tokenize(text):
        # Words missing from the lexicon behave like neutral (0) entries.
        polarity = sentiword_dict.get(token.form, 0)
        if polarity > 0:
            positive_hits.append(polarity)
        elif polarity < 0:
            negative_hits.append(-polarity)

    return sum(positive_hits), sum(negative_hits), len(positive_hits), len(negative_hits)


In [135]:
# Add the emotion scores and word counts as new columns, one group of four
# per speaker: (normalized transcript column, [score+, score-, count+, count-]).
emotion_targets = [
    ('transcript_worker_normalized',
     ['(R)_worker_transcript_positive_score', '(R)_worker_transcript_negative_score', '(R)_worker_transcript_positive_count', '(R)_worker_transcript_negative_count']),
    ('transcript_customer_normalized',
     ['(S)_customer_transcript_positive_score', '(S)_customer_transcript_negative_score', '(S)_customer_transcript_positive_count', '(S)_customer_transcript_negative_count']),
    ('transcript_coworker_normalized',
     ['(S)_coworker_transcript_positive_score', '(S)_coworker_transcript_negative_score', '(S)_coworker_transcript_positive_count', '(S)_coworker_transcript_negative_count']),
]
for source_column, emotion_columns in emotion_targets:
    text_df[emotion_columns] = text_df[source_column].apply(
        lambda x: pd.Series(calculate_sentence_emotions(x))
    )

#### TF-IDF

In [5]:
# Drop every row that still has a missing value in any column.
# NOTE: the index is not reset here, so the remaining row labels may have gaps.
text_df = text_df.dropna()

In [None]:
# Morpheme tokenization

kiwi = Kiwi()
def tokenize_with_kiwi(text):
    """Split ``text`` into morphemes using the first Kiwi analysis result.

    Args:
        text: Text to analyse.

    Returns:
        List with the first field (the form) of every token in that result.
    """
    first_result = kiwi.analyze(text)[0]
    morphemes = first_result[0]
    return [morpheme[0] for morpheme in morphemes]


# TF-IDF features for the worker, customer and co-worker transcripts.
# NOTE(review): TfidfVectorizer is not imported anywhere in the visible source;
# presumably `from sklearn.feature_extraction.text import TfidfVectorizer` is
# needed before this cell runs - confirm.
# NOTE(review): an earlier comment here mentioned lowercase=False, but that
# argument is not passed, so the vectorizer keeps its default lowercasing.
tfidf_targets = (
    ('transcript_worker_normalized', "(R)_worker_transcript_"),
    ('transcript_customer_normalized', "(S)_customer_transcript_"),
    ('transcript_coworker_normalized', "(S)_coworker_transcript_"),
)
for source_column, column_prefix in tfidf_targets:
    vectorizer = TfidfVectorizer(tokenizer=tokenize_with_kiwi, max_features=50, min_df=10, max_df=0.7)
    tfidf_matrix = vectorizer.fit_transform(text_df[source_column])
    words = vectorizer.get_feature_names_out()
    print(words)
    modified_words = [f"{column_prefix}{word}" for word in words]

    # Reuse text_df's own index: rows may have been dropped earlier, and a
    # fresh 0..n-1 index would make the column-wise concat misalign rows and
    # introduce NaN rows.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=modified_words, index=text_df.index)
    text_df = pd.concat([text_df, tfidf_df], axis=1)


In [None]:
# Write the transcript feature table to disk.
# NOTE(review): this folder is spelled 'CALL_AUDO_statistics', while the audio
# outputs use 'CAL_AUDIO_statistics' - confirm which directory is intended.
text_dest_path = os.path.join(
    envs['DATA_PATH'],
    '3_feature_extraction',
    'CALL_AUDO_statistics',
    'transcript.csv',
)
text_df.to_csv(text_dest_path, index=False)