In [None]:
import pandas as pd
import os
import numpy as np
import json
from datetime import datetime
from datetime import timedelta
import time

with open("env.json") as f: # input your env file path
    envs = json.load(f)

# Filter criteria: date, pnum
start_date = datetime.strptime("2023-06-22", "%Y-%m-%d")
end_date = datetime.strptime("2023-07-21", "%Y-%m-%d")
pnum_CSR = [1,2,3,4,
        #5,
        #6
        7,8,9,10,
        #11,
        12,13,14,15,16,17,
        #18,
        19,20,21,22,23] # 5,6,11,18 : manager 


# Basic preprocessing

In [None]:
# 2. CALL LOG data
CALL_LOG_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"1_raw","CALL_LOG")
CALL_LOG_DEST_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed")

call_log_content = pd.read_csv(os.path.join(CALL_LOG_SOURCE_PATH ,'Call_log_contents.csv'))
call_log_time = pd.read_csv(os.path.join(CALL_LOG_SOURCE_PATH ,'Call_log_time.csv'))
print("len: call_log_content",len(call_log_content))


# 2.1. call contents preprocessing
## 2.1.1. Rename column
call_log_content = call_log_content.rename(columns = {'상담일시':'start','상담유형':'agreement',
                                       '문의내용':'question','답변내용':'answer',
                                       '상담사':'pnum','통화시간':'total_duration',
                                       '불만여부':'complain'})

# 2.1.2. Delete call log
call_log_content = call_log_content.dropna(axis=0,subset=['start'])
call_log_content = call_log_content[~((call_log_content['question'].str.contains('문자', na=False)) &(call_log_content['total_duration']=="00:00:00"))] # 문자 상담인 경우 제거.
call_log_content = call_log_content[~((call_log_content['question'].str.contains('문자', na=False)) &(call_log_content['total_duration'].isnull()))] # 문자 상담인 경우 제거.
print("len: call_log_content - 문자 제거",len(call_log_content))

# 2.1.3. Change the data type and value 
def change_format_startime(string_value):
    try:
        string_value=str(string_value)
        date_changed=datetime.strptime(string_value, '%Y-%m-%d %H:%M')
    except:
        string_value=str(string_value)
        date_changed=datetime.strptime(string_value, '%Y-%m-%d %H:%M:%S')
    return date_changed

def change_format_duration(string_value):
    if pd.isna(string_value):
        #print(f"Encountered NaN for value: {string_value}")
        return string_value
    else:
        string_value = str(string_value)
        if 'days' in string_value:
            string_value = string_value.split()[-1]
        string_fin = datetime.strptime(string_value,"%H:%M:%S")
        date_changed = timedelta(hours=string_fin.hour, minutes=string_fin.minute, seconds=string_fin.second)
        return date_changed
    
def delta_to_seconds(value):
    if isinstance(value, str):
        print(f"Encountered a string instead of a timedelta object: {value}")
        return None    
    try:
        seconds_result = value.total_seconds()
        return seconds_result
    except AttributeError:
        print(f"The object passed does not have a 'total_seconds' method. Received: {value}")
        return None
    
def type_agreement_change(value):
    if value=="In(동의)":
        return 1
    elif value=="In(미동의)":
        return 0
    elif value=="Out": # Out
        return 2
    else:
        return 2 # IN

def type_complain_change(value):
    if value=="불만":
        return 1
    elif value=="칭찬":
        return 2
    else:
        return 0

def pnum_change(value):
    value = str(value)
    value = value[-2:]  
    try:
        int_val = int(value)
        return int_val
    except ValueError: # 22번 pnum 같은 경우, 가명처리가 안된 경우가 존재했음음
        if value == '미선':
            return 22
        print(f"Could not convert value '{value}' to integer.")
        return value

call_log_content["start"]=call_log_content["start"].apply(change_format_startime)
call_log_content["start"] = pd.to_datetime(call_log_content['start'])
call_log_content["total_duration"]=call_log_content["total_duration"].apply(change_format_duration) # mute 시간을 포함한 전체 전화 상담 시간
call_log_content["total_duration"]=call_log_content["total_duration"].apply(delta_to_seconds)
call_log_content["agreement"]=call_log_content["agreement"].apply(type_agreement_change)
call_log_content["complain"]=call_log_content["complain"].apply(type_complain_change)
call_log_content["pnum"]=call_log_content["pnum"].apply(pnum_change)

# 2.1.4. Duplicated data
# null값으로 되어 있는 total duration을 제외하고 나머지 기록이 (e.g., start, num, question, agreement) 모두 동일한 중복 데이터 존재
# null값인 total_duration 중복행으로 채움
# 중복 기록된 행 제거거
nan_mask_duration = call_log_content['total_duration'].isna()
for index, row in call_log_content[nan_mask_duration].iterrows():
    matching_rows = call_log_content[
        (call_log_content['start'] == row['start']) & 
        (call_log_content['pnum'] == row['pnum']) & 
        (call_log_content['question'] == row['question']) &
        (~call_log_content['total_duration'].isna())
    ]
    if not matching_rows.empty:
        call_log_content.at[index, 'total_duration'] = matching_rows.iloc[0]['total_duration']

# answer = start, num, question, agreement가 모두 같은 행의 answer으로 대체
nan_mask_answer = call_log_content['answer'].isna()
for index, row in call_log_content[nan_mask_answer].iterrows():
    matching_rows = call_log_content[
        (call_log_content['start'] == row['start']) & 
        (call_log_content['pnum'] == row['pnum']) & 
        (call_log_content['question'] == row['question']) &
        (~call_log_content['answer'].isna())
    ]
    if not matching_rows.empty:
        call_log_content.at[index, 'answer'] = matching_rows.iloc[0]['answer']

# Duplicated data 없애기
call_log_content = call_log_content.drop_duplicates(keep='first')
print("len: call_log_content - 중복 행 통합 이후",len(call_log_content))

# 2.1.5. Filter 
call_log_content = call_log_content[call_log_content['pnum'].isin(pnum_CSR)]
call_log_content['date'] = pd.to_datetime(call_log_content['start']).dt.date
call_log_content  = call_log_content[(call_log_content['date']>=start_date.date()) & (call_log_content['date']<=end_date.date())]
call_log_content.drop(['date'],axis=1,inplace=True)
print("len: call_log_content - pnum, date 선별 이후",len(call_log_content))

# 2.1.6 Sort 
call_log_content = call_log_content.sort_values(by=['pnum', 'start']) # (17725, 8) , total_duration에 8개 missing 


# 2.2 call log time preprocessing
call_log_time = pd.read_csv(os.path.join(CALL_LOG_SOURCE_PATH ,'Call_log_time.csv'))

# 2.2.1. Make time column
call_log_time.loc[:,'start_second'] = pd.to_datetime(call_log_time['통화일자'] + ' ' + call_log_time['통화시간'])

# 2.2.2. Select column and reanme
call_log_time = call_log_time[['start_second','상담원명','통화','구분']].rename(columns={'상담원명':'pnum','통화' : 'active_duration', '구분' : 'in/out'})

# 2.2.3 Change the data type and value 
call_log_time['pnum'] = call_log_time['pnum'].str.replace('p', '').astype(int)
call_log_time['active_duration'] = pd.to_timedelta(call_log_time['active_duration']).dt.total_seconds().astype(float) #  mute 시간을 제거한한 전체 전화 상담 시간
call_log_time['in/out'] = call_log_time['in/out'].apply(lambda x: -1 if x=="발신" else 0 if x=="수신" else 2)


# 2.2.4 Filter 
print("len: call_log_time",len(call_log_time))
call_log_time = call_log_time[call_log_time['pnum'].isin(pnum_CSR)]
call_log_time['date'] = pd.to_datetime(call_log_time['start_second']).dt.date
call_log_time  = call_log_time[(call_log_time['date']>=start_date.date()) & (call_log_time['date']<=end_date.date())]
call_log_time.drop(['date'],axis=1,inplace=True)
print("len: call_log_time- pnum, date 선별 이후",len(call_log_time))
# Sort 
call_log_time = call_log_time.sort_values(by=['pnum', 'start_second']) # (17871,4)



len: call_log_content 18201
len: call_log_content - 문자 제거 18193
len: call_log_content - 중복 행 통합 이후 18160
len: call_log_content - pnum, date 선별 이후 17904
len: call_log_time 18469
len: call_log_time- pnum, date 선별 이후 18434


# Merge
* Call log content : 상담사 분들이 상담 내용을 기록한 로그
* Call log time : 상담사 분들의 전화 상담 내용을 녹음한 음원 관리 서버에서 기록된 로그

### 1차 merge

In [3]:
all_merged_data = pd.DataFrame()
not_merged_content = pd.DataFrame()
not_merged_time = pd.DataFrame()


for pnum in pnum_CSR:

    pnum_call_content = call_log_content[call_log_content['pnum'] == pnum]
    pnum_call_time = call_log_time[call_log_time['pnum'] == pnum]

    # sort
    pnum_call_content = pnum_call_content.sort_values(['start'])
    pnum_call_time = pnum_call_time.sort_values(['start_second'])

    # 1차 병합 - forward (matching 1)
    # 작동 방식: left-join except that we match on nearest key rather than equal keys
    # 이유: content에 있는 call log가 min truncated이기 때문에 더 빨리 기록됨. 
    first_merge = pd.merge_asof(pnum_call_content, pnum_call_time, left_on='start', right_on='start_second', 
                                by='pnum', direction='forward', tolerance=pd.Timedelta(minutes=1), allow_exact_matches=True)

    first_merge_cleaned = first_merge.dropna(subset=['start_second']) # success merging
    first_merge_cleaned['matching'] = 1


    # 2차 병합 - bacward (matching 1)
    # 작동 방식: 1차 병합 때, start_second에 null값이 발생한 content에 대해서 사용되지 않았던 pnum_call_time으로 matching 시도
    # 이유: content에 있는 call log 중 second가 60초에 가까울 경우 min+1로 기록됨. (ex. 1:59이면, 원래 1:00이어야 하나 2:00으로 기록됨)
    first_merge_failed_time = pnum_call_time[~pnum_call_time['start_second'].isin(first_merge_cleaned['start_second'])]
    first_merge_failed_content = first_merge[first_merge['start_second'].isnull()][['start', 'agreement', 'question', 'answer', 'pnum', 'total_duration', 'complain']] 

    second_merge = pd.merge_asof(first_merge_failed_content, 
                                first_merge_failed_time, left_on='start', right_on='start_second', by='pnum', 
                                direction='backward', tolerance=pd.Timedelta(minutes=1), allow_exact_matches=False) 
    second_merge_cleaned = second_merge.dropna(subset=['start_second'])
    second_merge_cleaned['matching'] = 1

    # Concat 1차 병합과 2차 병합 결과
    merged_data = pd.concat([first_merge_cleaned, second_merge_cleaned],axis=0)

    # 병합되지 않은 content과 time 데이터 
    # not_merged_data = pnum_call_content.loc[~pnum_call_content['start'].isin(merged_data['start']).index]

    not_merged_content_data = pnum_call_content.merge(merged_data[['start', 'agreement', 'question', 'answer', 'pnum', 'total_duration',
       'complain']], on=['start', 'pnum'], how='left', indicator=True,suffixes=["","_y"])
    # print(not_merged_content_data['_merge'].value_counts())
    not_merged_content_data = not_merged_content_data[not_merged_content_data['_merge']=="left_only"] [['start', 'agreement', 'question', 'answer', 'pnum', 'total_duration','complain']]
    # print(len(not_merged_content_data))
    # print()

    
    not_merged_time_data = pnum_call_time.merge(merged_data[['start_second','pnum','active_duration','in/out']], on=['start_second', 'pnum'], how='left', indicator=True,suffixes=["","_y"])
    # print(not_merged_time_data['_merge'].value_counts())
    not_merged_time_data = not_merged_time_data[not_merged_time_data['_merge']=="left_only"][['start_second','pnum','active_duration','in/out']]
    # print(len(not_merged_time_data))
    # print()


    # 결과를 all_merged_data에 추가
    all_merged_data = pd.concat([all_merged_data, merged_data],axis=0)
    not_merged_content = pd.concat([not_merged_content, not_merged_content_data],axis=0)
    not_merged_time = pd.concat([not_merged_time, not_merged_time_data],axis=0)

# index sort
all_merged_data = all_merged_data.reset_index(drop=True)
not_merged_content  = not_merged_content.reset_index(drop=True)
not_merged_time = not_merged_time.reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_merge_cleaned['matching'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_merge_cleaned['matching'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_merge_cleaned['matching'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col

#### duplicated data

In [4]:
# Preprocessing for duplicated data
display(all_merged_data[all_merged_data[['pnum','start_second']].duplicated(keep=False)].info())
all_merged_data[all_merged_data[['pnum','start_second']].duplicated(keep=False)]

<class 'pandas.core.frame.DataFrame'>
Index: 28 entries, 2951 to 17103
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   start            28 non-null     datetime64[ns]
 1   agreement        28 non-null     int64         
 2   question         28 non-null     object        
 3   answer           21 non-null     object        
 4   pnum             28 non-null     int64         
 5   total_duration   21 non-null     float64       
 6   complain         28 non-null     int64         
 7   start_second     28 non-null     datetime64[ns]
 8   active_duration  28 non-null     float64       
 9   in/out           28 non-null     float64       
 10  matching         28 non-null     int64         
dtypes: datetime64[ns](2), float64(3), int64(4), object(2)
memory usage: 2.6+ KB


None

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching
2951,2023-07-03 18:01:00,2,213번 배차간격/,34분\n목원대 출발시간/18:32\n,4,51.0,0,2023-07-03 18:01:02,19.0,0.0,1
2952,2023-07-03 18:01:00,2,시청 당직실 연결요청/,270-2222 연결,4,20.0,0,2023-07-03 18:01:02,19.0,0.0,1
4046,2023-07-08 12:32:00,2,1.뺑소니 관련으로 cctv영상 볼수 있는지,1.- 정보공개청구신청 후 10일이내 열람가능여부 통보 \n -3자정보 있을시...,7,246.0,0,2023-07-08 12:32:06,46.0,0.0,1
4047,2023-07-08 12:32:00,0,1. 전세사기 상담 문의,1.-270 6521~7\n -금일 상담 불가. 평일 전화 안내,7,46.0,0,2023-07-08 12:32:06,46.0,0.0,1
4274,2023-07-15 11:34:00,0,무응,무응,7,16.0,0,2023-07-15 11:34:18,17.0,0.0,1
4275,2023-07-15 11:34:00,1,1. 대전역 - 가오동 홈플러스,1. -건너 514 홈플러스가오점,7,70.0,0,2023-07-15 11:34:18,17.0,0.0,1
9475,2023-07-08 10:08:00,1,동물사체 신고/ 유성구,유성구청 당직실 0426112222연결,14,36.0,1,2023-07-08 10:08:02,36.0,0.0,1
9476,2023-07-08 10:08:00,2,문화동 홈플러스->가장교,"서대전역네거리\n311,201,314,613,608,612,513,202,622\n...",14,108.0,0,2023-07-08 10:08:02,36.0,0.0,1
9856,2023-07-15 15:46:00,1,서구 변동 4-14 수도 터졌는지문의,서부사업소 당직실 042-715-6908 직통번호 안내후 연결,14,45.0,1,2023-07-15 15:46:45,46.0,0.0,1
9857,2023-07-15 15:46:00,2,종로3가역->고속터미널역 이동 문의,대전시청으로 타지역 노선 확인어려움 양해,14,24.0,0,2023-07-15 15:46:45,46.0,0.0,1


In [5]:
# 첫번째 중복 타입 (14건건): null인 total duration을 제외한 나머지가 동일한 쌍인 경우

nan_mask_duration = all_merged_data['total_duration'].isna()
for index, row in all_merged_data[nan_mask_duration].iterrows():
    matching_rows = all_merged_data[
        (all_merged_data['start_second'] == row['start_second']) & 
        (all_merged_data['pnum'] == row['pnum']) & 
        (~all_merged_data['total_duration'].isna())
    ]
    if not matching_rows.empty:
        all_merged_data.at[index, 'total_duration'] = matching_rows.iloc[0]['total_duration']

# 두 행 정보 합치기
duplicsated_all_data = all_merged_data[all_merged_data[['pnum','start_second']].duplicated(keep=False)]

nan_mask_answer = duplicsated_all_data['answer'].isna()
for index, row in duplicsated_all_data[nan_mask_answer].iterrows():
    matching_rows = duplicsated_all_data[
        (duplicsated_all_data['start_second'] == row['start_second']) & 
        (duplicsated_all_data['pnum'] == row['pnum']) & 
        (~duplicsated_all_data['answer'].isna())
    ]
    if not matching_rows.empty:
        all_merged_data.at[index, 'answer'] = matching_rows.iloc[0]['answer']

# nan 값에 대한 null값을 채우면서 생긴 중복값 없애기
all_merged_data = all_merged_data.drop_duplicates(subset=['pnum','start_second','answer','total_duration'],keep='last')

In [7]:
# 두번째 중복 타입 (14건) : call center에 전화로 문의로 매칭되는 음원만 남김김
all_merged_data[all_merged_data[['pnum','start_second']].duplicated(keep=False)].info() 

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 2951 to 15684
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   start            14 non-null     datetime64[ns]
 1   agreement        14 non-null     int64         
 2   question         14 non-null     object        
 3   answer           14 non-null     object        
 4   pnum             14 non-null     int64         
 5   total_duration   14 non-null     float64       
 6   complain         14 non-null     int64         
 7   start_second     14 non-null     datetime64[ns]
 8   active_duration  14 non-null     float64       
 9   in/out           14 non-null     float64       
 10  matching         14 non-null     int64         
dtypes: datetime64[ns](2), float64(3), int64(4), object(2)
memory usage: 1.3+ KB


In [8]:
all_merged_data[all_merged_data[['pnum','start_second']].duplicated(keep=False)]

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching
2951,2023-07-03 18:01:00,2,213번 배차간격/,34분\n목원대 출발시간/18:32\n,4,51.0,0,2023-07-03 18:01:02,19.0,0.0,1
2952,2023-07-03 18:01:00,2,시청 당직실 연결요청/,270-2222 연결,4,20.0,0,2023-07-03 18:01:02,19.0,0.0,1
4046,2023-07-08 12:32:00,2,1.뺑소니 관련으로 cctv영상 볼수 있는지,1.- 정보공개청구신청 후 10일이내 열람가능여부 통보 \n -3자정보 있을시...,7,246.0,0,2023-07-08 12:32:06,46.0,0.0,1
4047,2023-07-08 12:32:00,0,1. 전세사기 상담 문의,1.-270 6521~7\n -금일 상담 불가. 평일 전화 안내,7,46.0,0,2023-07-08 12:32:06,46.0,0.0,1
4274,2023-07-15 11:34:00,0,무응,무응,7,16.0,0,2023-07-15 11:34:18,17.0,0.0,1
4275,2023-07-15 11:34:00,1,1. 대전역 - 가오동 홈플러스,1. -건너 514 홈플러스가오점,7,70.0,0,2023-07-15 11:34:18,17.0,0.0,1
9475,2023-07-08 10:08:00,1,동물사체 신고/ 유성구,유성구청 당직실 0426112222연결,14,36.0,1,2023-07-08 10:08:02,36.0,0.0,1
9476,2023-07-08 10:08:00,2,문화동 홈플러스->가장교,"서대전역네거리\n311,201,314,613,608,612,513,202,622\n...",14,108.0,0,2023-07-08 10:08:02,36.0,0.0,1
9856,2023-07-15 15:46:00,1,서구 변동 4-14 수도 터졌는지문의,서부사업소 당직실 042-715-6908 직통번호 안내후 연결,14,45.0,1,2023-07-15 15:46:45,46.0,0.0,1
9857,2023-07-15 15:46:00,2,종로3가역->고속터미널역 이동 문의,대전시청으로 타지역 노선 확인어려움 양해,14,24.0,0,2023-07-15 15:46:45,46.0,0.0,1


In [9]:
# 14건 중 total_duration과 active_duration이 비슷한 것이 잘 매칭된 것으로 보고 크게 차이 나는건 다시 매칭하기
duplicated_data = all_merged_data[all_merged_data.duplicated(subset=['start_second', 'pnum'], keep=False)] # start_second가 중복되는 데이터 확인
eliminate_rows_list = []
keep_rows_list = []
for _, group in duplicated_data.groupby(['start', 'pnum']):
    group['duration_diff'] = abs(group['total_duration'] - group['active_duration'])
    
    eliminate_row = group.loc[group['duration_diff'].idxmax()]
    eliminate_rows_list.append(eliminate_row)

    for idx, row in group.iterrows():
        if idx != eliminate_row.name:
            keep_rows_list.append(row)
eliminate_df = pd.DataFrame(eliminate_rows_list)
keep_df = pd.DataFrame(keep_rows_list)

all_merged_data = all_merged_data.drop(index=eliminate_df.index) # 크게 all_merged_data에서 제거하기
not_merged_content = pd.concat([not_merged_content, eliminate_df[['start', 'agreement', 'question', 'answer', 'pnum', 'total_duration',
       'complain']]], ignore_index=True) # all_merged_data에서 제거된 것을 not_merged_content에 추가하기

# answer 관련 missing data 처리하기 
all_merged_data['answer'] = all_merged_data['answer'].fillna("") 
all_merged_data = all_merged_data.reset_index(drop=True)

In [10]:
all_merged_data[all_merged_data[['pnum','start_second']].duplicated(keep=False)] # 모두 처리 완료

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching


In [11]:
len(all_merged_data)

17874

#### abnormal data

In [12]:
# 이전 코드 수정 기록

# 발신 데이터와 수신데이터가 merge된 상황. 
# 확인 결과. content data의 agreement가 Out이 아닌 1,2로 기재됨. 하지만, question을 보면 아웃이라고 써져 있음. 
# display(all_merged_data[(all_merged_data['in/out']==-1) &(all_merged_data['agreement']!=-1)])
# 따라서, 위에 전처리 코드에서 agreement 데이터 Out을 -1로 뒀던 것을 2 (무응답)로 수정 

In [13]:
# abnormal data: total_duration이(mute 포함 전체 전화 상담 시간) active_duration보다 (mute 제외 전체 전화 상담 시간간) 작은 경우 

all_merged_data['mute'] = all_merged_data["total_duration"] - all_merged_data["active_duration"]
print(all_merged_data[all_merged_data['mute']<-17].info()) # 17초 차이가 최대였음음

#녹음본을 3개 확인해본 결과. total_duration이 잘못기재 되어 있음. 따라서  active duration값으로 변경함. 
all_merged_data.loc[all_merged_data['mute'] < -5, 'total_duration'] = all_merged_data.loc[all_merged_data['mute'] < -5, 'active_duration']
all_merged_data['mute'] = all_merged_data["total_duration"] - all_merged_data["active_duration"]
all_merged_data.drop(['mute'],axis=1,inplace=True)

<class 'pandas.core.frame.DataFrame'>
Index: 286 entries, 753 to 17102
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   start            286 non-null    datetime64[ns]
 1   agreement        286 non-null    int64         
 2   question         286 non-null    object        
 3   answer           286 non-null    object        
 4   pnum             286 non-null    int64         
 5   total_duration   286 non-null    float64       
 6   complain         286 non-null    int64         
 7   start_second     286 non-null    datetime64[ns]
 8   active_duration  286 non-null    float64       
 9   in/out           286 non-null    float64       
 10  matching         286 non-null    int64         
 11  mute             286 non-null    float64       
dtypes: datetime64[ns](2), float64(4), int64(4), object(2)
memory usage: 29.0+ KB
None


### 2차 merge 

In [14]:
# 3차 병합 - forward(matching 2)
# 작동 방식: 병합이 안된 데이터들끼리 서로 병합하기 - forward
not_merged_content= not_merged_content.sort_values(by=['start']).reset_index(drop=True)
not_merged_time = not_merged_time.sort_values(by=['start_second']).reset_index(drop=True)
third_merge= pd.merge_asof(not_merged_content, not_merged_time, left_on='start', right_on='start_second', 
                                by='pnum', direction='forward',tolerance=pd.Timedelta(minutes=3), allow_exact_matches=True)# tolerance 제거. 녹음본 3개 확인 결과 시간이 2~3분 떨어져 있어도 매칭이 됨. 
third_merge_cleaned = third_merge.dropna(subset=['start_second'])
third_merge_cleaned['matching'] = 2

# 4차 병합 - backward (matching 2)
# 작동 방식: 병합이 안된 데이터들끼리 서로 병합하기
not_merged_content = third_merge[third_merge['start_second'].isnull()][['start', 'agreement', 'question', 'answer', 'pnum', 'total_duration','complain']]
not_merged_time_data = not_merged_time.merge(third_merge_cleaned[['start_second','pnum','active_duration','in/out']], on=['start_second', 'pnum'], how='left', indicator=True,suffixes=["","_y"])
not_merged_time_data = not_merged_time_data[not_merged_time_data['_merge']=="left_only"][['start_second','pnum','active_duration','in/out']]


# 정렬
not_merged_content= not_merged_content.sort_values(by=['start']).reset_index(drop=True)
not_merged_time = not_merged_time.sort_values(by=['start_second']).reset_index(drop=True)

forth_merge = pd.merge_asof(not_merged_content, not_merged_time, left_on='start', right_on='start_second', 
                                by='pnum', direction='backward',tolerance=pd.Timedelta(minutes=1), allow_exact_matches=True)
forth_merge_cleaned = forth_merge.dropna(subset=['start_second'])
forth_merge_cleaned['matching'] = 2


# # 3차 병합 (matching 4 - content에만 있는 data, matching 5 - time에만 있는 데이터)
# # 작동 방식: 병합이 안되는 데이터 합치기
not_merged_content = forth_merge[forth_merge['start_second'].isnull()][['start', 'agreement', 'question', 'answer', 'pnum', 'total_duration','complain']] # 매칭되는게 없는 건 3개. 확인 결과 total_duration이 0, 또한 문자 발송으로 추정됨.
not_merged_time = not_merged_time[~not_merged_time['start_second'].isin(third_merge_cleaned['start_second'])]
not_merged_time ['matching']  = 3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  third_merge_cleaned['matching'] = 2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  forth_merge_cleaned['matching'] = 2


In [15]:
len(third_merge_cleaned), len(forth_merge_cleaned)

(18, 2)

In [16]:
len(not_merged_content)

3

In [17]:
not_merged_content

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain
0,2023-07-10 12:22:00,2,아웃/ 중리네거리에서 용전네거리 방향 풍년삼계탕앞 우회전 구간에 노란색 두줄선인데도...,부재,22,0.0,1
3,2023-07-17 11:29:00,2,버스불편/담당자연락동의\n냉난방\n1.발생일시: 2023.7.17.(월) 오전10시...,\n▶처리중◀ [2023-07-17 17:17:16 김병선]\n7월 3차 민원대장 ...,17,0.0,1
4,2023-07-17 16:53:00,2,60마리 닭 튀기는 곳 전화번호/,60계치킨 전화번호 042-114로 문의 안내,10,0.0,0


### 3차 merge

In [18]:
# 모든 데이터 합치기
dfs = [all_merged_data, third_merge_cleaned, forth_merge_cleaned, not_merged_time]
final_call_log = pd.concat(dfs,axis=0 ,ignore_index=True)

# Second Preprocessing

In [22]:
# Missing data 처리 - total_duration
final_call_log['total_duration'] = final_call_log['total_duration'].fillna(final_call_log['active_duration'])

# feature engineering : end, mute, date
final_call_log['end'] = final_call_log['start_second'] + pd.to_timedelta(final_call_log['total_duration'], unit='s')
final_call_log['mute'] = final_call_log["total_duration"] - final_call_log["active_duration"]
final_call_log['date'] = final_call_log['start_second'].dt.date

# abnormal data 처리 
display(final_call_log[final_call_log['mute'] < -5]) # 없음
final_call_log['mute'] = final_call_log['mute'].apply(lambda x: 0 if x<0 else x)

# print(final_call_log.info())
display(final_call_log[final_call_log[['pnum','start_second']].duplicated(keep=False)])
final_call_log = final_call_log.drop(index=[18335,18374])# 확인해본 결과 2쌍 있었는데, content없는건 abnormal data로 취급하고 제거거
display(final_call_log[final_call_log[['pnum','start_second']].duplicated(keep=False)])
# print(final_call_log.info())

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date


Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date
17892,2023-07-12 12:57:00,2.0,602번 비래동 막차/,평일기준 22:40,21,37.0,0.0,2023-07-12 12:56:59,37.0,0.0,2,2023-07-12 12:57:36,0.0,2023-07-12
17893,2023-07-15 15:46:00,2.0,종로3가역->고속터미널역 이동 문의,대전시청으로 타지역 노선 확인어려움 양해,14,24.0,0.0,2023-07-15 15:45:56,25.0,0.0,2,2023-07-15 15:46:20,0.0,2023-07-15
18335,NaT,,,,21,37.0,,2023-07-12 12:56:59,37.0,0.0,3,2023-07-12 12:57:36,0.0,2023-07-12
18374,NaT,,,,14,25.0,,2023-07-15 15:45:56,25.0,0.0,3,2023-07-15 15:46:21,0.0,2023-07-15


Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date


In [None]:
# feature engineering : break (call과 call 사이의 쉬는 시간) 
# beforwork survey불러오기
BEFOREWORK_DAILY_SOURCE_PATH = os.path.join(envs['DATA_PATH'],"2_preprocessed","SURVEY_DAILY")
before_work = pd.read_csv(os.path.join(BEFOREWORK_DAILY_SOURCE_PATH,"daily_before_work.csv"),parse_dates =['date'])
answer_time =['before_work_stand_time', 'before_work_sleep_time',
       'before_work_wake_time', 'before_work_general_health_time',
       'before_work_stress_time', 'before_work_arousal_time']
before_work['timestamp'] = before_work[answer_time].max(axis=1)
before_work['timestamp'] = pd.to_datetime(before_work['before_work_stand_time']/1000, unit ='s').dt.tz_localize('UTC').dt.tz_convert('Asia/Seoul').dt.floor('s')
before_work['timestamp']= before_work['timestamp'].dt.tz_localize(None)

def calculate_break(group):
    group['break'] = group['start_second'] - group['end'].shift(1)

    # 첫번째 통화 같은 경우, break값을 계산할 때, daily survey의 시간을 기준으로 결정
    pnum = group['pnum'].iloc[0]
    date = group['date'].iloc[0]
    before_work_row = before_work[(before_work['pnum'] == pnum) & (before_work['date'] == pd.Timestamp(date))]
    if not before_work_row.empty:
        before_work_timestamp = before_work_row['timestamp'].iloc[0]
        group['break'].iloc[0] = pd.Timedelta(group['start_second'].iloc[0] - before_work_timestamp)
        # group['break'].iloc[0] = max(pd.Timedelta(group['start_second'].iloc[0] - before_work_timestamp),pd.Timedelta(seconds=0))
    else:
        group['break'].iloc[0] = pd.Timedelta(seconds=0) # or any other default value

    return group


# sort values
final_call_log = final_call_log.sort_values(['pnum','start_second']).reset_index(drop=True)
final_call_log = final_call_log.groupby(['pnum','date']).apply(calculate_break)
final_call_log.reset_index(drop=True, inplace=True)
final_call_log["break"] = final_call_log["break"].apply(delta_to_seconds)
final_call_log = final_call_log.sort_values(['pnum','start_second']).reset_index(drop=True)


* abnormal data: break가 0보다 작은 경우
    * 확인 결과: 1분 안에 여러번의 call이 있는 경우, break가 0보다 경우가 발생함. 
    * 대처 방식: duration을 기준으로 결과 변경 해야 함

In [35]:
# abnormal data check : break가 0보다 작은 경우
check= final_call_log.groupby(['pnum','date']).first()['break']
check_df= check[check <0]
check_df
final_call_log[(final_call_log['pnum'] == 2) & (final_call_log['date'] == pd.Timestamp("2023-07-04").date())].head(5)

display(final_call_log[final_call_log['break'] < 0].info())

# 조건에 맞는 행의 인덱스 찾기
condition = (final_call_log['break'] < 0) | (final_call_log['break'].isnull())
indices = final_call_log[condition].index

# 앞뒤 행의 인덱스를 포함한 리스트 만들기
all_indices = set(indices)
for idx in indices:
    if idx > 0:
        # all_indices.add(idx - 2)
        all_indices.add(idx - 1)
    if idx < len(final_call_log) - 1:
        all_indices.add(idx + 1)
        # all_indices.add(idx + 2)

# 필터링된 데이터프레임 생성
final_call_log.loc[sorted(all_indices)]


<class 'pandas.core.frame.DataFrame'>
Index: 24 entries, 598 to 17934
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   start            12 non-null     datetime64[ns]
 1   agreement        12 non-null     float64       
 2   question         12 non-null     object        
 3   answer           12 non-null     object        
 4   pnum             24 non-null     int64         
 5   total_duration   24 non-null     float64       
 6   complain         12 non-null     float64       
 7   start_second     24 non-null     datetime64[ns]
 8   active_duration  24 non-null     float64       
 9   in/out           24 non-null     float64       
 10  matching         24 non-null     int64         
 11  end              24 non-null     datetime64[ns]
 12  mute             24 non-null     float64       
 13  date             24 non-null     object        
 14  break            24 non-null     float64    

None

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date,break
597,2023-07-17 09:05:00,2.0,"아웃/ 민원인 전달요청건/ 자연재난 복구시 필요한 장비 빌려줄 수 있음, 자원봉사 ...",전달완료,1,79.0,0.0,2023-07-17 09:05:18,5.0,-1.0,1,2023-07-17 09:06:37,74.0,2023-07-17,109.0
598,NaT,,,,1,84.0,,2023-07-17 09:05:48,84.0,-1.0,3,2023-07-17 09:07:12,0.0,2023-07-17,-49.0
599,2023-07-17 09:13:00,1.0,2023년 대전광역시 인재개발원 강사수당등 지급기준\n대전시청 홈페이지에서 내용을 ...,"홈페이지에서 내용 확인되지 않음, 확인 후 문자로 안내",1,246.0,0.0,2023-07-17 09:13:00,173.0,0.0,1,2023-07-17 09:17:06,73.0,2023-07-17,348.0
1139,2023-06-30 17:44:00,2.0,긴급생계비지원 자격 조건 문의 / 구청에 통화를 했더니 담당자가 자리에 없다함/,,2,70.0,0.0,2023-06-30 17:44:17,70.0,0.0,1,2023-06-30 17:45:27,0.0,2023-06-30,1050.0
1140,NaT,,,,2,4.0,,2023-07-04 08:24:54,4.0,-1.0,3,2023-07-04 08:24:58,0.0,2023-07-04,-2085.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14703,2023-06-28 09:09:00,0.0,여권 지금 신청하면 언제 받을 수 있는지/ 기존여권 분실\n성인여권 신규발급/ 기...,평일 7일 소요. 빨리 나오면 카톡알림 드림.\n신분증 지참하여 방문 또는 정부24...,20,248.0,0.0,2023-06-28 09:09:48,249.0,0.0,1,2023-06-28 09:13:56,0.0,2023-06-28,-378.0
14704,2023-06-28 09:19:00,1.0,비오는 날 인도에서 넘어짐/ 대전시민이면 보상된다고 함,시민안전보험 말씀하신다면 보장내용 아님. 인도관리에 미흡한 부분 있었다면 구청으로 ...,20,106.0,1.0,2023-06-28 09:19:38,102.0,0.0,1,2023-06-28 09:21:24,4.0,2023-06-28,342.0
17933,2023-06-30 20:53:00,1.0,619번에 우산 놓고 내림,,23,43.0,1.0,2023-06-30 20:52:56,44.0,0.0,1,2023-06-30 20:53:39,0.0,2023-06-30,728.0
17934,2023-07-03 08:14:00,0.0,유천동 서대전 농협 ->서구 변동 사마 3길52번지/119타고 어디에서 내리는지,구농도원네거리 정류장에서 하차\n하차후 버스 왔던 방향으로 약 20m 되돌아간 후 ...,23,187.0,0.0,2023-07-03 08:14:05,160.0,0.0,1,2023-07-03 08:17:12,27.0,2023-07-03,-326.0


In [37]:
daily_abnormal_break = list(check_df.values)
abnoraml_break = final_call_log[final_call_log['break'] < 0]
abnoraml_break[~abnoraml_break['break'].isin(daily_abnormal_break)]

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date,break
598,NaT,,,,1,84.0,,2023-07-17 09:05:48,84.0,-1.0,3,2023-07-17 09:07:12,0.0,2023-07-17,-49.0
3930,NaT,,,,7,203.0,,2023-07-01 11:37:44,203.0,0.0,3,2023-07-01 11:41:07,0.0,2023-07-01,-174.0
4445,NaT,,,,7,3.0,,2023-07-15 10:54:33,3.0,-1.0,3,2023-07-15 10:54:36,0.0,2023-07-15,-1.0
4601,NaT,,,,7,72.0,,2023-07-18 15:02:45,72.0,-1.0,3,2023-07-18 15:03:57,0.0,2023-07-18,-54.0
6003,NaT,,,,9,112.0,,2023-07-07 15:59:42,112.0,-1.0,3,2023-07-07 16:01:34,0.0,2023-07-07,-75.0
6350,NaT,,,,9,56.0,,2023-07-18 18:40:42,56.0,-1.0,3,2023-07-18 18:41:38,0.0,2023-07-18,-14.0
7114,NaT,,,,10,66.0,,2023-07-19 09:25:52,66.0,-1.0,3,2023-07-19 09:26:58,0.0,2023-07-19,-48.0
8689,2023-07-09 17:43:00,2.0,아웃/ \n부사오거리<->서구동서대로1040번길 토담한식뷔패,부사오거리정류장\n119\n내동네거리\n------------------------...,13,103.0,0.0,2023-07-09 17:43:10,103.0,-1.0,1,2023-07-09 17:44:53,0.0,2023-07-09,-4.0
9054,NaT,,,,13,7.0,,2023-07-20 09:22:22,7.0,-1.0,3,2023-07-20 09:22:29,0.0,2023-07-20,-5.0
10496,NaT,,,,14,5.0,,2023-07-21 12:37:45,5.0,-1.0,3,2023-07-21 12:37:50,0.0,2023-07-21,-2.0


In [None]:
# 1분 안에 여러 콜이 발생하여 매칭이 잘 안된 경우, total_duration과 active_duration을 기준으로 더 가까운 것으로 매칭 시도 
final_call_log.loc[597,["start_second","active_duration"]] = [pd.to_datetime("2023-07-17 09:05:48"), 84]
final_call_log.loc[3929,["start_second","active_duration"]] = [pd.to_datetime("2023-07-01 11:37:44"), 203]
final_call_log.loc[4600,["start_second","active_duration"]] = [pd.to_datetime("2023-07-18 15:02:45"), 72]
final_call_log.loc[6002,["start_second","active_duration"]] = [pd.to_datetime("2023-07-07 15:59:42"), 112]
final_call_log.loc[6349,["start_second","active_duration"]] = [pd.to_datetime("2023-07-18 18:40:42"), 56]
final_call_log.loc[7113,["start_second","active_duration"]] = [pd.to_datetime("2023-07-19 09:25:52"), 66]

# break가 0보다 작은 것은 제거하기: 단, daily가 이상하게 기록되어 있기 때문에 잘못 된 것은 삭제 하지 않기
daily_abnormal_break = list(check_df.values)
abnoraml_break = final_call_log[final_call_log['break'] < 0]
drop_idx = abnoraml_break[~abnoraml_break['break'].isin(daily_abnormal_break)].index
final_call_log = final_call_log.drop(drop_idx,axis=0)
# final_call_log = final_call_log[final_call_log['break']>=0] # 24건 제외 : 짧은 call 기록인 경우가 대다수임

# feature 값 그에 따라 고쳐주기: end, break, mute
final_call_log['end'] = final_call_log['start_second'] + pd.to_timedelta(final_call_log['total_duration'], unit='s') # end
# sort values
final_call_log = final_call_log.sort_values(['pnum','start_second']).reset_index(drop=True)
final_call_log = final_call_log.groupby(['pnum','date']).apply(calculate_break)
final_call_log.reset_index(drop=True, inplace=True)
final_call_log["break"] = final_call_log["break"].apply(delta_to_seconds)


final_call_log['mute'] = final_call_log["total_duration"] - final_call_log["active_duration"]
final_call_log['mute'] = final_call_log['mute'].apply(lambda x: 0 if x<0 else x)

# sort values
final_call_log = final_call_log.sort_values(['pnum','start_second']).reset_index(drop=True)


#### Validation

In [39]:
final_call_log[final_call_log[['pnum','start_second']].duplicated(keep=False)] # 중복 되는 데이터 확인

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date,break


In [40]:
check_df

pnum  date      
2     2023-07-04   -2085.0
3     2023-07-03    -156.0
4     2023-06-27    -904.0
      2023-06-28    -173.0
      2023-07-03    -435.0
7     2023-06-29     -12.0
12    2023-06-27     -86.0
      2023-07-04     -92.0
      2023-07-12    -157.0
16    2023-07-14     -81.0
20    2023-06-28    -378.0
23    2023-07-03    -326.0
Name: break, dtype: float64

In [41]:
final_call_log[final_call_log['break'] < 0] # break 0보다 작은거 확인

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date,break
1139,NaT,,,,2,4.0,,2023-07-04 08:24:54,4.0,-1.0,3,2023-07-04 08:24:58,0.0,2023-07-04,-2085.0
2125,2023-07-03 12:02:00,2.0,타임월드 -> 서대전네거리역,갤러리아타임월드\n315\n서대전네거리역1번출구,3,59.0,0.0,2023-07-03 12:02:55,59.0,0.0,1,2023-07-03 12:03:54,0.0,2023-07-03,-156.0
2857,2023-06-27 08:03:00,2.0,로드킬 신고/유성구/,유성구청 당직실 042-611-2222 연결,4,48.0,1.0,2023-06-27 08:03:26,49.0,0.0,1,2023-06-27 08:04:14,0.0,2023-06-27,-904.0
2908,2023-06-28 09:04:00,0.0,성인여권 신규발급/,"본인내방,신분증,구여권,여권용 사진1+1,수수료(1년단수 20,000 10년복수 5...",4,57.0,0.0,2023-06-28 09:04:02,58.0,0.0,1,2023-06-28 09:04:59,0.0,2023-06-28,-173.0
3027,2023-07-03 12:01:00,2.0,"비래동 웰니스병원 쪽에 있음, 601번 현재위치/","비래동 종점에서 12:01 출발, 3개 경유 후 도착/ 비래동 웰니스병원~봉명동 바...",4,336.0,0.0,2023-07-03 12:01:30,280.0,0.0,1,2023-07-03 12:07:06,56.0,2023-07-03,-435.0
3817,2023-06-29 09:00:00,1.0,1.한국전력공사 문의,1. 042 123 안내,7,30.0,0.0,2023-06-29 09:00:19,30.0,0.0,1,2023-06-29 09:00:49,0.0,2023-06-29,-12.0
7316,2023-06-27 09:09:00,2.0,대전시청 옆에서 코로나 선별진료소 운영 하는지 ?/\n그럼 이제 어디서 65세이상 ...,운영 종료됨\n보건소로 내방 하셔야 함,12,48.0,1.0,2023-06-27 09:09:57,48.0,0.0,1,2023-06-27 09:10:45,0.0,2023-06-27,-86.0
7535,2023-07-04 09:06:00,0.0,필름번호판 필름 불량으로 교체 문의 / 부사동 발급,부사동 번호판제작소 스마일기업 042-242-1600 안내,12,67.0,1.0,2023-07-04 09:06:23,68.0,0.0,1,2023-07-04 09:07:30,0.0,2023-07-04,-92.0
7768,2023-07-12 08:15:00,2.0,조기페차 신청 가능한지 ?\n5등급 경유차량 지원금 얼마나 나오는지 ?\n\n,"가능 (인터넷,우편,이메일)로 가능함 \n\n최대 300만원 \n\n인터넷 신청 방...",12,110.0,0.0,2023-07-12 08:15:24,110.0,0.0,1,2023-07-12 08:17:14,0.0,2023-07-12,-157.0
12466,2023-07-14 12:04:00,0.0,계량기가 물에잠겨있음.,서부사 042-715-6772연결,16,52.0,0.0,2023-07-14 12:04:38,52.0,0.0,1,2023-07-14 12:05:30,0.0,2023-07-14,-81.0


In [42]:
final_call_log[final_call_log['mute'] < -5] # mute 0보다 작은거 확인

Unnamed: 0,start,agreement,question,answer,pnum,total_duration,complain,start_second,active_duration,in/out,matching,end,mute,date,break


In [43]:
call_log_content.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17904 entries, 813 to 13522
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   start           17904 non-null  datetime64[ns]
 1   agreement       17904 non-null  int64         
 2   question        17904 non-null  object        
 3   answer          17215 non-null  object        
 4   pnum            17904 non-null  int64         
 5   total_duration  17896 non-null  float64       
 6   complain        17904 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(2)
memory usage: 1.1+ MB


In [44]:
call_log_time.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18434 entries, 16846 to 52
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   start_second     18434 non-null  datetime64[ns]
 1   pnum             18434 non-null  int64         
 2   active_duration  18434 non-null  float64       
 3   in/out           18434 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 720.1 KB


In [45]:
final_call_log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18422 entries, 0 to 18421
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   start            17893 non-null  datetime64[ns]
 1   agreement        17893 non-null  float64       
 2   question         17893 non-null  object        
 3   answer           17893 non-null  object        
 4   pnum             18422 non-null  int64         
 5   total_duration   18422 non-null  float64       
 6   complain         17893 non-null  float64       
 7   start_second     18422 non-null  datetime64[ns]
 8   active_duration  18422 non-null  float64       
 9   in/out           18422 non-null  float64       
 10  matching         18422 non-null  int64         
 11  end              18422 non-null  datetime64[ns]
 12  mute             18422 non-null  float64       
 13  date             18422 non-null  object        
 14  break            18422 non-null  float

#### Save

In [419]:
final_call_log.to_csv(os.path.join(CALL_LOG_DEST_PATH,"CALL_LOG" ,'call_log.csv'),index=False)