# 아이펠톤에서 사용할 데이터 생성하는 노트북

### 발생한 이슈
- 같은 텍스트를 넣었는데, openai api에서 포함해야 하는 텍스트를 제외하는 일이 발생함 -> 문장유사도를 판단하여, 유사한 문장을 포함하지 않으면 제외하는 방식 사용

### 코드 흐름
- 데이터를 한줄씩 가져옴
- 포함해야 하는 컬럼의 데이터를 openai api 에 넣어 문장을 생성함
- 포함해야 하는 컬럼의 데이터를 문장별로 나눔
- 생성된 문장 내에 '포함해야 하는 문장'들이 존재하는지 문장유사도를 사용하여 확인
- 문장유사도 비율이 낮으면 중단하고 저장, 원인 확인

In [29]:
import os
# from dotenv import load_dotenv
from openai import OpenAI
import pandas as pd
import nltk
from difflib import SequenceMatcher

# NLTK 패키지에서 Punkt tokenizer를 다운로드 (한 번만 실행)
# nltk.download('punkt')

In [48]:
# Sentence-similarity comparison
def similar(a, b):
    """Return the difflib similarity ratio between ``a`` and ``b``.

    Args:
        a: First sequence (a sentence string in this notebook).
        b: Second sequence to compare against ``a``.

    Returns:
        The value of ``SequenceMatcher.ratio()`` for the two inputs:
        1.0 for identical sequences, 0.0 when nothing matches.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# Compare one of the "sentences that must be included" against every sentence of
# the generated long text and return the single most similar sentence.
def find_similar_sentences(base_sentence, long_text, threshold=0):
    """Find the sentence of ``long_text`` that is most similar to ``base_sentence``.

    Args:
        base_sentence: The sentence that is expected to appear in the long text.
        long_text: The generated text; it is split into sentences with
            ``nltk.sent_tokenize``.
        threshold: Minimum similarity a sentence needs to be considered.

    Returns:
        A tuple ``(base, long, similarity)``: the base sentence, the best
        matching sentence, and its similarity. When no sentence has a
        similarity that is both at least ``threshold`` and greater than 0,
        the result is ``("", "", 0)``.
    """
    best_sentence = ""
    best_score = 0
    found = False

    for candidate in nltk.sent_tokenize(long_text):
        score = similar(base_sentence, candidate)
        # Skip anything below the threshold or not strictly better than the
        # current best (so the first of several equal scores wins).
        if score < threshold or score <= best_score:
            continue
        best_sentence = candidate
        best_score = score
        found = True

    matched_base = base_sentence if found else ""
    return matched_base, best_sentence, best_score

In [31]:
# api함수 

In [32]:
# load_dotenv()
# SECURITY: an OpenAI secret key used to be hard-coded on this line. A key that
# has been committed to a notebook must be treated as leaked: revoke it and
# provide a fresh one through the environment instead (for example
# `export OPENAI_API_KEY=...` or a git-ignored .env file loaded with dotenv).
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it (or load it from a .env file) "
        "before creating the OpenAI client."
    )

# OpenAI() is called without arguments, so the key is taken from the
# OPENAI_API_KEY environment variable checked above.
client = OpenAI()

### 기존에 생성이 잘 된 데이터를 이용하여, 문장 유사도가 어느정도 되어야 하는지 확인

In [33]:
# Load the already generated (and cleaned) data, used to check how high the
# sentence similarity should be for a good generation.
raw = pd.read_csv("data/c2d2_refined_0_500_cleaned.csv")
# Notebook display: preview the first rows.
raw.head()

Unnamed: 0,Num,Scenario,Thought,Label,Refined_Thought,Cleaned_Refined_Thought
0,1,"I'm an introverted person, and I've just arriv...",Are the people in this environment unfriendly?,Overgeneralization,"Sure, I can help with that. Here's a possible ...","I'm an introverted person, and I've just arriv..."
1,2,"Recently, I feel dizzy sometimes when I stand ...",I'm so dizzy. Am I sick? I should probably go ...,No Distortion,"Recently, I feel dizzy sometimes when I stand ...","Recently, I feel dizzy sometimes when I stand ..."
2,3,"I'm walking down the street and feel hungry, b...","I'm tired and there's no place to rest, I'm hu...",Overgeneralization,"I'm walking down the street and feel hungry, b...","I'm walking down the street and feel hungry, b..."
3,4,"Work has been busy lately, but I have caught a...",Why did I catch a cold at this time? I feel so...,No Distortion,"Work has been busy lately, but I have caught a...","Work has been busy lately, but I have caught a..."
4,5,My mom and I are discussing future plans. She ...,"Mom is trying to control my life again, wantin...",Fortune-telling,My mom and I are discussing future plans. She ...,My mom and I are discussing future plans. She ...


In [34]:
# Split the scenario and the thought of every row into sentences with NLTK
# (an earlier version split on '. ' instead), then concatenate both lists into
# the full list of sentences the generated text has to contain.
scenario_sentences = raw['Scenario'].apply(nltk.sent_tokenize)
thought_sentences = raw['Thought'].apply(nltk.sent_tokenize)

raw['Scenario_Sentences'] = scenario_sentences
raw['Thought_Sentences'] = thought_sentences
# Element-wise "+" on two columns of lists joins the lists row by row.
raw['all_Sentences'] = scenario_sentences + thought_sentences
raw['all_Sentences']

0      [I'm an introverted person, and I've just arri...
1      [Recently, I feel dizzy sometimes when I stand...
2      [I'm walking down the street and feel hungry, ...
3      [Work has been busy lately, but I have caught ...
4      [My mom and I are discussing future plans., Sh...
                             ...                        
495    [Ir parents are dissatisfied with I because I ...
496    [When someone says I look unhappy, it's as if ...
497    [I are pushed to the point of explosion by Ir ...
498    [I were preparing to run for president of the ...
499    [The takeaway order was stolen., Who is so wic...
Name: all_Sentences, Length: 500, dtype: object

In [35]:
# Inspect one sample row: scenario sentences, thought sentences, and both combined.
# The prints are separate statements on purpose: chaining them with commas builds
# a tuple of None values that the notebook echoes as "(None, None, None)".
sample_row = raw.iloc[2]
for column in ('Scenario_Sentences', 'Thought_Sentences', 'all_Sentences'):
    print(sample_row[column])

["I'm walking down the street and feel hungry, but there's no restaurant around."]
["I'm tired and there's no place to rest, I'm hungry and there's no place to eat.", 'Why do unfortunate things always happen to me?']
["I'm walking down the street and feel hungry, but there's no restaurant around.", "I'm tired and there's no place to rest, I'm hungry and there's no place to eat.", 'Why do unfortunate things always happen to me?']


(None, None, None)

In [64]:
# Build the similarity report: for every row, each sentence in 'all_Sentences'
# is compared against the text in 'Cleaned_Refined_Thought'.
#
# Records are collected in plain lists and turned into dataframes once at the
# end; calling pd.concat inside the loop re-copies the accumulated frame on
# every iteration.
summary_records = []  # one record per raw row; per-sentence results kept as lists
detail_records = []   # one record per sentence (flat layout, easier to scan for a threshold)

for _, row in raw.iterrows():
    long_text = row['Cleaned_Refined_Thought']

    # (base sentence, best matching sentence, similarity) for every required sentence.
    matches = [
        find_similar_sentences(base_text, long_text)
        for base_text in row['all_Sentences']
    ]
    base_list = [base for base, _, _ in matches]
    long_list = [matched for _, matched, _ in matches]
    similarity_list = [similarity for _, _, similarity in matches]

    # Average of the per-sentence similarities; 0 when the row has no sentences.
    if similarity_list:
        average_similarity = sum(similarity_list) / len(similarity_list)
    else:
        average_similarity = 0

    summary_records.append({
        'base': base_list,
        'long': long_list,
        'similarity': similarity_list,
        'average_similarity': average_similarity,
    })

    # Flat version: the row average is repeated on every sentence of the row.
    detail_records.extend(
        {
            'base': base,
            'long': matched,
            'similarity': similarity,
            'average_similarity': average_similarity,
        }
        for base, matched, similarity in matches
    )

# Passing the column list keeps the columns present even if there are no records.
report_columns = ['base', 'long', 'similarity', 'average_similarity']
result_df = pd.DataFrame(summary_records, columns=report_columns)
result_df_2 = pd.DataFrame(detail_records, columns=report_columns)


In [61]:
# Copy the per-row comparison results onto the raw dataframe.
for column in ('base', 'long', 'similarity', 'average_similarity'):
    raw[column] = result_df[column]

# Write both the comparison table and the enriched raw data to CSV (no index column).
result_df.to_csv('data/Sentence_similarity_comparison.csv', index=False)
raw.to_csv('data/c2d2_0_500_similarity.csv', index=False)

print(result_df)

Unnamed: 0,base,long,similarity,average_similarity
0,"[I'm an introverted person, and I've just arri...","[I'm an introverted person, and I've just arri...","[1.0, 1.0]",1.000000
1,"[Recently, I feel dizzy sometimes when I stand...","[Recently, I feel dizzy sometimes when I stand...","[1.0, 1.0, 1.0, 1.0]",1.000000
2,"[I'm walking down the street and feel hungry, ...","[I'm walking down the street and feel hungry, ...","[1.0, 1.0, 1.0]",1.000000
3,"[Work has been busy lately, but I have caught ...","[Work has been busy lately, but I have caught ...","[1.0, 1.0, 1.0, 1.0, 0.6129032258064516]",0.922581
4,"[My mom and I are discussing future plans., Sh...","[My mom and I are discussing future plans., Sh...","[1.0, 1.0, 1.0, 1.0]",1.000000
...,...,...,...,...
495,[Ir parents are dissatisfied with I because I ...,[My parents are dissatisfied with me because I...,"[0.9548387096774194, 1.0]",0.977419
496,"[When someone says I look unhappy, it's as if ...","[When someone says I look unhappy, it's as if ...","[0.976, 1.0, 1.0, 1.0]",0.994000
497,[I are pushed to the point of explosion by Ir ...,[I are pushed to the point of explosion by my ...,"[0.9809523809523809, 1.0, 1.0]",0.993651
498,[I were preparing to run for president of the ...,[I were preparing to run for president of the ...,"[1.0, 1.0, 1.0, 1.0]",1.000000


In [66]:
# Save the flat comparison table (result_df_2) to CSV without the index column.
result_df_2.to_csv('data/Sentence_similarity_comparison_easy.csv', index=False)

# Print the table for a quick visual check.
print(result_df_2)

                                                   base  \
0     I'm an introverted person, and I've just arriv...   
1        Are the people in this environment unfriendly?   
2     Recently, I feel dizzy sometimes when I stand ...   
3                                         I'm so dizzy.   
4                                            Am I sick?   
...                                                 ...   
1481        It turns out that I was too self-righteous.   
1482  I am just a useless person who can't accomplis...   
1483                     The takeaway order was stolen.   
1484                                  Who is so wicked!   
1485  The family can't afford to eat, so they send t...   

                                                   long  similarity  \
0     I'm an introverted person, and I've just arriv...         1.0   
1        Are the people in this environment unfriendly?         1.0   
2     Recently, I feel dizzy sometimes when I stand ...         1.0   
3      

### 아래 데이터를 생성함
- annotated data (캐글의 심리학자-환자 대화 셋)의 전체 장문, 상담 받는 사람의 생각, 해당 생각의 인지왜곡 type 과 포맷을 맞춤
- (중국어 -> 영어로 번역된) C2D2 데이터셋의 시나리오+thought (인지 왜곡된 생각)을 맞춰 장문을 만들어냄
- 선행연구 C2d2 dataset: A resource for the cognitive distortion analysis and its impact on mental health. 에서 사용된 것

In [4]:
# Load the source CSV for data generation (presumably the English-translated
# C2D2 dataset described in the notes above — confirm).
# NOTE: this rebinds `raw`, replacing the dataframe loaded earlier in the notebook.
raw = pd.read_csv("raw_data/Final_Modified_C2D20911.csv")
# Notebook display: preview the first rows.
raw.head()

Unnamed: 0,Num,Scenario,Thought,Label
0,1,"I'm an introverted person, and I've just arriv...",Are the people in this environment unfriendly?,Overgeneralization
1,2,"Recently, I feel dizzy sometimes when I stand ...",I'm so dizzy. Am I sick? I should probably go ...,No Distortion
2,3,"I'm walking down the street and feel hungry, b...","I'm tired and there's no place to rest, I'm hu...",Overgeneralization
3,4,"Work has been busy lately, but I have caught a...",Why did I catch a cold at this time? I feel so...,No Distortion
4,5,My mom and I are discussing future plans. She ...,"Mom is trying to control my life again, wantin...",Fortune-telling
