In [3]:
import os
import json
import glob
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

from collections import Counter
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

# 데이터 전처리

In [None]:
##### Legacy code that creates the per-label CSV files.
##### The '중립' label (and its numeric id 6) has to be edited by hand and the cell re-run for each label.

# --- Training split: export the rows labelled '중립' with the numeric id 6 ---
textdata = pd.read_csv("/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Training_221115_add/total.csv")
textdata.columns = ['age', 'gender', 'label', 'sentence']
print(Counter(textdata['label']))
# drop_duplicates returns a NEW frame; without assigning the result back the
# call is a no-op. keep=False discards every copy of a repeated sentence.
textdata = textdata.drop_duplicates(['sentence'], keep=False)
textdata.loc[textdata['label'] == '중립', 'label'] = 6
print(Counter(textdata['label']))
textdata.loc[textdata['label'] == 6].to_csv("/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Training_221115_add/train/SDD-중립.csv", index=False)

# --- Validation split: same steps, written to the valid/ directory ---
textdata = pd.read_csv("/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Validation_221115_add/total.csv")
textdata.columns = ['age', 'gender', 'label', 'sentence']
# Assign the de-duplicated frame back (see the note in the training section).
textdata = textdata.drop_duplicates(['sentence'], keep=False)
textdata.loc[textdata['label'] == '중립', 'label'] = 6
textdata.loc[textdata['label'] == 6].to_csv("/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Validation_221115_add/valid/SDD-중립.csv", index=False)

In [None]:
##### 정리한 코드 (검증 필요)
def process_data(input_file, output_file, emotion, num):
    """Export the rows of one emotion label to a CSV with a numeric label id.

    The input CSV is read with its first row as the header, and its four
    columns are renamed to ``age, gender, label, sentence``. Rows with any
    missing value are dropped, then every sentence that occurs more than once
    is dropped entirely (``keep=False``). The rows whose label equals
    ``emotion`` are written to ``output_file`` with the label replaced by
    ``num``.

    Args:
        input_file: Path of the source CSV (must have exactly four columns).
        output_file: Path of the CSV to create.
        emotion: Label value to select, e.g. ``'중립'``.
        num: Numeric id written in place of ``emotion``.

    Returns:
        None. The result is written to ``output_file``.
    """
    textdata = pd.read_csv(input_file)
    textdata.columns = ['age', 'gender', 'label', 'sentence']
    textdata = textdata.dropna()
    # drop_duplicates returns a new frame; the previous code discarded it, so
    # duplicated sentences were never removed.
    textdata = textdata.drop_duplicates(['sentence'], keep=False)
    # Select first, then relabel: avoids writing an int into a string column
    # of the full frame.
    selected = textdata.loc[textdata['label'] == emotion].assign(label=num)
    selected.to_csv(output_file, index=False)

# Numeric id assigned to each emotion label (ids follow the listed order, 0..6).
emotion_mapping = dict(zip(
    ['기쁨', '슬픔', '분노', '당황', '불안', '상처', '중립'],
    range(7),
))

# Source CSV and output directory of the training split.
input_dir_train = "/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Training_221115_add/total.csv"
output_dir_train = "/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Training_221115_add/train/"

# Source CSV and output directory of the validation split.
input_dir_valid = "/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Validation_221115_add/total.csv"
output_dir_valid = "/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Validation_221115_add/valid/"

# Write one SDD-{emotion}.csv per label: the whole training split first, then
# the whole validation split. The output directories end with '/', so plain
# concatenation yields the final path.
for source_csv, target_dir in (
    (input_dir_train, output_dir_train),
    (input_dir_valid, output_dir_valid),
):
    for emotion, num in emotion_mapping.items():
        process_data(source_csv, target_dir + "SDD-" + emotion + ".csv", emotion, num)

In [None]:
##### Read the train/valid SDD-{label}.csv pair of every label and write
##### (a) the training rows with repeated sentences removed and
##### (b) the training rows whose sentence also appears in the validation file.

import pandas as pd

labelList = ["기쁨", "슬픔", "분노", "당황", "불안", "상처", "중립"]

for label in labelList:
    # Step 1: Read the two CSV files into dataframes
    file1 = f'/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Training_221115_add/train/SDD-{label}.csv'
    file2 = f'/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/Validation_221115_add/valid/SDD-{label}.csv'

    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Step 2: Inner join on the sentence text -> rows present in both files.
    # Columns other than 'sentence' get the suffixes _x (df1) and _y (df2).
    overlapping_sentences = df1.merge(df2, on='sentence')

    # Step 3: Write df1 with only the first occurrence of each sentence kept.
    # to_csv quotes fields that contain commas, quotes or newlines; the former
    # hand-written f-string rows did not, which corrupted the file for such
    # sentences.
    # NOTE(review): despite the "_non" name, sentences that also occur in the
    # validation file (df2) are NOT filtered out here - confirm this is intended.
    non_overlapping_file = f'/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/{label}_non.csv'
    (
        df1.drop_duplicates(subset='sentence', keep='first')
        [['age', 'gender', 'label', 'sentence']]
        .to_csv(non_overlapping_file, index=False, encoding='utf-8')
    )

    # Step 4: Keep only the df1 side of the overlapping rows
    overlapping_sentences = overlapping_sentences[['age_x', 'gender_x', 'label_x', 'sentence']]

    # Step 5: Write the overlapping rows to their own CSV file
    overlapping_file = f'/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/{label}_중복.csv'
    overlapping_sentences.to_csv(overlapping_file, index=False)


# 구축 데이터 분석

In [8]:
##### Check the newly created {label}_non.csv for repeated sentences (there should be none)

import pandas as pd
from collections import Counter

# Load the de-duplicated file of one label.
df = pd.read_csv('/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/기쁨_non.csv')

# Number of rows per distinct sentence, as a two-column frame (sentence, count).
sentence_counts = df.groupby('sentence').size().rename('count').reset_index()

# Sentences that occur more than once are duplicates.
duplicated_sentences = sentence_counts.loc[sentence_counts['count'].gt(1)]

# An empty frame means the file contains no repeated sentence.
print(duplicated_sentences)


Empty DataFrame
Columns: [sentence, count]
Index: []


In [None]:
##### Check the age-group / gender distribution of the newly created {label}_non.csv

import pandas as pd

# Load the de-duplicated file of one label.
df = pd.read_csv('/media/jeeyoung/8C0CACF80CACDE88/JEE/data_multimodal/emotion/018.감성대화/기쁨_non.csv', sep=',')

# Category values that are counted.
age_groups = ["청소년", "청년", "중년", "노년"]
genders = ["여성", "남성"]

# Row count for every (age group, gender) pair, in age-major order.
counts = {
    (age_group, gender): int(((df['age'] == age_group) & (df['gender'] == gender)).sum())
    for age_group in age_groups
    for gender in genders
}

# Report one line per combination.
for (age_group, gender), count in counts.items():
    print(f"Age group: {age_group}, Gender: {gender}, Count: {count}")