In [6]:
import pandas as pd
import os
import glob
import seaborn as sns
import matplotlib.pyplot as plt

In [7]:
def merge_and_visualize_diff_files(output_file='combined_diff.csv'):
    # 현재 디렉토리에서 *_diff.csv 파일들 검색
    diff_files = glob.glob("*_diff.csv")
    
    # 모든 diff 파일을 읽어서 하나의 DataFrame으로 병합
    combined_df = pd.concat([pd.read_csv(file) for file in diff_files], ignore_index=True)
    
    # 병합된 결과를 CSV 파일로 저장
    combined_df.to_csv(output_file, index=False)
    print(f'모든 diff 파일이 {output_file}로 병합되었습니다.')

    # 시각화
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=combined_df, x='feature_value_x', y='feature_value_y', hue='critical_score', palette='coolwarm')
    plt.title('Feature Value Comparison')
    plt.xlabel('Feature Value (x)')
    plt.ylabel('Guideline Value (y)')
    plt.grid(True)
    plt.show()



In [8]:
# 사용 예시
merge_and_visualize_diff_files()

모든 diff 파일이 combined_diff.csv로 병합되었습니다.


ValueError: Could not interpret value `feature_value_y` for parameter `y`

<Figure size 864x576 with 0 Axes>

In [9]:
def get_feature(item):
    
    feature_mapping = {
        'feature': [1, 9, 11, 2, 3, 7, 6, 8, 10, 4, 5, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
        'new_feature': [11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 10, 16, 17, 32],
        'back': [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        'lv1': [1,2,2,3,3,4,5,5,5,6,6,7,7,7,7,8,8,8,9,1,4,4,9],
    }

    df2 = pd.DataFrame(feature_mapping)
    subset = df2[df2['new_feature'] == item]

    feature_value = subset.iloc[0]['feature']
    back_value = subset.iloc[0]['back']
    return int(feature_value), int(back_value)


In [19]:
def modify(input_csv, output_csv):
    # CSV 파일 읽기
    df = pd.read_csv(input_csv)

    # 'item' 열 이름을 'feature'로 변경
    df.rename(columns={'item': 'feature'}, inplace=True)
    
    df = df[df['feature'] >= 10]

    # 모든 행을 순회하면서 수정
    for index, row in df.iterrows():
        item = row['feature']  # 'item'이 'feature'로 바뀌었음
        suggestion_value = row['suggestion']
        observation_value = row['observation']

        
        # feature와 back 값 가져오기
        feature, back = get_feature(item)
        

        # 'feature' 열 업데이트
        df.loc[index, 'feature'] = feature

        # Back 처리
        if back == 1:
            df.loc[index, 'suggestion'] = 8 - suggestion_value
            df.loc[index, 'observation'] = 8 - observation_value

        # -1 ~ 1 정규화
        df.loc[index, 'suggestion'] = (df.loc[index, 'suggestion'] - 4) / 3.0
        df.loc[index, 'observation'] = (df.loc[index, 'observation'] - 4) / 3.0

    # 수정된 데이터 저장
    df.to_csv(output_csv, index=False)
    print(f"Modified CSV saved as: {output_csv}")

In [18]:
modify('merged_c1.csv','fast1.csv')

4
4
1
1
23
23
4
4
5
5
10
10
12
12
13
13
15
15
16
16
17
17
18
18
10
10
12
12
13
13
15
15
16
16
17
17
18
18
19
19
23
23
20
20
1
1
23
23
23
23
4
4
5
5
19
19
23
23
23
23
20
20
20
20
1
1
9
9
11
11
13
13
15
15
18
18
19
19
23
23
23
23
23
23
15
15
19
19
23
23
23
23
11
11
15
15
6
6
22
22
6
6
13
13
23
23
23
23
23
23
23
23
23
23
23
23
4
4
23
23
11
11
6
6
19
19
10
10
10
10
1
1
15
15
23
23
13
13
15
15
23
23
19
19
13
13
19
19
23
23
23
23
23
23
10
10
23
23
23
23
23
23
10
10
23
23
19
19
23
23
Modified CSV saved as: fast1.csv


In [20]:
def validate_annotations(annotation_file, combined_diff_file, output_file='validation_result.csv'):
    # CSV 파일 읽기
    annotation_df = pd.read_csv(annotation_file)
    combined_diff_df = pd.read_csv(combined_diff_file)
    
    # 결과 저장 리스트
    validation_results = []
    
    # 각 annotation row에 대해 비교 수행
    for _, anno_row in annotation_df.iterrows():
        pid = anno_row['pid']
        feature = anno_row['feature']
        suggestion = anno_row['suggestion']
        observation = anno_row['observation']
        start_measure = anno_row['start_measure']
        end_measure = anno_row['end_measure']
        
        # 관련된 combined_diff 행 필터링
        relevant_rows = combined_diff_df[
            (combined_diff_df['pid'] == pid) &
            (combined_diff_df['feature'] == feature) &
            (combined_diff_df['measure'].between(start_measure, end_measure))
        ]
        
        for _, diff_row in relevant_rows.iterrows():
            measure = diff_row['measure']
            guideline_value = diff_row['guideline_value']
            feature_value = diff_row['feature_value']
            critical_score = diff_row['critical_score']
            difference = diff_row['difference']
            
            # 검증 결과 저장
            validation_results.append({
                'pid': pid,
                'measure': measure,
                'feature': feature,
                'suggestion': suggestion,
                'guideline_value': guideline_value,
                'suggestion_diff': suggestion - guideline_value,
                'observation': observation,
                'feature_value': feature_value,
                'observation_diff': observation - feature_value,
                'critical_score': critical_score,
                'difference': difference
            })
    
    # 결과를 데이터프레임으로 변환
    result_df = pd.DataFrame(validation_results)
    result_df.to_csv(output_file, index=False)
    print(f'Validation results saved to {output_file}')


In [21]:
# 사용 예시
validate_annotations('fast1.csv', 'combined_diff.csv')

Validation results saved to validation_result.csv
