In [5]:
import pandas as pd  # pandas 라이브러리 임포트

In [6]:
# Path to the JSON file
# NOTE(review): hard-coded absolute Windows path — only resolves on the original author's machine.
json_file_path = "C:/SKN_3_MyProject/SKN_03_FINAL/Data/Final/embedding_2.json"
# Read the JSON file into a DataFrame; lines=True parses it as one JSON object per line
df = pd.read_json(json_file_path, lines=True)

In [7]:
def check_target_balance(df):
    """Print the relative frequency of each class in the ``target`` column.

    Args:
        df: DataFrame expected to contain a ``target`` column.

    Returns:
        None. The proportions are written to stdout; if the column is
        missing, a notice is printed instead.
    """
    if 'target' in df.columns:
        # normalize=True turns the raw counts into proportions.
        class_proportions = df['target'].value_counts(normalize=True)
        print("Target 클래스 비율:", class_proportions, sep="\n")
    else:
        print("target 컬럼이 데이터프레임에 없습니다.")


In [8]:
# Print the class proportions of the 'target' column for the loaded DataFrame.
check_target_balance(df)

Target 클래스 비율:
target
1    0.5
0    0.5
Name: proportion, dtype: float64


In [9]:
# Sparsity (zero-ratio) calculation
def calculate_sparsity(df):
    """Report the fraction of entries equal to 0 in every column of ``df``.

    Columns with ``object`` dtype are not scanned for zeros; they are still
    listed, with a sparsity of 0.

    Args:
        df: DataFrame to analyse.

    Returns:
        DataFrame with one row per input column and the columns ``Column``
        and ``Sparsity`` (zero count divided by the number of rows). When
        ``df`` has no rows, every sparsity is NaN.
    """
    row_count = len(df)
    sparsity_report = {}

    for col in df.columns:
        if row_count == 0:
            # The ratio is undefined without rows; report NaN instead of
            # letting 0 / 0 raise ZeroDivisionError for object columns.
            sparsity_report[col] = float('nan')
        elif df[col].dtype == 'object':
            # Object (e.g. string) columns are excluded from zero counting.
            sparsity_report[col] = 0.0
        else:
            sparsity_report[col] = (df[col] == 0).sum() / row_count

    return pd.DataFrame(list(sparsity_report.items()), columns=['Column', 'Sparsity'])

In [10]:
# Run the sparsity (zero-ratio) analysis and print the per-column report
sparsity_report = calculate_sparsity(df)
print(sparsity_report)

                    Column  Sparsity
0                     cast  0.000000
1                    title  0.000000
2                   target  0.500000
3                   editor  0.000000
4                    genre  0.000000
5               percentage  0.002963
6          musical_license  0.984275
7                   period  0.123747
8             ticket_price  0.009572
9                  cast_id  0.002051
10      editor_combined_id  0.004102
11              day_vector  0.020966
12           time_category  0.002051
13    day_time_interaction  0.023017
14   actor_sales_influence  0.001367
15  actor_genre_preference  0.000000


In [11]:
# A column is considered sparse when its most frequent value makes up 90% or
# more of all values. This function only reports the ratio; applying the 90%
# cut-off is left to the reader of the report.
def calculate_mode_sparsity(df):
    """Report the share of the most frequent value in each non-object column.

    Args:
        df: DataFrame to analyse. Columns with ``object`` dtype are skipped.

    Returns:
        DataFrame with columns ``Column`` and ``Mode_Ratio``, one row per
        analysed column, in the original column order.
    """
    rows = [
        (name, df[name].value_counts(normalize=True).max())
        for name in df.columns
        if df[name].dtype != 'object'
    ]
    return pd.DataFrame(rows, columns=['Column', 'Mode_Ratio'])

In [12]:
# Compute and print the most-frequent-value ratio per column
# (rebinds sparsity_report, replacing the previous report).
sparsity_report = calculate_mode_sparsity(df)
print(sparsity_report)

                    Column  Mode_Ratio
0                    title    0.003874
1                   target    0.500000
2                    genre    0.296490
3               percentage    0.007065
4          musical_license    0.984275
5                   period    0.453282
6             ticket_price    0.178669
7                  cast_id    0.011167
8       editor_combined_id    0.069964
9               day_vector    0.407247
10           time_category    0.478806
11    day_time_interaction    0.320191
12   actor_sales_influence    0.178669
13  actor_genre_preference    0.255242


In [13]:
from scipy.stats import entropy

In [14]:
# A low entropy value is taken as a sign of sparse data.
def calculate_entropy(df):
    """Report the entropy of the value distribution of each non-object column.

    The relative frequency of every distinct value is passed to
    ``scipy.stats.entropy`` with its default (natural-log) base.

    Args:
        df: DataFrame to analyse. Columns with ``object`` dtype are skipped.

    Returns:
        DataFrame with columns ``Column`` and ``Entropy``, one row per
        analysed column, in the original column order.
    """
    rows = [
        (name, entropy(df[name].value_counts(normalize=True)))
        for name in df.columns
        if df[name].dtype != 'object'
    ]
    return pd.DataFrame(rows, columns=['Column', 'Entropy'])

# Compute and print the per-column entropy report
# (rebinds sparsity_report, replacing the previous report).
sparsity_report = calculate_entropy(df)
print(sparsity_report)

                    Column   Entropy
0                    title  6.062683
1                   target  0.693147
2                    genre  1.917934
3               percentage  6.019576
4          musical_license  0.080898
5                   period  2.410222
6             ticket_price  3.190974
7                  cast_id  5.742522
8       editor_combined_id  4.108361
9               day_vector  1.815733
10           time_category  1.075659
11    day_time_interaction  2.405915
12   actor_sales_influence  3.194899
13  actor_genre_preference  1.581295


In [15]:
# When every value is unique, as with an ID, the column is unlikely to
# contribute much to a model.
def calculate_cardinality(df):
    """Report the ratio of distinct values to rows for every column.

    Args:
        df: DataFrame to analyse. All columns are included regardless of
            dtype.

    Returns:
        DataFrame with columns ``Column`` and ``Unique_Ratio``
        (``Series.nunique()`` divided by the number of rows). When ``df``
        has no rows, every ratio is NaN.
    """
    row_count = len(df)
    sparsity_report = {}
    for col in df.columns:
        if row_count:
            sparsity_report[col] = df[col].nunique() / row_count
        else:
            # The ratio is undefined without rows; report NaN instead of
            # raising ZeroDivisionError.
            sparsity_report[col] = float('nan')
    return pd.DataFrame(list(sparsity_report.items()), columns=['Column', 'Unique_Ratio'])

# Compute and print the unique-value ratio per column
# (rebinds sparsity_report, replacing the previous report).
sparsity_report = calculate_cardinality(df)
print(sparsity_report)

                    Column  Unique_Ratio
0                     cast      0.077940
1                    title      0.103464
2                   target      0.000456
3                   editor      0.059253
4                    genre      0.003191
5               percentage      0.099818
6          musical_license      0.000456
7                   period      0.017092
8             ticket_price      0.010255
9                  cast_id      0.077940
10      editor_combined_id      0.021650
11              day_vector      0.004330
12           time_category      0.001595
13    day_time_interaction      0.010483
14   actor_sales_influence      0.010483
15  actor_genre_preference      0.001367


In [16]:
# If a column's values all fall in nearly the same range, they can be merged
# into a single value.
def calculate_variance_sparsity(df, threshold=0.01):
    """Flag numeric columns whose values barely vary.

    Despite the function name, the spread measure is the standard deviation
    returned by ``Series.std()``, not the variance; ``threshold`` is compared
    against that standard deviation.

    Args:
        df: DataFrame to analyse. Only columns with a numeric dtype are
            examined; object, string, categorical and datetime-like columns
            are skipped because ``std()`` or the comparison with a float
            threshold is not defined for them.
        threshold: A column is flagged when its standard deviation is
            strictly below this value.

    Returns:
        DataFrame with columns ``Column`` and ``Is_Sparse`` (True when the
        column's standard deviation is below ``threshold``). A NaN standard
        deviation compares as False.
    """
    sparsity_report = {}
    for col in df.columns:
        # A plain `dtype != 'object'` test lets categorical/datetime columns
        # through, which then raise TypeError; test for numeric dtypes.
        if pd.api.types.is_numeric_dtype(df[col]):
            spread = df[col].std()
            sparsity_report[col] = spread < threshold  # True: sparse column
    return pd.DataFrame(list(sparsity_report.items()), columns=['Column', 'Is_Sparse'])

# Flag low-spread columns using the default threshold and print the report
# (rebinds sparsity_report, replacing the previous report).
sparsity_report = calculate_variance_sparsity(df)
print(sparsity_report)

                    Column  Is_Sparse
0                    title      False
1                   target      False
2                    genre      False
3               percentage      False
4          musical_license      False
5                   period      False
6             ticket_price      False
7                  cast_id      False
8       editor_combined_id      False
9               day_vector      False
10           time_category      False
11    day_time_interaction      False
12   actor_sales_influence      False
13  actor_genre_preference      False


In [18]:
# A feature that is independent of the target is unlikely to contribute much
# to model performance.
def calculate_target_correlation(df, target_col):
    """Report the absolute correlation of each numeric feature with the target.

    The correlation is ``Series.corr`` with its default method, wrapped in
    ``abs`` so that only the strength of the relationship is reported.

    Args:
        df: DataFrame holding the features and the target column.
        target_col: Name of the target column; it is excluded from the
            report.

    Returns:
        DataFrame with columns ``Column`` and ``Target_Correlation``, one row
        per numeric feature column, in the original column order.
    """
    correlations = {}
    for col in df.columns:
        # A plain `dtype != 'object'` test lets categorical/datetime columns
        # through, for which corr() is not defined; keep numeric dtypes only.
        if col == target_col or not pd.api.types.is_numeric_dtype(df[col]):
            continue
        correlations[col] = abs(df[col].corr(df[target_col]))
    return pd.DataFrame(list(correlations.items()), columns=['Column', 'Target_Correlation'])

# Compute and print each feature's absolute correlation with the 'target' column.
correlation_report = calculate_target_correlation(df, target_col='target')
print(correlation_report)

                    Column  Target_Correlation
0                    title            0.012703
1                    genre            0.024236
2               percentage            0.026993
3          musical_license            0.078768
4                   period            0.067740
5             ticket_price            0.083649
6                  cast_id            0.001984
7       editor_combined_id            0.000767
8               day_vector            0.091312
9            time_category            0.030476
10    day_time_interaction            0.077473
11   actor_sales_influence            0.088039
12  actor_genre_preference            0.015285
