In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# 한글 폰트 설정 (Windows의 경우)
plt.rcParams['font.family'] = 'NanumGothic'
plt.rcParams['axes.unicode_minus'] = False  # 마이너스 기호 깨짐 방지

In [22]:
train_raw = pd.read_csv('../data/train_raw.csv')
train_raw

# "중립 및 불만족"은 단순히 "만족하지 않음" 으로 간주하고 0으로 변환
# "만족"은 1로 변환
train_raw['satisfaction_binary'] = (
    train_raw['satisfaction'] == 'satisfied'
).astype(int)
train_raw[['satisfaction', 'satisfaction_binary']].head()

Unnamed: 0,satisfaction,satisfaction_binary
0,neutral or dissatisfied,0
1,neutral or dissatisfied,0
2,satisfied,1
3,neutral or dissatisfied,0
4,satisfied,1


In [23]:
# 모든 서비스 변수
service_cols = [
    'Inflight wifi service',
    'Departure/Arrival time convenient',
    'Ease of Online booking',
    'Gate location',
    'Food and drink',
    'Online boarding',
    'Seat comfort',
    'Inflight entertainment',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Inflight service',
    'Cleanliness'
]

# 클래스와 이동 거리 컬럼 추출
class_cols = ['Class']
distance_col = ['Flight Distance']


# 서비스 변수와 클래스, 거리 그리고 satisfaction_binary만 존재하는 데이터프레임 생성
service_df = train_raw[service_cols + class_cols + distance_col + ['satisfaction_binary']]
service_df

# 0은 결측치 처리
service_df[service_cols] = service_df[service_cols].replace(0, np.nan)

# 기준: Short(<1000), Medium(1000~3000), Long(3000+)
# 데이터의 단위가 Mile이라고 가정했을 때의 통상적 기준입니다.
bins = [0, 1000, 3000, float('inf')]
labels = ['Short', 'Medium', 'Long']
service_df['Distance_Category'] = pd.cut(service_df['Flight Distance'], bins=bins, labels=labels)


# 그룹 정의
# Long에서 Eco, Eco plus의 수는 매우 적으므로 제외
groups = {
    'Biz_Long': service_df[(service_df['Class'] == 'Business') & (service_df['Distance_Category'] == 'Long')],
    'Biz_Medium': service_df[(service_df['Class'] == 'Business') & (service_df['Distance_Category'] == 'Medium')],
    'Biz_Short': service_df[(service_df['Class'] == 'Business') & (service_df['Distance_Category'] == 'Short')],
    'Eco-plus_Medium': service_df[(service_df['Class'] == 'Eco Plus') & (service_df['Distance_Category'] == 'Medium')],
    'Eco-plus_Short': service_df[(service_df['Class'] == 'Eco Plus') & (service_df['Distance_Category'] == 'Short')],
    'Eco_Medium': service_df[(service_df['Class'] == 'Eco') & (service_df['Distance_Category'] == 'Medium')],
    'Eco_Short': service_df[(service_df['Class'] == 'Eco') & (service_df['Distance_Category'] == 'Short')]
}

# 각 그룹 별로 인원수 출력
for group_name, group_df in groups.items():
    print(f"{group_name}: {len(group_df)}명")

service_by_class = {}
for cls in service_df['Class'].unique():
    service_by_class[cls] = service_df[service_df['Class'] == cls]

# Eco -> Eco Plus -> Business 순서 정렬
service_by_class = dict(sorted(service_by_class.items(), key=lambda x: ['Eco', 'Eco Plus', 'Business'].index(x[0])))    

Biz_Long: 8204명
Biz_Medium: 22737명
Biz_Short: 18724명
Eco-plus_Medium: 1747명
Eco-plus_Short: 5735명
Eco_Medium: 11090명
Eco_Short: 35606명


In [33]:
print("=" * 80)
print("분석 대상 그룹 및 샘플 수")
print("=" * 80)

group_info = []
for group_name, group_df in groups.items():
    satisfaction_rate = group_df['satisfaction_binary'].mean()
    group_info.append({
        '그룹': group_name,
        '샘플 수': f"{len(group_df):,}명",
        '만족도': f"{satisfaction_rate:.2%}"
    })

group_info_df = pd.DataFrame(group_info)
print(group_info_df.to_string(index=False))
print()

분석 대상 그룹 및 샘플 수
             그룹    샘플 수    만족도
       Biz_Long  8,204명 77.88%
     Biz_Medium 22,737명 73.24%
      Biz_Short 18,724명 61.09%
Eco-plus_Medium  1,747명 19.92%
 Eco-plus_Short  5,735명 26.03%
     Eco_Medium 11,090명 16.54%
      Eco_Short 35,606명 19.27%



In [26]:
def perform_ttest_between_groups(group1_name, group1_df, group2_name, group2_df, service_cols):
    """
    두 그룹 간 서비스 점수의 평균 차이를 t-test로 검정
    """
    print(f"\n{'=' * 80}")
    print(f"T-TEST: [{group1_name}] vs [{group2_name}]")
    print(f"샘플 수: {len(group1_df):,}명 vs {len(group2_df):,}명")
    print(f"{'=' * 80}")
    
    ttest_results = []
    
    for service in service_cols:
        # 결측치 제거
        data1 = group1_df[service].dropna()
        data2 = group2_df[service].dropna()
        
        if len(data1) < 30 or len(data2) < 30:
            continue
        
        # Two-sample t-test (Welch's t-test)
        t_stat, p_value = stats.ttest_ind(data1, data2, equal_var=False)
        
        # 평균 및 표준편차
        mean1 = data1.mean()
        mean2 = data2.mean()
        std1 = data1.std()
        std2 = data2.std()
        mean_diff = mean2 - mean1
        
        # 효과 크기 (Cohen's d)
        pooled_std = np.sqrt((std1**2 + std2**2) / 2)
        cohens_d = mean_diff / pooled_std if pooled_std > 0 else 0
        
        # 유의성
        if p_value < 0.001:
            sig = '***'
        elif p_value < 0.01:
            sig = '**'
        elif p_value < 0.05:
            sig = '*'
        else:
            sig = 'ns'
        
        ttest_results.append({
            '서비스': service,
            f'{group1_name} 평균': mean1,
            f'{group2_name} 평균': mean2,
            '평균 차이': mean_diff,
            "Cohen's d": cohens_d,
            't-statistic': t_stat,
            'p-value': p_value,
            '유의성': sig
        })
    
    results_df = pd.DataFrame(ttest_results)
    
    if len(results_df) == 0:
        print("   (충분한 데이터 없음)")
        return results_df
    
    results_df = results_df.sort_values('p-value')
    
    # 유의미한 차이만 출력
    significant = results_df[results_df['p-value'] < 0.05]
    
    if len(significant) > 0:
        print(f"\n유의미한 차이가 있는 서비스 (p < 0.05, Top 10):")
        display_cols = ['서비스', '평균 차이', "Cohen's d", 'p-value', '유의성']
        print(significant[display_cols].head(10).to_string(index=False))
    else:
        print("\n   (유의미한 차이 없음)")
    
    return results_df

In [27]:
print("\n" + "=" * 80)
print("T-TEST 분석 시작")
print("=" * 80)

comparisons = [
    ('Eco_Short', 'Biz_Short'),
    ('Eco_Medium', 'Biz_Medium'),
    ('Eco_Short', 'Eco-plus_Short'),
    ('Eco_Medium', 'Eco-plus_Medium'),
    ('Biz_Short', 'Biz_Medium'),
    ('Biz_Medium', 'Biz_Long'),
]

all_ttest_results = {}

for group1_name, group2_name in comparisons:
    if group1_name in groups and group2_name in groups:
        result = perform_ttest_between_groups(
            group1_name, groups[group1_name],
            group2_name, groups[group2_name],
            service_cols
        )
        if len(result) > 0:
            all_ttest_results[f"{group1_name}_vs_{group2_name}"] = result



T-TEST 분석 시작

T-TEST: [Eco_Short] vs [Biz_Short]
샘플 수: 35,606명 vs 18,724명

유의미한 차이가 있는 서비스 (p < 0.05, Top 10):
                              서비스     평균 차이  Cohen's d       p-value 유의성
                  Online boarding  0.701325   0.571062  0.000000e+00 ***
                 Baggage handling  0.432839   0.372996  0.000000e+00 ***
                 Leg room service  0.474623   0.371151  0.000000e+00 ***
                 On-board service  0.543416   0.433215  0.000000e+00 ***
                 Inflight service  0.421538   0.365871  0.000000e+00 ***
                     Seat comfort  0.448323   0.338377 1.599311e-307 ***
                  Checkin service  0.395917   0.317633 4.499012e-275 ***
           Inflight entertainment  0.393163   0.294801 5.402860e-234 ***
           Ease of Online booking  0.309923   0.242330 6.335729e-144 ***
Departure/Arrival time convenient -0.314036  -0.229830 1.208455e-130 ***

T-TEST: [Eco_Medium] vs [Biz_Medium]
샘플 수: 11,090명 vs 22,737명

유의미한 차이가 있는 서비스 (p < 

In [30]:
def perform_logistic_regression(group_name, group_df, service_cols):
    """
    특정 그룹에 대한 로지스틱 회귀분석
    """
    # 결측치 제거
    data_clean = group_df[service_cols + ['satisfaction_binary']].dropna()
    
    # 샘플 수 체크
    if len(data_clean) < 100:
        print(f"\n[{group_name}] 샘플 수 부족 ({len(data_clean)}명) - 분석 스킵\n")
        return None
    
    # X, y 분리
    X = data_clean[service_cols]
    y = data_clean['satisfaction_binary']
    
    # 표준화
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=service_cols)
    
    # Logistic Regression
    model = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs')
    model.fit(X_scaled_df, y)
    
    # 계수 추출
    coefficients = model.coef_[0]
    
    # P-value 계산 (Wald test 근사)
    n = len(y)
    k = len(service_cols)
    
    predictions = model.predict_proba(X_scaled_df)[:, 1]
    residuals = y - predictions
    mse = np.sum(residuals**2) / (n - k)
    
    # Variance-Covariance Matrix
    try:
        var_covar = np.linalg.inv(X_scaled_df.T @ X_scaled_df) * mse
        std_errors = np.sqrt(np.diag(var_covar))
        
        # Z-score and P-value
        z_scores = coefficients / std_errors
        p_values = [2 * (1 - stats.norm.cdf(abs(z))) for z in z_scores]
    except:
        # 역행렬 계산 실패 시
        std_errors = np.zeros(len(coefficients))
        z_scores = np.zeros(len(coefficients))
        p_values = [1.0] * len(coefficients)
    
    # 결과 DataFrame
    results = pd.DataFrame({
        'Feature': service_cols,
        'Coefficient': coefficients,
        'Std_Error': std_errors,
        'Z_score': z_scores,
        'P_value': p_values,
        'Abs_Coefficient': np.abs(coefficients)
    })
    
    results['Significance'] = results['P_value'].apply(
        lambda p: '***' if p < 0.001 else ('**' if p < 0.01 else ('*' if p < 0.05 else 'ns'))
    )
    
    results = results.sort_values('Abs_Coefficient', ascending=False)
    
    print(f"\n{'='*80}")
    print(f"[{group_name}] Regression Results")
    print(f"샘플 수: {len(data_clean):,}명 | 만족도: {y.mean():.2%}")
    print(f"{'='*80}")
    
    return results

In [31]:
# 모든 그룹에 대해 Regression 수행
print("\n" + "=" * 80)
print("LOGISTIC REGRESSION 분석 시작")
print("=" * 80)

all_regression_results = {}

# 그룹 순서 정의 (Eco -> Eco Plus -> Business, 각각 Short -> Medium -> Long)
group_order = [
    'Eco_Short',
    'Eco_Medium',
    'Eco-plus_Short',
    'Eco-plus_Medium',
    'Biz_Short',
    'Biz_Medium',
    'Biz_Long'
]

for group_name in group_order:
    if group_name in groups:
        result = perform_logistic_regression(group_name, groups[group_name], service_cols)
        
        if result is not None:
            all_regression_results[group_name] = result
            
            print("\nTop 5 영향력 있는 서비스:")
            top5 = result.head(5)
            display_cols = ['Feature', 'Coefficient', 'P_value', 'Significance']
            print(top5[display_cols].to_string(index=False))
            
            print("\n통계적으로 유의하지 않은 서비스 (p > 0.05):")
            non_sig = result[result['P_value'] > 0.05]
            if len(non_sig) > 0:
                print(non_sig[['Feature', 'Coefficient', 'P_value']].to_string(index=False))
            else:
                print("   (모든 변수가 유의미함)")


LOGISTIC REGRESSION 분석 시작

[Eco_Short] Regression Results
샘플 수: 31,561명 | 만족도: 17.93%

Top 5 영향력 있는 서비스:
               Feature  Coefficient  P_value Significance
 Inflight wifi service     3.311978      0.0          ***
Inflight entertainment     0.461379      0.0          ***
Ease of Online booking    -0.236582      0.0          ***
      Baggage handling    -0.218604      0.0          ***
        Food and drink    -0.191473      0.0          ***

통계적으로 유의하지 않은 서비스 (p > 0.05):
   (모든 변수가 유의미함)

[Eco_Medium] Regression Results
샘플 수: 10,634명 | 만족도: 15.36%

Top 5 영향력 있는 서비스:
                          Feature  Coefficient  P_value Significance
            Inflight wifi service     3.130443      0.0          ***
Departure/Arrival time convenient    -0.225079      0.0          ***
                  Online boarding     0.224827      0.0          ***
                 Inflight service    -0.143619      0.0          ***
           Inflight entertainment     0.133244      0.0          ***

통계적

In [32]:
print("\n" + "=" * 80)
print("그룹별 Top 3 서비스 요약")
print("=" * 80)

summary_data = []

for group_name in group_order:
    if group_name not in all_regression_results:
        continue
    
    result = all_regression_results[group_name]
    top3 = result[result['P_value'] < 0.05].head(3)
    
    if len(top3) > 0:
        summary_data.append({
            '그룹': group_name,
            '1순위': f"{top3.iloc[0]['Feature']} ({top3.iloc[0]['Coefficient']:.3f})",
            '2순위': f"{top3.iloc[1]['Feature']} ({top3.iloc[1]['Coefficient']:.3f})" if len(top3) > 1 else '-',
            '3순위': f"{top3.iloc[2]['Feature']} ({top3.iloc[2]['Coefficient']:.3f})" if len(top3) > 2 else '-',
        })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))


그룹별 Top 3 서비스 요약
             그룹                           1순위                                        2순위                                        3순위
      Eco_Short Inflight wifi service (3.312)             Inflight entertainment (0.461)            Ease of Online booking (-0.237)
     Eco_Medium Inflight wifi service (3.130) Departure/Arrival time convenient (-0.225)                    Online boarding (0.225)
 Eco-plus_Short Inflight wifi service (2.924)             Inflight entertainment (0.449) Departure/Arrival time convenient (-0.423)
Eco-plus_Medium Inflight wifi service (2.810)                    Online boarding (0.671)             Inflight entertainment (0.427)
      Biz_Short       Online boarding (1.346) Departure/Arrival time convenient (-0.723)             Inflight entertainment (0.683)
     Biz_Medium       Online boarding (0.980)                   Leg room service (0.596)                   On-board service (0.520)
       Biz_Long       Checkin service (0.794)             