In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Matplotlib setup: Korean-capable font (Windows) and a minus sign that
# renders correctly when that font is active.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

df = pd.read_csv("../data/train_raw.csv")

# Encode the satisfaction text as a binary label (1 = satisfied).
satisfaction_to_label = {
    'neutral or dissatisfied': 0,
    'satisfied': 1,
}
df['label'] = df['satisfaction'].map(satisfaction_to_label)

In [3]:
# Service rating columns analysed throughout the notebook (order is reused
# as the index of the coefficient Series and the summary table).
service_columns = [
    'Online boarding',
    'Inflight entertainment',
    'Inflight wifi service',
    'Seat comfort',
    'On-board service',
    'Cleanliness',
    'Leg room service',
    'Inflight service',
    'Ease of Online booking',
    'Baggage handling',
    'Checkin service',
    'Food and drink',
    'Gate location',
    'Departure/Arrival time convenient',
]

# A rating of 0 means "service not used", so treat it as missing.
service_ratings = df[service_columns]
df[service_columns] = service_ratings.replace({0: np.nan})

In [4]:
# Missing-value handling: simple imputation with each column's mean.
service_means = df[service_columns].mean()
df[service_columns] = df[service_columns].fillna(service_means)

In [5]:
# Split customers into loyal / disloyal subsets (each an independent copy).
customer_type = df['Customer Type']
loyal = df.loc[customer_type == 'Loyal Customer'].copy()
disloyal = df.loc[customer_type == 'disloyal Customer'].copy()

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [7]:
def _label_correlation(frame):
    """Correlation of every service column with `label`, sorted descending."""
    corr_with_label = frame[service_columns + ['label']].corr()['label']
    return corr_with_label.drop('label').sort_values(ascending=False)


# Service-vs-label correlations for each customer segment.
loyal_corr = _label_correlation(loyal)
disloyal_corr = _label_correlation(disloyal)

# Side-by-side comparison, ordered by the loyal-customer correlation and
# rounded to 4 decimals for display.
corr_compare = (
    pd.DataFrame({
        'Loyal_corr': loyal_corr,
        'Disloyal_corr': disloyal_corr,
    })
    .sort_values('Loyal_corr', ascending=False)
    .round(4)
)

corr_compare


Unnamed: 0,Loyal_corr,Disloyal_corr
Online boarding,0.5368,0.579
Inflight entertainment,0.4611,0.0229
Seat comfort,0.398,0.0114
Leg room service,0.3598,0.0702
Cleanliness,0.355,0.0126
Inflight wifi service,0.3457,0.6275
On-board service,0.3387,0.2109
Baggage handling,0.2648,0.2124
Inflight service,0.2619,0.2068
Food and drink,0.2382,0.0351


In [8]:
def _fit_scaled_logreg(features, target):
    """Fit a StandardScaler -> LogisticRegression pipeline and return it."""
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(max_iter=1000)),
    ])
    pipeline.fit(features, target)
    return pipeline


def _service_coefs(pipeline):
    """Coefficients of the fitted logistic step, indexed by service column."""
    logreg = pipeline.named_steps['logreg']
    return pd.Series(logreg.coef_[0], index=service_columns)


# Loyal customers
X_loyal, y_loyal = loyal[service_columns], loyal['label']
loyal_model = _fit_scaled_logreg(X_loyal, y_loyal)
loyal_coefs = _service_coefs(loyal_model)

# Disloyal customers
X_dis, y_dis = disloyal[service_columns], disloyal['label']
disloyal_model = _fit_scaled_logreg(X_dis, y_dis)
disloyal_coefs = _service_coefs(disloyal_model)


In [14]:
from scipy.stats import ttest_ind
import pandas as pd

def _welch_record(col):
    """Build one summary record comparing loyal vs disloyal scores for `col`."""
    loyal_scores, disloyal_scores = loyal[col], disloyal[col]

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    _stat, p_value = ttest_ind(loyal_scores, disloyal_scores, equal_var=False)

    loyal_mean = loyal_scores.mean()
    disloyal_mean = disloyal_scores.mean()

    return {
        "Service": col,
        "Loyal Mean": round(loyal_mean, 3),
        "Disloyal Mean": round(disloyal_mean, 3),
        "Mean Diff (L - D)": round(loyal_mean - disloyal_mean, 3),
        "p-value": p_value,
        # Interpretation at the 5% significance level.
        "Interpretation": "차이 있음" if p_value < 0.05 else "차이 없음",
    }


# One record per service column, collected into a table sorted by p-value.
rows = [_welch_record(col) for col in service_columns]
ttest_summary = pd.DataFrame(rows).sort_values("p-value")

# Display
ttest_summary


Unnamed: 0,Service,Loyal Mean,Disloyal Mean,Mean Diff (L - D),p-value,Interpretation
0,Online boarding,3.434,2.854,0.58,0.0,차이 있음
3,Seat comfort,3.539,2.994,0.544,0.0,차이 있음
13,Departure/Arrival time convenient,3.287,2.948,0.338,6.7994360000000004e-267,차이 있음
1,Inflight entertainment,3.428,3.048,0.38,4.698190000000001e-255,차이 있음
5,Cleanliness,3.339,3.054,0.284,1.310465e-145,차이 있음
11,Food and drink,3.243,3.039,0.204,3.774928e-76,차이 있음
4,On-board service,3.417,3.228,0.189,7.775089e-75,차이 있음
6,Leg room service,3.4,3.218,0.182,5.564623e-66,차이 있음
10,Checkin service,3.324,3.218,0.105,1.36818e-24,차이 있음
9,Baggage handling,3.618,3.694,-0.076,2.5144500000000004e-17,차이 있음


In [10]:
# Base frame: one row per service column.
summary = pd.DataFrame(index=service_columns)

# Mean scores per segment and their difference (loyal - disloyal).
summary['Loyal_mean'] = loyal[service_columns].mean()
summary['Disloyal_mean'] = disloyal[service_columns].mean()
summary['Mean_diff (L - D)'] = summary['Loyal_mean'] - summary['Disloyal_mean']

# Correlation of each service with the satisfaction label.
summary['Loyal_corr'] = loyal_corr
summary['Disloyal_corr'] = disloyal_corr

# Logistic-regression coefficients.
summary['Loyal_coef'] = loyal_coefs
summary['Disloyal_coef'] = disloyal_coefs

# Welch t-test p-values. The t-test table in this notebook is named
# `ttest_summary` and its column is "p-value"; the previous reference to
# `ttest_df['P-value']` only worked with leftover kernel state and fails
# under Restart & Run All.
summary['ttest_p'] = ttest_summary.set_index('Service')['p-value']

# Order by the loyal-customer coefficient, largest first.
summary = summary.sort_values('Loyal_coef', ascending=False)

summary


Unnamed: 0,Loyal_mean,Disloyal_mean,Mean_diff (L - D),Loyal_corr,Disloyal_corr,Loyal_coef,Disloyal_coef,ttest_p
Online boarding,3.434181,2.853735,0.580446,0.536769,0.579002,1.01096,1.820403,0.0
Inflight wifi service,2.814268,2.811227,0.00304,0.34571,0.627491,0.71862,2.730082,0.7530639
Leg room service,3.399537,3.217849,0.181688,0.359785,0.0702,0.525226,0.056619,5.564623e-66
On-board service,3.416932,3.228228,0.188704,0.338667,0.210937,0.432133,0.287324,7.775089e-75
Checkin service,3.323579,3.218166,0.105414,0.23738,0.222327,0.339569,0.399031,1.36818e-24
Inflight entertainment,3.427988,3.048206,0.379782,0.46106,0.022861,0.330048,0.150077,4.698190000000001e-255
Seat comfort,3.538893,2.994415,0.544478,0.398032,0.011375,0.2569,0.018195,0.0
Cleanliness,3.338688,3.054265,0.284424,0.354974,0.012616,0.1991,-2.240128,1.310465e-145
Gate location,2.973246,2.993309,-0.020063,0.015914,-0.083748,0.190334,-0.13584,0.03165672
Ease of Online booking,2.892964,2.829266,0.063698,0.171897,0.594026,0.18718,-1.99802,6.4418e-11


In [11]:
# Investment-priority Top 3 per segment, ranked by correlation with the label.
top3_loyal = loyal_corr.sort_values(ascending=False).iloc[:3]
top3_disloyal = disloyal_corr.sort_values(ascending=False).iloc[:3]

for header, top3 in (
    ("[상관 기반]Loyal Customer 투자 우선순위 Top 3", top3_loyal),
    ("\n[상관 기반]Disloyal Customer 투자 우선순위 Top 3", top3_disloyal),
):
    print(header)
    print(top3.to_string())


[상관 기반]Loyal Customer 투자 우선순위 Top 3
Online boarding           0.536769
Inflight entertainment    0.461060
Seat comfort              0.398032

[상관 기반]Disloyal Customer 투자 우선순위 Top 3
Inflight wifi service     0.627491
Ease of Online booking    0.594026
Online boarding           0.579002


In [12]:
# Investment-priority Top 3 per segment, ranked by regression coefficient.
top3_loyal_reg = loyal_coefs.sort_values(ascending=False).iloc[:3]
top3_disloyal_reg = disloyal_coefs.sort_values(ascending=False).iloc[:3]

for header, top3 in (
    ("[회귀 기반] Loyal Customer 우선 투자 Top 3", top3_loyal_reg),
    ("\n[회귀 기반] Disloyal Customer 우선 투자 Top 3", top3_disloyal_reg),
):
    print(header)
    print(top3.to_string())


[회귀 기반] Loyal Customer 우선 투자 Top 3
Online boarding          1.010960
Inflight wifi service    0.718620
Leg room service         0.525226

[회귀 기반] Disloyal Customer 우선 투자 Top 3
Inflight wifi service    2.730082
Food and drink           2.132918
Online boarding          1.820403


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# -----------------------------
# 1. Derived grouping variables (AgeGroup, DistanceGroup)
# -----------------------------
age_edges = [0, 19, 29, 39, 49, 59, 69, 150]
age_labels = ['10대', '20대', '30대', '40대', '50대', '60대', '70대+']
df['AgeGroup'] = pd.cut(df['Age'], bins=age_edges, labels=age_labels)

distance_edges = [0, 1500, 3500, 10000]
distance_labels = ['Short', 'Medium', 'Long']
df['DistanceGroup'] = pd.cut(
    df['Flight Distance'], bins=distance_edges, labels=distance_labels
)

# -----------------------------
# 2. Split by Customer Type
# -----------------------------
groups = {
    segment: df[df['Customer Type'] == raw_value]
    for segment, raw_value in [
        ('Loyal', 'Loyal Customer'),
        ('Disloyal', 'disloyal Customer'),
    ]
}

# -----------------------------
# 3. Subgroup definitions: each column maps to the distinct values it takes
#    (includes Type of Travel)
# -----------------------------
subgroup_columns = ['Gender', 'AgeGroup', 'Class', 'DistanceGroup', 'Type of Travel']
sub_conditions = {column: df[column].unique() for column in subgroup_columns}

# -----------------------------
# 4. Regression helper
# -----------------------------
def run_regression(data, service_columns, min_samples=80):
    """Fit a standardized logistic regression of `label` on the service columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain every column in `service_columns` plus a `label` column.
    service_columns : list of str
        Feature columns used as predictors.
    min_samples : int, default 80
        Subsets with fewer rows than this are skipped.

    Returns
    -------
    pandas.Series or None
        Logistic-regression coefficients indexed by service column and sorted
        in descending order, or None when the subset is skipped (too few rows,
        or the label takes fewer than two distinct values).
    """
    # Too little data: skip before touching any columns.
    if len(data) < min_samples:
        return None

    y = data['label']
    # LogisticRegression.fit raises ValueError when the target contains a
    # single class; skip such subsets instead of aborting the caller's loop.
    if y.nunique() < 2:
        return None

    X = data[service_columns]

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('logreg', LogisticRegression(max_iter=2000))
    ])

    model.fit(X, y)
    coefs = pd.Series(model.named_steps['logreg'].coef_[0], index=service_columns)
    return coefs.sort_values(ascending=False)

# -----------------------------
# 5. Deep dive
# Customer Type -> Subgroup -> Regression
# -----------------------------
def _iter_subgroup_frames():
    """Yield (customer type, subgroup column, value, filtered frame) tuples."""
    for ctype, cdf in groups.items():
        for subgroup, values in sub_conditions.items():
            for val in values:
                yield ctype, subgroup, val, cdf[cdf[subgroup] == val]


results = []

for ctype, subgroup, val, sub_df in _iter_subgroup_frames():
    # Skip subsets that are too small to fit.
    if len(sub_df) < 80:
        continue

    coefs = run_regression(sub_df, service_columns)
    if coefs is None:
        continue

    # Record the three largest coefficients as "name (value)" strings.
    record = {
        "Customer Type": ctype,
        "Subgroup": subgroup,
        "Value": val,
    }
    for rank in range(3):
        record[f"Top{rank + 1}"] = f"{coefs.index[rank]} ({coefs.iloc[rank]:.3f})"
    results.append(record)

deep_dive_df = pd.DataFrame(results)
deep_dive_df


Unnamed: 0,Customer Type,Subgroup,Value,Top1,Top2,Top3
0,Loyal,Gender,Male,Online boarding (1.591),Inflight entertainment (0.716),Leg room service (0.362)
1,Loyal,Gender,Female,Leg room service (0.805),Online boarding (0.802),Inflight entertainment (0.475)
2,Loyal,AgeGroup,10대,Online boarding (1.792),Food and drink (1.430),Inflight wifi service (0.558)
3,Loyal,AgeGroup,20대,Online boarding (2.064),Food and drink (1.822),Gate location (0.419)
4,Loyal,AgeGroup,60대,Leg room service (0.947),Online boarding (0.663),Inflight wifi service (0.518)
5,Loyal,AgeGroup,40대,Leg room service (0.957),Online boarding (0.868),Inflight entertainment (0.551)
6,Loyal,AgeGroup,50대,Leg room service (1.069),Online boarding (0.870),Inflight entertainment (0.460)
7,Loyal,AgeGroup,30대,Online boarding (1.300),Inflight entertainment (0.824),On-board service (0.333)
8,Loyal,AgeGroup,70대+,Inflight entertainment (0.839),Leg room service (0.706),Online boarding (0.448)
9,Loyal,Class,Eco Plus,Inflight wifi service (1.552),Inflight entertainment (0.649),Gate location (0.263)
