## 콜레스테롤 예측 모델(실패) ##

In [7]:
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import pandas as pd

In [8]:
# 1. Load the cleaned CSV from the local data directory.
file_path = "./data/health_2023_cleaned_final.csv"
df = pd.read_csv(
    file_path,
    sep=',',
    encoding='utf-8',
)

In [9]:
# Reload the CSV, keep only the six columns used by the following cells,
# and drop every row that has a missing value in any of them.
df = (
    pd.read_csv('./data/health_2023_cleaned_final.csv', encoding='utf-8')
    .loc[:, [
        '성별코드',
        '연령대코드(5세단위)',
        '신장(5cm단위)',
        '체중(5kg단위)',
        '허리둘레',
        '총콜레스테롤',
    ]]
    .dropna()
)

In [10]:
# The '총콜레스테롤' column is the regression target; every other remaining
# column is a feature. Both are converted to plain NumPy arrays.
y = df['총콜레스테롤'].to_numpy()
X = df.drop(columns=['총콜레스테롤']).to_numpy()

In [27]:
# Standardize each feature column, then append a trailing axis of length 1 so
# the result has shape (n_samples, n_features, 1). The original note labels
# these axes as (samples, timesteps, channels).
# NOTE(review): the scaler is fit on every row before the train/test split in
# the next cell, so test-set statistics influence the scaling.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)[:, :, None]

In [28]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 4. Define the MLP regressor.
# The preprocessing cell reshapes the features to (samples, features, 1).
# A first Dense layer declared with input_shape=(features,) rejects that
# rank-3 input at fit time, so declare the real per-sample shape and flatten
# it. Flatten leaves an already 2-D (samples, features) input unchanged, so
# this works whether or not the trailing axis is present.
model = tf.keras.Sequential([
    tf.keras.Input(shape=X_train.shape[1:]),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1),  # single linear unit: regression output
])

# Mean squared error is the training loss; MAE is tracked as a readable metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# 5. Train, holding out 20% of the training rows for validation each epoch.
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=32, verbose=1)

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# 1. Load the data.
df = pd.read_csv("./data/health_2023_cleaned_final.csv")

# 2. Derived features.
# NOTE(review): the +2.5 presumably shifts the 5 cm / 5 kg binned values to the
# bin midpoint -- confirm against the dataset's data dictionary.
df['BMI'] = (df['체중(5kg단위)'] + 2.5) / ((df['신장(5cm단위)'] + 2.5) / 100) ** 2
df['허리신장비'] = df['허리둘레'] / (df['신장(5cm단위)'] + 2.5)

# 3. Drop rows with a missing value in any model input or in the target.
# The feature list is defined once and reused below so the two cannot drift apart.
feature_cols = [
    '연령대코드(5세단위)', '성별코드', '신장(5cm단위)', '체중(5kg단위)', '허리둘레',
    '흡연상태', '음주여부', '수축기혈압', '이완기혈압', '식전혈당(공복혈당)',
    'BMI', '허리신장비', '치아우식증유무', '치석',
]
target_col = 'HDL콜레스테롤'
required_cols = feature_cols + [target_col]
df = df.dropna(subset=required_cols)

# 4. Inputs / output. The target is modelled in log1p space.
X = df[feature_cols]
y = np.log1p(df[target_col])

# 5. Split BEFORE scaling so that no test-set statistics reach the scaler.
# With the same random_state the rows assigned to each side are unchanged.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 6. Standardize: fit on the training rows only, then apply to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Full scaled matrix kept under its original name for any cell that still uses it.
X_scaled = scaler.transform(X)

# 7. Define and fit the model.
model = XGBRegressor(n_estimators=200, max_depth=4, learning_rate=0.05, random_state=42)
model.fit(X_train, y_train)

# 8. Predict and evaluate on the original scale (expm1 undoes log1p).
y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test)

print("[HDL콜레스테롤 예측 결과 - XGBoost]")
print(f"R²: {r2_score(y_true, y_pred):.4f}")


[HDL콜레스테롤 예측 결과 - XGBoost]
R²: 0.2285


In [None]:
import tensorflow as tf

# Environment sanity check: print the installed TensorFlow version (the
# original note expects 2.15.1) followed by the names exposed by the package.
print(tf.__version__, dir(tf), sep="\n")

In [None]:
# 5. Predict and evaluate on the held-out split.
# NOTE(review): assumes this cell runs after the XGBoost cell, where `model`
# is trained on log1p(target) and `y_test` is therefore in log space. Both
# are mapped back with expm1 so that MAE/RMSE are in the target's own units
# and R² is on the same scale as the linear baseline it is compared with.
y_pred = np.expm1(model.predict(X_test))
y_true = np.expm1(y_test)

mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true, y_pred)

print("📊 MAE:", mae)
print("📊 RMSE:", rmse)
print("📈 R²:", r2)

## 테스트 ##

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# 1-2. Create the linear-regression baseline and fit it on the training split
# (fit() returns the estimator itself, so the two steps chain).
baseline_model = LinearRegression().fit(X_train, y_train)

# 3. Predict in log space, then undo the log1p transform on both the
# predictions and the held-out targets.
y_pred_log = baseline_model.predict(X_test)
y_pred, y_true = np.expm1(y_pred_log), np.expm1(y_test)

# 4. Score the baseline with R² on the back-transformed values.
r2_linear = r2_score(y_true, y_pred)
print(f"Linear Regression R²: {r2_linear:.4f}")



In [None]:
# Relative change of the model's R² over the linear baseline, in percent.
# `r2` is whatever the evaluation cell last computed for `model`.
# The ratio is undefined when the baseline R² is exactly 0.
r2_xgb = r2
improvement = ((r2_xgb - r2_linear) / abs(r2_linear)) * 100
print(f"XGBoost 성능 향상: {improvement:.1f}%")

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

import platform

# The plot title is in Hangul, so a Korean-capable font is required.
# 'AppleGothic' only exists on macOS; choose a common system font per OS
# instead (the Linux choice assumes NanumGothic is installed).
korean_font_by_os = {'Darwin': 'AppleGothic', 'Windows': 'Malgun Gothic'}
plt.rcParams['font.family'] = korean_font_by_os.get(platform.system(), 'NanumGothic')

# Feature-importance table.
# NOTE(review): assumes `model` is the fitted tree model and `X` the feature
# DataFrame it was built from, so the two sequences line up one-to-one.
importances = model.feature_importances_
feature_names = X.columns

feat_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

print(feat_df)

# Horizontal bars; the y-axis is inverted so the most important feature is on top.
feat_df.plot(kind='barh', x='Feature', y='Importance', figsize=(8, 6), title='HDL 예측에서의 특성 중요도')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()