In [9]:
import pandas as pd
df = pd.read_csv(r'C:\Users\User\LG_Aimers\MainTask\other.csv')

In [11]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# PSO 하이퍼파라미터 설정
NUM_PARTICLES = 69  # 입자의 개수 (즉, 20개의 feature 조합 탐색)
MAX_ITER = 30  # 최대 반복 횟수
W = 0.7  # 관성 계수
C1 = 1.5  # 개인 최적 해 반영 계수
C2 = 1.5  # 전역 최적 해 반영 계수

# 데이터 불러오기 (예시)
df = df  # 실제 데이터셋으로 변경 필요
X = df.drop(columns=["임신 성공 여부"])  # 특징 데이터
y = df["임신 성공 여부"]  # 목표 변수

num_features = X.shape[1]  # 전체 feature 개수

# 훈련/검증 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 입자 초기화
particles = np.random.randint(0, 2, size=(NUM_PARTICLES, num_features))  # 0 또는 1로 초기화
velocities = np.random.uniform(-1, 1, size=(NUM_PARTICLES, num_features))  # 속도 초기화

# 개인 및 전역 최적 해 초기화
p_best_positions = np.copy(particles)  # 개인 최적 위치
p_best_scores = np.full(NUM_PARTICLES, -np.inf)  # 개인 최적 점수 (초기값 -무한대)
g_best_position = None  # 전역 최적 위치
g_best_score = -np.inf  # 전역 최적 점수

# 특징 선택 후 모델 평가 함수
def evaluate_feature_subset(feature_mask):
    selected_features = np.where(feature_mask == 1)[0]  # 선택된 feature 인덱스 찾기
    if len(selected_features) == 0:
        return 0  # 아무 feature도 선택되지 않으면 점수 0 반환
    X_train_selected = X_train.iloc[:, selected_features]
    X_valid_selected = X_valid.iloc[:, selected_features]

    # LightGBM 모델 학습
    model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_selected, y_train)
    y_pred = model.predict_proba(X_valid_selected)[:, 1]  # 확률 예측

    return roc_auc_score(y_valid, y_pred)  # AUC 점수 반환

# PSO 최적화 과정
for iteration in range(MAX_ITER):
    for i in range(NUM_PARTICLES):
        score = evaluate_feature_subset(particles[i])
        
        # 개인 최적 해 업데이트
        if score > p_best_scores[i]:
            p_best_scores[i] = score
            p_best_positions[i] = particles[i].copy()

        # 전역 최적 해 업데이트
        if score > g_best_score:
            g_best_score = score
            g_best_position = particles[i].copy()

    # 입자 속도 및 위치 업데이트
    for i in range(NUM_PARTICLES):
        r1, r2 = np.random.rand(), np.random.rand()  # 무작위 값 (0~1)
        velocities[i] = (
            W * velocities[i] +
            C1 * r1 * (p_best_positions[i] - particles[i]) +
            C2 * r2 * (g_best_position - particles[i])
        )
        
        # 시그모이드 변환을 통해 확률적으로 0 또는 1 결정
        probabilities = 1 / (1 + np.exp(-velocities[i]))
        particles[i] = (np.random.rand(num_features) < probabilities).astype(int)

    print(f"Iteration {iteration+1}/{MAX_ITER}: Best AUC = {g_best_score:.4f}")

# 최적 특징 선택 결과
selected_features = np.where(g_best_position == 1)[0]
print(f"최적 선택된 특징 개수: {len(selected_features)}")
print(f"최적 특징 인덱스: {selected_features}")

[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 362
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 34
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.258348 -> initscore=-1.054573
[LightGBM] [Info] Start training from score -1.054573
[LightGBM] [Info] Number of positive: 52982, number of negative: 152098
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 449
[LightGBM] [Info] Number of data points in the train set: 205080, number of used features: 26
[LightGBM] [Info

In [13]:
selected_features = [0, 1, 3, 6, 7, 8, 9, 10, 15, 16, 19, 20, 23, 25, 26, 27, 28, 30, 31, 33, 34, 36, 38, 39, 40, 41, 42, 43, 44, 47, 48, 50, 52, 55, 56]
df = df.iloc[:, selected_features]  
df

Unnamed: 0,시술 시기 코드,시술 당시 나이,특정 시술 유형,단일 배아 이식 여부,착상 전 유전 진단 사용 여부,남성 주 불임 원인,남성 부 불임 원인,여성 주 불임 원인,불임 원인 - 난관 질환,불임 원인 - 남성 요인,...,저장된 배아 수,미세주입 후 저장된 배아 수,해동된 배아 수,해동 난자 수,혼합된 난자 수,파트너 정자와 혼합된 난자 수,난자 출처,동결 배아 사용 여부,대리모 여부,배아 이식 경과일
0,6,0,1.0,0.0,0.0,0,0,0,0,1,...,2.0,2.0,0.0,0.0,5.0,5.0,1,0.0,0.0,3.0
1,5,5,1.0,0.0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,1.0,1,0.0,0.0,3.0
2,3,0,0.0,0.0,0.0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,7.0,7.0,1,0.0,0.0,2.0
3,2,1,1.0,0.0,0.0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,4.0,4.0,1,0.0,0.0,3.0
4,3,0,1.0,0.0,0.0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,6.0,6.0,1,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
256346,5,0,1.0,0.0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,8.0,8.0,1,0.0,0.0,5.0
256347,5,2,1.0,0.0,0.0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,3.0,3.0,1,0.0,0.0,3.0
256348,3,1,1.0,0.0,0.0,0,0,0,0,1,...,0.0,0.0,0.0,0.0,6.0,6.0,1,0.0,0.0,3.0
256349,6,2,1.0,0.0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,2.0,2.0,1,0.0,0.0,1.0


In [17]:
ntarget = pd.read_csv(r'C:\Users\User\LG_Aimers\MainTask\other.csv')

In [19]:
target = ntarget['임신 성공 여부']

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score

# 이미 PSO 선택된 특징이 df에 포함되어 있다고 가정
# 'target'이 타겟 변수 (IVF 성공 여부와 같은)라고 가정
X = df  # 'target'을 제외한 모든 열을 특징(X)으로 사용
y = target  # 타겟 변수(y)

# 훈련/검증 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 결정 트리 모델 정의 및 학습
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_valid)
y_prob = model.predict_proba(X_valid)[:, 1]  # AUC-ROC 계산을 위한 확률값

# 성능 평가
accuracy = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_prob)

# 결과 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# 추가적인 평가 지표 출력
print("\nClassification Report:")
print(classification_report(y_valid, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_valid, y_pred))


Accuracy: 0.6730
F1-score: 0.3705
AUC-ROC: 0.5810

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.78      0.78     38025
           1       0.37      0.37      0.37     13246

    accuracy                           0.67     51271
   macro avg       0.57      0.58      0.57     51271
weighted avg       0.67      0.67      0.67     51271


Confusion Matrix:
[[29573  8452]
 [ 8313  4933]]


In [48]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score


# 이미 PSO 선택된 특징이 df에 포함되어 있다고 가정
# 'target'이 타겟 변수 (IVF 성공 여부와 같은)라고 가정
X = df  # 'target'을 제외한 모든 열을 특징(X)으로 사용
y = target  # 타겟 변수(y)

# 훈련/검증 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 랜덤 포레스트 모델 정의 및 학습
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_valid)
y_prob = model.predict_proba(X_valid)[:, 1]  # AUC-ROC 계산을 위한 확률값

# 성능 평가
accuracy = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_prob)

# 결과 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# 추가적인 평가 지표 출력
print("\nClassification Report:")
print(classification_report(y_valid, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_valid, y_pred))


Accuracy: 0.7237
F1-score: 0.3217
AUC-ROC: 0.7017

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.89      0.83     38025
           1       0.44      0.25      0.32     13246

    accuracy                           0.72     51271
   macro avg       0.61      0.57      0.57     51271
weighted avg       0.69      0.72      0.70     51271


Confusion Matrix:
[[33748  4277]
 [ 9887  3359]]


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score

# 이미 PSO 선택된 특징이 df에 포함되어 있다고 가정
# 'target'이 타겟 변수 (IVF 성공 여부와 같은)라고 가정
X = df# 'target'을 제외한 모든 열을 특징(X)으로 사용
y = target  # 타겟 변수(y)

# 훈련/검증 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# LightGBM 데이터셋 준비
train_data = lgb.Dataset(X_train, label=y_train)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# LightGBM 하이퍼파라미터 설정
params = {
    'objective': 'binary',  # 이진 분류
    'metric': 'binary_error',  # 평가 지표
    'boosting_type': 'gbdt',  # GBDT 부스팅 타입
    'num_leaves': 31,  # 리프의 수
    'learning_rate': 0.05,  # 학습률
    'feature_fraction': 0.9,  # 각 반복마다 사용할 feature 비율
    'verbose': -1  # 로그 레벨
}

# 모델 학습
model = lgb.train(params, train_data, valid_sets=[valid_data], num_boost_round=1000)

# 예측
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
y_pred_bin = (y_pred > 0.5).astype(int)  # 확률을 0.5를 기준으로 이진 분류

# 성능 평가
accuracy = accuracy_score(y_valid, y_pred_bin)
f1 = f1_score(y_valid, y_pred_bin)
roc_auc = roc_auc_score(y_valid, y_pred)

# 결과 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# 추가적인 평가 지표 출력
print("\nClassification Report:")
print(classification_report(y_valid, y_pred_bin))

print("\nConfusion Matrix:")
print(confusion_matrix(y_valid, y_pred_bin))


Accuracy: 0.7476
F1-score: 0.2207
AUC-ROC: 0.7446

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.96      0.85     38025
           1       0.55      0.14      0.22     13246

    accuracy                           0.75     51271
   macro avg       0.65      0.55      0.54     51271
weighted avg       0.71      0.75      0.69     51271


Confusion Matrix:
[[36498  1527]
 [11414  1832]]


In [78]:
test = pd.read_csv(r'C:\Users\User\LG_Aimers\MainTask\other_test.csv')

In [80]:
y_pred = model.predict(test)

In [82]:
prediction = pd.DataFrame(y_pred)

In [84]:
prediction.to_csv('RF.csv',encoding='utf-8-sig')

In [72]:
df = pd.read_csv(r'C:\Users\User\LG_Aimers\MainTask\other.csv')

In [74]:
target = df['임신 성공 여부']
df = df.drop(columns=['임신 성공 여부'],axis=1)

In [76]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, roc_auc_score


# 이미 PSO 선택된 특징이 df에 포함되어 있다고 가정
# 'target'이 타겟 변수 (IVF 성공 여부와 같은)라고 가정
X = df  # 'target'을 제외한 모든 열을 특징(X)으로 사용
y = target  # 타겟 변수(y)

# 훈련/검증 데이터 분할
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 랜덤 포레스트 모델 정의 및 학습
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_valid)
y_prob = model.predict_proba(X_valid)[:, 1]  # AUC-ROC 계산을 위한 확률값

# 성능 평가
accuracy = accuracy_score(y_valid, y_pred)
f1 = f1_score(y_valid, y_pred)
roc_auc = roc_auc_score(y_valid, y_prob)

# 결과 출력
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

# 추가적인 평가 지표 출력
print("\nClassification Report:")
print(classification_report(y_valid, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_valid, y_pred))


Accuracy: 0.7299
F1-score: 0.3000
AUC-ROC: 0.7100

Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.91      0.83     38025
           1       0.45      0.22      0.30     13246

    accuracy                           0.73     51271
   macro avg       0.61      0.57      0.57     51271
weighted avg       0.69      0.73      0.70     51271


Confusion Matrix:
[[34458  3567]
 [10279  2967]]
