### 1. 라이브러리 import

In [1]:
import pandas as pd # pandas는 데이터를 다루기 위한 라이브러리
from sklearn.model_selection import train_test_split  # train_test_split은 데이터를 train과 test로 나누기 위한 라이브러리
from sklearn.preprocessing import StandardScaler  # StandardScaler는 데이터를 표준화하기 위한 라이브러리
from sklearn.linear_model import LogisticRegression # LogisticRegression는 로지스틱 회귀를 위한 라이브러리
from sklearn.ensemble import RandomForestClassifier # RandomForestClassifier는 랜덤 포레스트를 위한 라이브러리
from xgboost import XGBClassifier # XGBClassifier는 XGBoost를 위한 라이브러리
from sklearn.metrics import classification_report, accuracy_score # classification_report와 accuracy_score를 import


### 2. Data load and split

In [None]:
# Load DF_combined.csv from the project-relative data folder.
df_csv_path = "player_TopRate_position_JJINMAK/DF_combined.csv"
df_df = pd.read_csv(df_csv_path)

# Final expression of the cell: the notebook displays the column index.
df_df.columns

### 3. VIF 계산

In [None]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def calculate_vif(dataframe):
    """
    Compute the variance inflation factor (VIF) of every column in ``dataframe``.

    An intercept column is added before the computation so that each VIF is
    obtained from a regression that contains a constant term. The intercept
    is only a fitting aid, not a feature, so it is left out of the returned
    table; every ``Feature`` value is therefore a real column name of
    ``dataframe`` and can be used to index it.

    Args:
        dataframe (pd.DataFrame): Independent variables, one per column.

    Returns:
        pd.DataFrame: One row per column of ``dataframe`` with the columns
        ``Feature`` (column name) and ``VIF`` (its variance inflation factor).
    """
    # Add the intercept column so the auxiliary regressions include a constant.
    X = add_constant(dataframe)

    # Materialize the design matrix once instead of once per column.
    design_matrix = X.values

    # Report only columns that came from the caller; this skips the added
    # intercept while keeping its position in the design matrix intact.
    original_columns = set(dataframe.columns)
    rows = [
        (name, variance_inflation_factor(design_matrix, position))
        for position, name in enumerate(X.columns)
        if name in original_columns
    ]

    return pd.DataFrame(rows, columns=["Feature", "VIF"])

# Drop the columns that are not candidate predictors (player name, position
# indicators, rating and the isWin target) before measuring collinearity.
df_df_drop = df_df.drop(columns=['선수', "포지션_DF", "포지션_FW", "포지션_GK", "포지션_MF", "평점", "isWin"])

vif_result = calculate_vif(df_df_drop)

# Keep the features whose VIF is below 7. The intercept row ("const") is not a
# column of df_df, so it is excluded explicitly; otherwise it could end up in
# feature_list and break the later df_df[...] column selection.
has_low_vif = vif_result['VIF'] < 7
is_real_feature = vif_result['Feature'] != 'const'
vif_result = vif_result[has_low_vif & is_real_feature].reset_index(drop=True)

# Show the retained features and keep their names for the following cells.
print(vif_result)
feature_list = vif_result['Feature'].tolist()


In [None]:
# Order the retained features from the highest to the lowest value of the
# second column of vif_result.
vif_column = vif_result.columns[1]
vif_result_sorted = vif_result.sort_values(by=vif_column, ascending=False)

# Final expression of the cell: the notebook displays the sorted table.
vif_result_sorted

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Sort by VIF in ascending order before plotting.
vif_result_sorted = vif_result.sort_values(by='VIF', ascending=True)

# Horizontal bar chart: one bar per feature, bar length is the VIF.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` - confirm against the installed seaborn version.
plt.figure(figsize=(10, 6))
sns.barplot(data=vif_result_sorted, x='VIF', y='Feature', palette='viridis')
plt.xlabel('VIF')
plt.ylabel('Feature')
plt.title('VIF Values by Feature in DF')
plt.show()

In [None]:
# Discard features whose name contains '성공' (success) unless the name also
# contains '성공률' (success rate); every other feature is kept unchanged.
filtered_features = []
for feature in feature_list:
    if '성공' in feature and '성공률' not in feature:
        continue
    filtered_features.append(feature)

print(len(filtered_features))

In [None]:
# Feature matrix (the filtered columns) and target vector (isWin).
target_column = "isWin"
X = df_df[filtered_features]
y = df_df[target_column]

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# class proportions, and fix the seed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### 4. LogisticRegression learning and evaluation (RFE)
- RFE(Recursive Feature Elimination)

In [None]:
from sklearn.feature_selection import RFE
# Base estimator that RFE uses to rank the features.
# NOTE(review): the features are not standardized here, and RFE ranks by
# coefficient magnitude - confirm that ranking on unscaled columns is intended.
model = LogisticRegression(max_iter=10000)

# Recursively eliminate features until 20 remain, fitting on the training split.
selector = RFE(estimator=model, n_features_to_select=20).fit(X_train, y_train)

# Names of the columns kept by the selector (support_ is its boolean mask).
selected_features = X_train.columns[selector.support_]
print("선택된 피처들:", selected_features)

# Reduce both splits to the selected columns.
X_train_selected, X_test_selected = (
    selector.transform(split) for split in (X_train, X_test)
)

# Fit the base estimator on the reduced training data and report its
# score on the reduced test data.
model.fit(X_train_selected, y_train)
accuracy = model.score(X_test_selected, y_test)
print(f"모델 정확도: {accuracy:.4f}")

### 5. LogisticRegression learning and evaluation

In [40]:
# Learn the standardization parameters from the training split only, then
# apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [41]:
# Logistic-regression classifier with a fixed seed.
logistic_model = LogisticRegression(
    random_state=42,
)

In [42]:
# Train on the standardized training split, then predict the standardized
# test split (fit returns the estimator itself, so the calls can be chained).
logistic_preds = logistic_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# Compute the test-set metrics first, then print them.
logistic_report = classification_report(y_test, logistic_preds)
logistic_accuracy = accuracy_score(y_test, logistic_preds)

print("Logistic Regression Performance:")
print(logistic_report)
print("Accuracy:", logistic_accuracy)

### 6. RandomForest and evaluation

In [44]:
# Random-forest classifier: 100 trees, depth capped at 10, fixed seed.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)

In [45]:
# Train on the (unscaled) training split and predict the test split.
rf_preds = rf_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Compute the test-set metrics first, then print them.
rf_report = classification_report(y_test, rf_preds)
rf_accuracy = accuracy_score(y_test, rf_preds)

print("\nRandom Forest Performance:")
print(rf_report)
print("Accuracy:", rf_accuracy)

### 7. XGBoost and evaluation

In [48]:
# XGBoost classifier: 100 boosting rounds, depth capped at 10,
# learning rate 0.1, fixed seed.
xgb_model = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
)

In [49]:
# Train on the (unscaled) training split and predict the test split.
xgb_preds = xgb_model.fit(X_train, y_train).predict(X_test)

In [None]:
# Compute the test-set metrics first, then print them.
xgb_report = classification_report(y_test, xgb_preds)
xgb_accuracy = accuracy_score(y_test, xgb_preds)

print("\nXGBoost Performance:")
print(xgb_report)
print("Accuracy:", xgb_accuracy)

### 8. Logistic Regression, Random Forest, XGBoost overfitting check

In [21]:
from sklearn.metrics import confusion_matrix # confusion_matrix를 import

In [None]:
# Candidate models for the train-vs-test comparison.
from sklearn.pipeline import make_pipeline

models = {
    # Every model in this dict is fitted on the raw X_train. Logistic
    # regression is therefore bundled with a StandardScaler so that it is
    # trained on standardized features, like the logistic model in section 5,
    # instead of running the default solver budget on unscaled columns.
    "Logistic Regression": make_pipeline(
        StandardScaler(),
        LogisticRegression(random_state=42),
    ),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=100, max_depth=10, learning_rate=0.1),
}

# Metric helper used by the overfitting check.
def evaluate_model(y_true, y_pred):
    """
    Compute accuracy, precision, recall and F1 from a binary confusion matrix.

    The four cells of the matrix are unpacked as (tn, fp, fn, tp), so the
    labels must be binary. Precision, recall and F1 fall back to 0 when their
    denominator is 0.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``.
    """
    cm = confusion_matrix(y_true, y_pred)

    # When only one class occurs in both vectors the matrix collapses to 1x1
    # and cannot be unpacked into four cells. Rebuild it with the 0/1 labels
    # spelled out so the absent class contributes zero counts.
    if cm.shape == (1, 1):
        cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    tn, fp, fn, tp = cm.ravel()

    accuracy = (tp + tn) / (tn + fp + fn + tp)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return accuracy, precision, recall, f1

# Fit every candidate model and compare its metrics on the training split
# with its metrics on the test split.
metric_labels = ("Accuracy", "Precision", "Recall", "F1 Score")

for model_name, model in models.items():
    print(f"\nTraining {model_name}...")

    model.fit(X_train, y_train)

    # Predictions on the data the model has seen and on the held-out data.
    train_preds = model.predict(X_train)
    test_preds = model.predict(X_test)

    # Each tuple is (accuracy, precision, recall, f1), in metric_labels order.
    train_scores = evaluate_model(y_train, train_preds)
    test_scores = evaluate_model(y_test, test_preds)

    print(f"{model_name} Performance:")
    for label, train_score, test_score in zip(metric_labels, train_scores, test_scores):
        print(f"Train {label}: {train_score:.4f}, Test {label}: {test_score:.4f}")

    # Flag the model when training beats test by more than 0.1 on accuracy
    # or on F1.
    accuracy_gap = train_scores[0] - test_scores[0]
    f1_gap = train_scores[3] - test_scores[3]
    if accuracy_gap > 0.1 or f1_gap > 0.1:
        print("Potential Overfitting Detected!")
    else:
        print("No Significant Overfitting Observed.")

In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

### 9. Visualization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay


# Models compared in this section.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
}

# For each model: 5-fold cross-validation on the training split, then a fit
# on the whole training split and a confusion matrix for the test split.
separator = "=" * 50

for model_name, model in models.items():
    print(f"모델: {model_name}")

    # Cross-validated scores (one per fold), summarized as mean and spread.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5)
    mean_score = cv_scores.mean()
    score_spread = cv_scores.std()
    print(f"교차 검증 정확도: {mean_score:.4f} ± {score_spread:.4f}")

    # Fit on the full training split and predict the held-out test split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Plot the confusion matrix, labelled with the classes the model learned.
    cm = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(f"{model_name} Confusion Matrix")
    plt.show()

    print(separator)

### 10. XAI

In [None]:
import shap
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Data preparation.
# NOTE(review): this 70/30 non-stratified split rebinds the notebook-wide
# X_train / X_test / y_train / y_test that were created earlier with an 80/20
# stratified split, so cells run after this one see different data - confirm
# that this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Models to explain. use_label_encoder is no longer passed to XGBClassifier:
# it is deprecated, and the XGBoost model in section 9 is built without it.
models = {
    "Logistic Regression": LogisticRegression(max_iter=10000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
}

# Models explained with the tree explainer; the rest use the linear explainer.
tree_model_names = {"Random Forest", "XGBoost"}

# Fit each model, compute SHAP values on the test split and draw a summary plot.
for model_name, model in models.items():
    model.fit(X_train, y_train)

    if model_name in tree_model_names:
        explainer = shap.TreeExplainer(model)
    else:
        explainer = shap.LinearExplainer(model, X_train)

    shap_values = explainer.shap_values(X_test)

    # Depending on the shap version, a classifier may yield one set of values
    # per class: a list of 2-D arrays or a single 3-D array with the class on
    # the last axis. Keep the values of class 1 so that summary_plot always
    # receives a (samples, features) matrix matching X_test.
    if isinstance(shap_values, list):
        shap_values = shap_values[1]
    elif getattr(shap_values, "ndim", 2) == 3:
        shap_values = shap_values[:, :, 1]

    print(f"{model_name} SHAP Summary Plot")
    shap.summary_plot(shap_values, X_test)
    plt.show()

    # Optional SHAP dependence plot (relationship between one feature and its
    # SHAP values); index 0 selects the first feature.
    # shap.dependence_plot(0, shap_values, X_test)
    # plt.show()

In [None]:
from lime.lime_tabular import LimeTabularExplainer
import numpy as np

# Model to explain (example: random forest).
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Column names of the training data, shared by the explainer and the wrapper.
feature_names = list(X_train.columns)

# LIME explainer built from the training data. The seed is fixed so that the
# explanation is repeatable, like the seeded models in this notebook.
explainer = LimeTabularExplainer(
    X_train.values,
    training_labels=y_train.values,
    mode='classification',
    feature_names=feature_names,
    class_names=['0', '1'],
    discretize_continuous=True,
    random_state=42,
)


def predict_proba_with_names(rows):
    """Return class probabilities for the plain array of samples LIME passes in.

    The model was fitted on a DataFrame, so the column names are restored
    before predicting; this keeps the input consistent with the training data.
    """
    return model.predict_proba(pd.DataFrame(rows, columns=feature_names))


# Explain one sample of the test split.
i = 1  # positional index of the sample to explain
explanation = explainer.explain_instance(X_test.iloc[i].values, predict_proba_with_names)

# Render the LIME result in the notebook.
explanation.show_in_notebook(show_table=True, show_all=False)