### 1. 라이브러리 import

In [1]:
import pandas as pd # pandas는 데이터를 다루기 위한 라이브러리
from sklearn.model_selection import train_test_split  # train_test_split은 데이터를 train과 test로 나누기 위한 라이브러리
from sklearn.preprocessing import StandardScaler  # StandardScaler는 데이터를 표준화하기 위한 라이브러리
from sklearn.linear_model import LogisticRegression # LogisticRegression는 로지스틱 회귀를 위한 라이브러리
from sklearn.ensemble import RandomForestClassifier # RandomForestClassifier는 랜덤 포레스트를 위한 라이브러리
from xgboost import XGBClassifier # XGBClassifier는 XGBoost를 위한 라이브러리
from sklearn.metrics import classification_report, accuracy_score # classification_report와 accuracy_score를 import


### 2. data load and Split

In [2]:
mf_df = pd.read_csv("player_TopRate_position_JJINMAK/MF_combined.csv")
mf_df.shape

(1673, 64)

In [4]:
columns = ["isWin", "선수","포지션_DF", "포지션_FW", "포지션_GK", "포지션_MF", "평점"]

### 3. VIF 계산

In [16]:
import pandas as pd
import numpy as np
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def calculate_vif(dataframe):
    """
    주어진 데이터프레임의 VIF(분산 팽창 계수)를 계산.

    Args:
        dataframe (pd.DataFrame): 독립 변수들로 이루어진 데이터프레임.

    Returns:
        pd.DataFrame: 변수 이름과 해당 VIF 값.
    """
    # 상수항 추가 (회귀식의 절편을 고려하기 위함)
    X = add_constant(dataframe)
    
    # VIF 계산
    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return vif_data

mf_df_drop = mf_df.copy()
mf_df_drop = mf_df.drop(columns= ['선수', "포지션_DF", "포지션_FW", "포지션_GK", "포지션_MF", "평점", "isWin"])
# 예시 데이터
# df: 독립 변수들로 이루어진 데이터프레임
vif_result = calculate_vif(mf_df_drop)
vif_result = vif_result[vif_result['VIF'] < 7].reset_index(drop=True)
# VIF 결과 출력
print(vif_result)
feature_list = vif_result['Feature'].tolist()


  vif = 1. / (1. - r_squared_i)
  return 1 - self.ssr/self.centered_tss


         Feature       VIF
0           출전시간  2.595553
1             득점  2.184842
2             도움  1.239245
3          오프사이드  1.085597
4            프리킥  1.776839
5            코너킥  2.029904
6            스로인  1.599744
7         드리블 성공  3.482886
8     드리블 성공률(%)  3.322738
9         Ishome  1.042553
10      경합(지상)성공  2.438234
11  경합(지상)성공률(%)  1.481216
12      경합(공중)성공  2.042394
13  경합(공중)성공률(%)  1.635209
14          태클성공  2.589022
15      태클성공률(%)  2.450978
16          클리어링  1.575901
17          인터셉트  1.540722
18            차단  1.928038
19            획득  1.827327
20            블락  1.399481
21           볼미스  1.096616
22            파울  1.197122
23           피파울  1.168166
24            경고  1.109123
25            퇴장  1.042681
26      패스성공률(%)  6.621253
27           키패스  1.966175
28      공격진영패스성공  2.784978
29  공격진영패스성공률(%)  1.476409
30      수비진영패스성공  2.549316
31  수비진영패스성공률(%)  1.398075
32     롱패스성공률(%)  1.637549
33   중거리패스성공률(%)  2.423263
34     숏패스성공률(%)  3.197223
35    전진패스성공률(%)  2.433153
3

In [None]:
# '성공'이 포함되었지만 '성공률'이 없는 항목을 제외한 리스트 만들기
filtered_features = [
    feature for feature in feature_list 
    if ('성공' in feature and '성공률' in feature) or '성공률' not in feature
]

print(len(filtered_features))

42


In [None]:
X = mf_df[filtered_features]
y = mf_df["isWin"]  # 타겟 변수

# 학습/검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

### 4. LogisticRegression learning and evalutaion (RFE)
- RFE(Recursive Feature Elimination)

In [69]:
from sklearn.feature_selection import RFE
# 모델 선택
model = LogisticRegression(max_iter=10000)

# RFE를 통한 피처 선택
selector = RFE(estimator=model, n_features_to_select=10)  # 선택할 피처 수 설정
selector = selector.fit(X_train, y_train)

# 선택된 피처 출력
selected_features = X_train.columns[selector.support_]  # 선택된 피처들
print("선택된 피처들:", selected_features)

# 선택된 피처를 사용한 모델 학습
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# 모델 학습
model.fit(X_train_selected, y_train)

# 모델 성능 평가
accuracy = model.score(X_test_selected, y_test)
print(f"모델 정확도: {accuracy:.4f}")

선택된 피처들: Index(['평점', '득점', '도움', '슈팅', '블락된 슈팅', '오프사이드', '드리블 성공', '클리어링', '파울',
       '키패스'],
      dtype='object')
모델 정확도: 0.9462


### 5. LogisticRegression learning and evalutaion 

In [12]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
logistic_model = LogisticRegression(random_state=42)

In [14]:
logistic_model.fit(X_train_scaled, y_train)
logistic_preds = logistic_model.predict(X_test_scaled)

In [15]:
print("Logistic Regression Performance:")
print(classification_report(y_test, logistic_preds))
print("Accuracy:", accuracy_score(y_test, logistic_preds))

Logistic Regression Performance:
              precision    recall  f1-score   support

           0       0.93      0.86      0.89       169
           1       0.87      0.93      0.90       166

    accuracy                           0.90       335
   macro avg       0.90      0.90      0.90       335
weighted avg       0.90      0.90      0.90       335

Accuracy: 0.8955223880597015


### 6. RandomForest and evalutaion 

In [59]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)

In [60]:
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)

In [61]:
print("\nRandom Forest Performance:")
print(classification_report(y_test, rf_preds))
print("Accuracy:", accuracy_score(y_test, rf_preds))


Random Forest Performance:
              precision    recall  f1-score   support

           0       0.99      0.91      0.94       169
           1       0.91      0.99      0.95       166

    accuracy                           0.95       335
   macro avg       0.95      0.95      0.95       335
weighted avg       0.95      0.95      0.95       335

Accuracy: 0.9462686567164179


### 7. XGBoost and evalutaion 

In [62]:
xgb_model = XGBClassifier(random_state=42, n_estimators=100, max_depth=10, learning_rate=0.1)

In [63]:
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)

In [64]:
print("\nXGBoost Performance:")
print(classification_report(y_test, xgb_preds))
print("Accuracy:", accuracy_score(y_test, xgb_preds))


XGBoost Performance:
              precision    recall  f1-score   support

           0       0.97      0.91      0.94       169
           1       0.91      0.97      0.94       166

    accuracy                           0.94       335
   macro avg       0.94      0.94      0.94       335
weighted avg       0.94      0.94      0.94       335

Accuracy: 0.9373134328358209


In [65]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

### 8. Logistic, Random, XGBoost overfit check

In [52]:
# 모델 초기화
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
    "XGBoost": XGBClassifier(random_state=42, n_estimators=100, max_depth=10, learning_rate=0.1),
}

# 성능 평가 함수 정의
def evaluate_model(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    
    accuracy = (tp + tn) / (tn + fp + fn + tp)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return accuracy, precision, recall, f1

# 모델 학습 및 평가
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    
    # 학습
    model.fit(X_train, y_train)
    
    # 학습 데이터 성능
    train_preds = model.predict(X_train)
    train_accuracy, train_precision, train_recall, train_f1 = evaluate_model(y_train, train_preds)
    
    # 테스트 데이터 성능
    test_preds = model.predict(X_test)
    test_accuracy, test_precision, test_recall, test_f1 = evaluate_model(y_test, test_preds)
    
    # 성능 비교 출력
    print(f"{model_name} Performance:")
    print(f"Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}")
    print(f"Train Precision: {train_precision:.4f}, Test Precision: {test_precision:.4f}")
    print(f"Train Recall: {train_recall:.4f}, Test Recall: {test_recall:.4f}")
    print(f"Train F1 Score: {train_f1:.4f}, Test F1 Score: {test_f1:.4f}")
    
    # 과적합 여부 확인
    if train_accuracy - test_accuracy > 0.1 or train_f1 - test_f1 > 0.1:
        print("Potential Overfitting Detected!")
    else:
        print("No Significant Overfitting Observed.")


Training Logistic Regression...
Logistic Regression Performance:
Train Accuracy: 0.7877, Test Accuracy: 0.8000
Train Precision: 0.7722, Test Precision: 0.7797
Train Recall: 0.8117, Test Recall: 0.8313
Train F1 Score: 0.7915, Test F1 Score: 0.8047
No Significant Overfitting Observed.

Training Random Forest...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Random Forest Performance:
Train Accuracy: 0.9978, Test Accuracy: 0.9522
Train Precision: 0.9955, Test Precision: 0.9261
Train Recall: 1.0000, Test Recall: 0.9819
Train F1 Score: 0.9977, Test F1 Score: 0.9532
No Significant Overfitting Observed.

Training XGBoost...
XGBoost Performance:
Train Accuracy: 1.0000, Test Accuracy: 0.9373
Train Precision: 1.0000, Test Precision: 0.9096
Train Recall: 1.0000, Test Recall: 0.9699
Train F1 Score: 1.0000, Test F1 Score: 0.9388
No Significant Overfitting Observed.
