In [None]:
# 기여도 계산하는 코드입니다.
import pandas as pd
import numpy as np
import os

# Input CSV locations: one player-stats file, one lineup file per season,
# and one match-results file per season.
stats_path = "/content/Dataset/players_with_converted_names.csv"
lineup_path = [
    "/content/Dataset/epl20.csv",
    "/content/Dataset/epl21.csv",
    "/content/Dataset/epl22.csv",
    "/content/Dataset/epl23.csv",
    "/content/Dataset/epl24.csv",
]
match_paths = [
    "/content/Dataset/england-premier-league-matches-2020-to-2021-stats.csv",
    "/content/Dataset/england-premier-league-matches-2021-to-2022-stats.csv",
    "/content/Dataset/england-premier-league-matches-2022-to-2023-stats.csv",
    "/content/Dataset/england-premier-league-matches-2023-to-2024-stats.csv",
    "/content/Dataset/england-premier-league-matches-2024-to-2025-stats.csv",
]

# Read every file, then stack the per-season frames into a single lineup
# table and a single match table, each with a fresh 0..n-1 index.
player_df = pd.read_csv(stats_path)
lineup_df_list = list(map(pd.read_csv, lineup_path))
match_df_list = list(map(pd.read_csv, match_paths))
lineup_df = pd.concat(lineup_df_list, ignore_index=True)
match_df = pd.concat(match_df_list, ignore_index=True)

# ----------------------------
# 1. Clean names and unify column names
# ----------------------------
# Source column name -> short name used by the rest of this cell.
# The last entry maps a column to itself and therefore changes nothing.
stat_column_aliases = {
    'full_name': 'player_name',
    'Current Club': 'team',
    'goals_overall': 'Goals',
    'assists_overall': 'Assists',
    'shots_total_overall': 'Shots',
    'xg_per_90_overall': 'xG_per90',
    'key_passes_per_90_overall': 'KeyPasses_per90',
    'tackles_per_90_overall': 'Tackles_per90',
    'interceptions_per_90_overall': 'Interceptions_per90',
    'clearances_per_90_overall': 'Clearances_per90',
    'aerial_duels_won_per_90_overall': 'AerialsWon_per90',
    'saves_per_90_overall': 'Saves_per90',
    'clean_sheets_overall': 'CleanSheets',
    'conceded_per_90_overall': 'GoalsConceded_per90',
    'minutes_played_overall': 'minutes_played_overall',
}
player_df.rename(columns=stat_column_aliases, inplace=True)

# ----------------------------
# 2. Clean dates and attach match results
# ----------------------------
# Parse the raw date column of each table (unparseable values become NaT)
# and derive a calendar-day 'match_date' column used as the join key.
# NOTE(review): the two tables are joined on calendar day only; confirm that
# lineup 'date' and match 'date_GMT' are expressed in the same time zone.
for frame, raw_date_column in ((lineup_df, 'date'), (match_df, 'date_GMT')):
    frame[raw_date_column] = pd.to_datetime(frame[raw_date_column], errors='coerce')
    frame['match_date'] = frame[raw_date_column].dt.date

# Only the columns needed to attach a result to a lineup row.
lookup_columns = [
    'match_date',
    'home_team_name',
    'away_team_name',
    'home_team_goal_count',
    'away_team_goal_count',
]
match_lookup = match_df[lookup_columns]

# Team-name mapping: keys are looked up in lineup_df['team'] and replaced by
# the value, which is what the merges below compare against
# home_team_name / away_team_name.
# Entries whose key equals its value change nothing; they only list the club.
# Series.replace leaves any name that is not a key here untouched.
team_name_map = {
    'West Ham': 'West Ham United',
    'Man City': 'Manchester City',
    'Man Utd': 'Manchester United',
    'Spurs': 'Tottenham Hotspur',
    'Wolves': 'Wolverhampton Wanderers',
    'Brighton': 'Brighton and Hove Albion',
    'Newcastle': 'Newcastle United',
    'Sheffield Utd': 'Sheffield United',
    'Nottingham Forest': 'Nottingham Forest',
    'Aston Villa': 'Aston Villa',
    'Brentford': 'Brentford',
    'Burnley': 'Burnley',
    'Bournemouth': 'Bournemouth',
    'Chelsea': 'Chelsea',
    'Crystal Palace': 'Crystal Palace',
    'Everton': 'Everton',
    'Fulham': 'Fulham',
    'Liverpool': 'Liverpool',
    'Arsenal': 'Arsenal',
    'Luton': 'Luton Town'
}
lineup_df['team_mapped'] = lineup_df['team'].replace(team_name_map)

# Merge pass 1 (home side first): treat every lineup row as the home team and
# look for a fixture on the same calendar day whose home team matches.
lineup_with_result = lineup_df.merge(
    match_lookup, how='left',
    left_on=['match_date', 'team_mapped'],
    right_on=['match_date', 'home_team_name']
)
# A missing home_team_goal_count marks rows that found no home fixture
# (a matched fixture without a recorded score lands in this group as well).
lineup_remain = lineup_with_result[lineup_with_result['home_team_goal_count'].isna()].copy()
lineup_matched = lineup_with_result[~lineup_with_result['home_team_goal_count'].isna()].copy()

# Merge pass 2 (away side): the fixture columns left over from pass 1 are
# dropped first so the second merge does not produce suffixed duplicates.
lineup_remain.drop(columns=[
    'home_team_name', 'away_team_name',
    'home_team_goal_count', 'away_team_goal_count'
], inplace=True)

lineup_with_result_away = lineup_remain.merge(
    match_lookup, how='left',
    left_on=['match_date', 'team_mapped'],
    right_on=['match_date', 'away_team_name']
)

# Stack the home-side matches and the away-side retries; rows that matched in
# neither pass keep NaN in the fixture columns.
full_lineup_result = pd.concat([lineup_matched, lineup_with_result_away], ignore_index=True)

# ----------------------------
# 3. Match Impact (MI)
# ----------------------------
# A lineup row is a win when the player's club is the home side and the home
# side scored more, or the mirror-image away case.
played_at_home = full_lineup_result['team_mapped'] == full_lineup_result['home_team_name']
played_away = full_lineup_result['team_mapped'] == full_lineup_result['away_team_name']
home_goals = full_lineup_result['home_team_goal_count']
away_goals = full_lineup_result['away_team_goal_count']
full_lineup_result['win'] = (
    (played_at_home & (home_goals > away_goals)) |
    (played_away & (away_goals > home_goals))
).astype(int)

# Rows without a known score (no fixture was found for them, or the fixture
# has no recorded result) carry no outcome. Counting them as appearances
# would silently treat them as non-wins and deflate MI, so only rows with a
# known score enter the win rate.
has_result = home_goals.notna() & away_goals.notna()
mi_df = (
    full_lineup_result[has_result]
    .groupby('player_name')['win']
    .agg(['sum', 'count'])
    .reset_index()
)
mi_df.columns = ['player_name', 'wins', 'appearances']
# MI = share of a player's scored appearances that ended in a win.
mi_df['MI'] = (mi_df['wins'] / mi_df['appearances']).fillna(0)

# ----------------------------
# 4. AC / DC / contribution score
# ----------------------------
# Attach each player's Match Impact; players absent from mi_df get 0.
player_df = player_df.merge(mi_df[['player_name', 'MI']], how='left', on='player_name')
player_df['MI'] = player_df['MI'].fillna(0)

# Convert season totals into per-90-minute rates.
for total_column in ('Goals', 'Assists', 'Shots', 'CleanSheets'):
    player_df[f'{total_column}_per90'] = (
        player_df[total_column] / player_df['minutes_played_overall'] * 90
    )

# Division by zero minutes yields +/-inf or NaN; both are turned into 0.
# fillna is applied to the whole table, so every missing value in every
# column becomes 0 here.
player_df.replace([np.inf, -np.inf], np.nan, inplace=True)
player_df.fillna(0, inplace=True)

# Goals scored above expected goals, with the downside capped at -0.5.
player_df['FinishingEfficiency'] = (
    player_df['Goals_per90'] - player_df['xG_per90']
).clip(lower=-0.5)

# Attacking (AC) and defensive (DC) contribution start at 0 and are filled in
# per position group below; groups that get no formula keep 0.
player_df['AC'] = 0.0
player_df['DC'] = 0.0

# na=False on every mask: a non-string position value makes str.contains
# return NaN, and a boolean mask containing NaN cannot be used with .loc.
# Treating such rows as "not in this group" leaves their AC/DC at 0.

# Forwards: attacking contribution only.
fw_mask = player_df['position'].str.contains('Forward', case=False, na=False)
player_df.loc[fw_mask, 'AC'] = (
    0.45 * player_df.loc[fw_mask, 'Goals_per90'] +
    0.25 * player_df.loc[fw_mask, 'xG_per90'] +
    0.15 * player_df.loc[fw_mask, 'Assists_per90'] +
    0.05 * player_df.loc[fw_mask, 'Shots_per90'] +
    0.10 * player_df.loc[fw_mask, 'FinishingEfficiency']
)

# Midfielders: both an attacking and a defensive contribution.
mf_mask = player_df['position'].str.contains('Midfielder', case=False, na=False)
player_df.loc[mf_mask, 'AC'] = (
    0.3 * player_df.loc[mf_mask, 'Goals_per90'] +
    0.3 * player_df.loc[mf_mask, 'Assists_per90'] +
    0.4 * player_df.loc[mf_mask, 'KeyPasses_per90']
)
player_df.loc[mf_mask, 'DC'] = (
    0.6 * player_df.loc[mf_mask, 'Tackles_per90'] +
    0.4 * player_df.loc[mf_mask, 'Interceptions_per90']
)

# Defenders: defensive contribution only.
df_mask = player_df['position'].str.contains('Defender', case=False, na=False)
player_df.loc[df_mask, 'DC'] = (
    0.3 * player_df.loc[df_mask, 'Tackles_per90'] +
    0.3 * player_df.loc[df_mask, 'Interceptions_per90'] +
    0.25 * player_df.loc[df_mask, 'Clearances_per90'] +
    0.15 * player_df.loc[df_mask, 'AerialsWon_per90']
)

# Goalkeepers: saves and clean sheets add, goals conceded subtract.
gk_mask = player_df['position'].str.contains('Goalkeeper', case=False, na=False)
player_df.loc[gk_mask, 'DC'] = (
    0.4 * player_df.loc[gk_mask, 'Saves_per90'] +
    0.4 * player_df.loc[gk_mask, 'CleanSheets_per90'] -
    0.2 * player_df.loc[gk_mask, 'GoalsConceded_per90']
)

# Overall score: 40% attack, 40% defence, 20% match impact.
player_df['contribution_score'] = (
    0.4 * player_df['AC'] +
    0.4 * player_df['DC'] +
    0.2 * player_df['MI']
)

# Min-max scale the score to 0..100.
min_score = player_df['contribution_score'].min()
max_score = player_df['contribution_score'].max()
score_range = max_score - min_score
if score_range > 0:
    player_df['contribution_score_normalized'] = (
        (player_df['contribution_score'] - min_score) /
        score_range
    ) * 100
else:
    # Every player has the same score (or the table is empty): the usual
    # formula would divide by zero and fill the column with NaN.
    player_df['contribution_score_normalized'] = 0.0

# ----------------------------
# 5. Save the result
# ----------------------------
output_path = "/content/Dataset/final_contribution_scores_1season.csv"
player_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print(f"✅ 저장 완료: {output_path}")


✅ 저장 완료: /content/Dataset/final_contribution_scores_1season.csv


In [None]:
# train.csv 만들기
import pandas as pd
import os

# Make sure the dataset directory exists.
os.makedirs("/content/Dataset", exist_ok=True)

# Merged lineup + match-result file, one row per player per match.
full_result_path = "/content/Dataset/full_lineup_result_fixed.csv"
df = pd.read_csv(full_result_path)

# Reduce match_date to a plain calendar date.
df['match_date'] = pd.to_datetime(df['match_date']).dt.date

# Build a per-match key of the form "<date>_<home>_vs_<away>".
date_part = df['match_date'].astype(str)
fixture_part = df['home_team_name'] + "_vs_" + df['away_team_name']
df['match_id'] = date_part + "_" + fixture_part

# 결과 라벨링: 2 = 홈승, 1 = 무승부, 0 = 원정승
def label_result(row):
    """Encode a match outcome from the home side's point of view.

    Args:
        row: Mapping-like object with 'home_team_goal_count' and
            'away_team_goal_count'.

    Returns:
        2 if the home side scored more, 0 if the away side scored more,
        otherwise 1. Any pair that compares neither greater nor smaller
        (equal scores, or a NaN score) falls into the last case.
    """
    home_goals = row['home_team_goal_count']
    away_goals = row['away_team_goal_count']
    if home_goals > away_goals:
        return 2
    if home_goals < away_goals:
        return 0
    return 1

# Attach the outcome label to every lineup row.
df['result'] = df.apply(label_result, axis=1)

# Split the rows by the side the player's club was on.
on_home_side = df['team_mapped'] == df['home_team_name']
on_away_side = df['team_mapped'] == df['away_team_name']
home_df = df[on_home_side]
away_df = df[on_away_side]


def _join_players(names):
    """Collapse one match's player names into a comma-separated string."""
    return ', '.join(names)


# Home side: one row per match with team names, label and lineup string.
home_group = (
    home_df.groupby('match_id')
    .agg({
        'home_team_name': 'first',
        'away_team_name': 'first',
        'result': 'first',
        'player_name': _join_players,
    })
    .reset_index()
    .rename(columns={
        'home_team_name': 'home_team',
        'away_team_name': 'away_team',
        'player_name': 'home_lineup',
    })
)

# Away side: only the lineup string is needed.
away_group = (
    away_df.groupby('match_id')
    .agg({'player_name': _join_players})
    .reset_index()
    .rename(columns={'player_name': 'away_lineup'})
)

# Inner join: only matches that have both a home and an away lineup remain.
train_df = home_group.merge(away_group, on='match_id')
train_df = train_df[['home_team', 'home_lineup', 'away_team', 'away_lineup', 'result']]

# Save
output_path = "/content/Dataset/train_contrib_based.csv"
train_df.to_csv(output_path, index=False, encoding="utf-8-sig")
print("✅ train.csv 생성 완료:", output_path)


✅ train.csv 생성 완료: /content/Dataset/train_contrib_based.csv


In [None]:

import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Base directory (created if missing).
base_path = "/content/Dataset"
os.makedirs(base_path, exist_ok=True)

# Input and output files, all located under base_path.
full_result_path = base_path + "/full_lineup_result_fixed.csv"
contrib_path = base_path + "/final_contribution_scores_1season.csv"
train_csv_path = base_path + "/train_contrib_based.csv"
model_path = base_path + "/contrib_rf_model.pkl"

# ----------------------------
# Step 1. Build train.csv
# ----------------------------
df = pd.read_csv(full_result_path)
df['match_date'] = pd.to_datetime(df['match_date']).dt.date
# Per-match key of the form "<date>_<home>_vs_<away>".
date_part = df['match_date'].astype(str)
fixture_part = df['home_team_name'] + "_vs_" + df['away_team_name']
df['match_id'] = date_part + "_" + fixture_part

def label_result(row):
    """Return 2 for a home win, 0 for an away win and 1 otherwise.

    `row` must provide 'home_team_goal_count' and 'away_team_goal_count'.
    Equal scores, and scores that compare neither way (NaN), yield 1.
    """
    home_goals, away_goals = row['home_team_goal_count'], row['away_team_goal_count']
    return 2 if home_goals > away_goals else (0 if home_goals < away_goals else 1)

# Attach the outcome label to every lineup row.
df['result'] = df.apply(label_result, axis=1)

# Split the rows by the side the player's club was on.
on_home_side = df['team_mapped'] == df['home_team_name']
on_away_side = df['team_mapped'] == df['away_team_name']
home_df = df[on_home_side]
away_df = df[on_away_side]


def _join_players(names):
    """Collapse one match's player names into a comma-separated string."""
    return ', '.join(names)


# Home side: one row per match with team names, label and lineup string.
home_group = (
    home_df.groupby('match_id')
    .agg({
        'home_team_name': 'first',
        'away_team_name': 'first',
        'result': 'first',
        'player_name': _join_players,
    })
    .reset_index()
    .rename(columns={
        'home_team_name': 'home_team',
        'away_team_name': 'away_team',
        'player_name': 'home_lineup',
    })
)

# Away side: only the lineup string is needed.
away_group = (
    away_df.groupby('match_id')
    .agg({'player_name': _join_players})
    .reset_index()
    .rename(columns={'player_name': 'away_lineup'})
)

# Inner join: only matches with both lineups remain.
train_df = home_group.merge(away_group, on='match_id')
train_df = train_df[['home_team', 'home_lineup', 'away_team', 'away_lineup', 'result']]
train_df.to_csv(train_csv_path, index=False, encoding='utf-8-sig')
print("✅ train.csv 생성 완료")

# ----------------------------
# Step 2. Load contribution scores and average them per lineup
# ----------------------------
contrib_df = pd.read_csv(contrib_path)

def convert_name(name):
    """Abbreviate a full name to "<Initial>. <Last>", e.g. "Declan Rice" -> "D. Rice".

    Only the first and the last whitespace-separated token are used; tokens
    in between are dropped. The last token is passed through
    str.capitalize(), so its remaining letters are lower-cased. A name with
    fewer than two tokens is returned stripped and otherwise unchanged.
    """
    trimmed = name.strip()
    tokens = trimmed.split()
    if len(tokens) < 2:
        return trimmed
    initial = tokens[0][0].upper()
    surname = tokens[-1].capitalize()
    return f"{initial}. {surname}"

def get_avg_contrib(lineup_str, contrib_df):
    """Average the normalized contribution score over a lineup.

    Args:
        lineup_str: Comma-separated player names; blank entries are ignored.
        contrib_df: Table with 'player_name' and
            'contribution_score_normalized' columns.

    Returns:
        The mean score of the players found in contrib_df, rounded to two
        decimals, or 0.0 when none is found. Each name is passed through
        convert_name and compared case-insensitively; when several rows
        match, the first one is used.
    """
    known_names = contrib_df['player_name'].str.lower()
    found_scores = []
    for raw_name in lineup_str.split(","):
        if not raw_name.strip():
            continue
        wanted = convert_name(raw_name).lower()
        hits = contrib_df[known_names == wanted]
        if hits.empty:
            continue
        found_scores.append(hits['contribution_score_normalized'].values[0])
    if not found_scores:
        return 0.0
    return round(np.mean(found_scores), 2)

def _score_lineup(lineup):
    """Average contribution score of one comma-separated lineup string."""
    return get_avg_contrib(lineup, contrib_df)


train_df['home_avg_contrib'] = train_df['home_lineup'].apply(_score_lineup)
train_df['away_avg_contrib'] = train_df['away_lineup'].apply(_score_lineup)

# ----------------------------
# Step 3. Train the model
# ----------------------------
# Two features: the average contribution of each side.
X = train_df[['home_avg_contrib', 'away_avg_contrib']]
y = train_df['result']

# Random 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
joblib.dump(model, model_path)

# ----------------------------
# Step 4. Print performance
# ----------------------------
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
# Metrics as rows, classes/averages as the index.
report_df = pd.DataFrame(report).T
print("✅ 모델 학습 및 저장 완료")
print(report_df)


✅ train.csv 생성 완료
✅ 모델 학습 및 저장 완료
              precision    recall  f1-score    support
0              0.391304  0.562500  0.461538  16.000000
1              0.428571  0.272727  0.333333  11.000000
2              0.583333  0.518519  0.549020  27.000000
accuracy       0.481481  0.481481  0.481481   0.481481
macro avg      0.467736  0.451249  0.447964  54.000000
weighted avg   0.494910  0.481481  0.479163  54.000000


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib

# File paths: the contribution scores and the fixtures to predict.
contrib_path = "/content/Dataset/final_contribution_scores_1season.csv"
test_path = "/content/Dataset/test_model.csv"

# Load the contribution scores.
contrib_df = pd.read_csv(contrib_path)

# 이름 변환 함수: "Declan Rice" → "D. Rice"
def convert_name(name):
    """Shorten a full name to initial plus last token ("Declan Rice" -> "D. Rice").

    Tokens between the first and the last are dropped, and the last token is
    normalised with str.capitalize(). Input with fewer than two tokens is
    returned stripped.
    """
    tokens = name.strip().split()
    if len(tokens) >= 2:
        first, last = tokens[0], tokens[-1]
        return f"{first[0].upper()}. {last.capitalize()}"
    return name.strip()

# 평균 기여도 계산
def get_avg_contrib(lineup_str, contrib_df):
    """Return the mean normalized contribution score of a lineup.

    lineup_str is split on commas and blank entries are skipped. Each name is
    abbreviated with convert_name and looked up case-insensitively in
    contrib_df['player_name']; the first matching row supplies the score.
    Players that are not found are left out of the mean. The result is
    rounded to two decimals, and is 0.0 when no player was found.
    """
    lowered_index = contrib_df['player_name'].str.lower()
    candidates = (convert_name(part) for part in lineup_str.split(",") if part.strip())
    collected = []
    for candidate in candidates:
        rows = contrib_df[lowered_index == candidate.lower()]
        if not rows.empty:
            collected.append(rows['contribution_score_normalized'].values[0])
    if collected:
        return round(np.mean(collected), 2)
    return 0.0

# Load the fixtures to predict and score both lineups.
test_df = pd.read_csv(test_path)
test_df['home_avg_contrib'] = test_df['home_lineup'].apply(lambda x: get_avg_contrib(x, contrib_df))
test_df['away_avg_contrib'] = test_df['away_lineup'].apply(lambda x: get_avg_contrib(x, contrib_df))

# Load the trained model; if that fails, fall back to a model fitted on
# synthetic data.
# `except Exception` instead of a bare `except` keeps KeyboardInterrupt and
# SystemExit working, and the fallback is announced because its predictions
# come from random numbers, not from real matches.
try:
    model = joblib.load("/content/Dataset/contrib_rf_model.pkl")
except Exception as load_error:
    print(f"⚠️ 모델 로드 실패({load_error!r}) - 임시 샘플 데이터로 학습한 모델을 사용합니다.")
    # Synthetic training data (temporary stand-in).
    np.random.seed(42)
    data = pd.DataFrame({
        'home_avg_contrib': np.random.normal(60, 10, 200),
        'away_avg_contrib': np.random.normal(55, 10, 200)
    })
    data['diff'] = data['home_avg_contrib'] - data['away_avg_contrib']
    # Label by the score gap: > 5 home win, within +/-5 draw, otherwise away win.
    data['result'] = data['diff'].apply(lambda x: 2 if x > 5 else (1 if -5 <= x <= 5 else 0))

    X = data[['home_avg_contrib', 'away_avg_contrib']]
    y = data['result']
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    joblib.dump(model, "/content/Dataset/contrib_rf_model.pkl")

# Predict
X_test = test_df[['home_avg_contrib', 'away_avg_contrib']]
test_df['prediction'] = model.predict(X_test)
test_df['result_label'] = test_df['prediction'].map({0: "AWAY WIN", 1: "DRAW", 2: "HOME WIN"})

# Show the result
print(test_df[['home_team', 'away_team', 'home_avg_contrib', 'away_avg_contrib', 'result_label']])


  home_team away_team  home_avg_contrib  away_avg_contrib result_label
0   Arsenal   Chelsea              9.57              10.6         DRAW


In [None]:
#최근전적 추가 모델
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

# Base directory (created if missing).
base_path = "/content/Dataset"
os.makedirs(base_path, exist_ok=True)

# Input and output files, all located under base_path.
full_result_path = base_path + "/full_lineup_result_fixed.csv"
contrib_path = base_path + "/final_contribution_scores_1season.csv"
train_csv_path = base_path + "/train_contrib_based_improved.csv"
model_path = base_path + "/contrib_rf_model_improved.pkl"

# ----------------------------
# Step 1. Load and preprocess the data
# ----------------------------
df = pd.read_csv(full_result_path)
df['match_date'] = pd.to_datetime(df['match_date']).dt.date
# Per-match key of the form "<date>_<home>_vs_<away>".
date_part = df['match_date'].astype(str)
fixture_part = df['home_team_name'] + "_vs_" + df['away_team_name']
df['match_id'] = date_part + "_" + fixture_part

def label_result(row):
    """Map a score line to a class label: 2 home win, 1 draw, 0 away win.

    `row` must provide 'home_team_goal_count' and 'away_team_goal_count'.
    When neither score is strictly greater (equal or NaN), the label is 1.
    """
    home_goals = row['home_team_goal_count']
    away_goals = row['away_team_goal_count']
    # +1 when the home side leads, -1 when the away side leads, else 0;
    # shifting by one gives the 2 / 0 / 1 labels.
    return int(home_goals > away_goals) - int(home_goals < away_goals) + 1

# Label every lineup row with its match outcome (2 = home win, 1 = draw, 0 = away win).
df['result'] = df.apply(label_result, axis=1)

# ----------------------------
# Step 2. 최근 폼 계산 함수 추가
# ----------------------------
def calculate_recent_form(df, team_name, current_date, window=5):
    """Average league points per match over a team's most recent matches.

    Args:
        df: Table with 'match_date', 'home_team_name', 'away_team_name',
            'home_team_goal_count' and 'away_team_goal_count'. It may hold
            several rows per match (one per player in the lineup).
        team_name: Team whose form is computed.
        current_date: Only matches strictly before this date are considered.
        window: Maximum number of most recent matches to use.

    Returns:
        Mean of 3 (win) / 1 (draw) / 0 (loss) over the selected matches, or
        0.0 when the team has no earlier match.
    """
    involves_team = (df['home_team_name'] == team_name) | (df['away_team_name'] == team_name)
    team_games = (
        df[involves_team & (df['match_date'] < current_date)]
        # Collapse the per-player rows to one row per match; otherwise
        # tail(window) would pick `window` player rows of a single match.
        .drop_duplicates(subset=['match_date', 'home_team_name', 'away_team_name'])
        .sort_values('match_date')
        .tail(window)
    )

    if team_games.empty:
        return 0.0

    hosted = (team_games['home_team_name'] == team_name).to_numpy()
    home_goals = team_games['home_team_goal_count'].to_numpy()
    away_goals = team_games['away_team_goal_count'].to_numpy()

    team_won = (hosted & (home_goals > away_goals)) | (~hosted & (away_goals > home_goals))
    drawn = home_goals == away_goals
    points = 3 * team_won + drawn

    return points.mean()

# ----------------------------
# Step 3. Build per-match rows (including form)
# ----------------------------
on_home_side = df['team_mapped'] == df['home_team_name']
on_away_side = df['team_mapped'] == df['away_team_name']
home_df = df[on_home_side]
away_df = df[on_away_side]


def _join_players(names):
    """Collapse one match's player names into a comma-separated string."""
    return ', '.join(names)


# Home side: one row per match with date, team names, label and lineup.
home_group = (
    home_df.groupby('match_id')
    .agg({
        'match_date': 'first',
        'home_team_name': 'first',
        'away_team_name': 'first',
        'result': 'first',
        'player_name': _join_players,
    })
    .reset_index()
    .rename(columns={
        'home_team_name': 'home_team',
        'away_team_name': 'away_team',
        'player_name': 'home_lineup',
    })
)

# Away side: only the lineup string is needed.
away_group = (
    away_df.groupby('match_id')
    .agg({'player_name': _join_players})
    .reset_index()
    .rename(columns={'player_name': 'away_lineup'})
)

# Inner join: only matches with both lineups remain.
train_df = home_group.merge(away_group, on='match_id')


def _recent_form_of(team_column):
    """Recent form, before each match, of the team named in team_column."""
    return train_df.apply(
        lambda match: calculate_recent_form(df, match[team_column], match['match_date']),
        axis=1,
    )


# Add the recent-form columns.
print("최근 폼 계산 중...")
train_df['home_recent_form'] = _recent_form_of('home_team')
train_df['away_recent_form'] = _recent_form_of('away_team')

# ----------------------------
# Step 4. Contribution scores
# ----------------------------
contrib_df = pd.read_csv(contrib_path)

def convert_name(name):
    """Turn "First [Middle ...] Last" into "F. Last".

    The initial is upper-cased and the last token goes through
    str.capitalize(); middle tokens are discarded. With fewer than two
    tokens the stripped input is returned as is.
    """
    cleaned = name.strip()
    pieces = cleaned.split()
    if len(pieces) < 2:
        return cleaned
    first, *_, last = pieces
    return f"{first[0].upper()}. {last.capitalize()}"

def get_avg_contrib(lineup_str, contrib_df):
    """Mean normalized contribution score of the players in a lineup.

    Args:
        lineup_str: Comma-separated player names (blank entries skipped).
        contrib_df: Table with 'player_name' and
            'contribution_score_normalized'.

    Returns:
        Mean over the players found (names go through convert_name and are
        matched case-insensitively, first matching row wins), rounded to two
        decimals; 0.0 when no player is found.
    """
    name_keys = contrib_df['player_name'].str.lower()

    def _lookup(raw_name):
        """Return the matching rows for one raw lineup entry."""
        return contrib_df[name_keys == convert_name(raw_name).lower()]

    matches = [_lookup(entry) for entry in lineup_str.split(",") if entry.strip()]
    scores = [
        rows['contribution_score_normalized'].values[0]
        for rows in matches
        if not rows.empty
    ]
    return round(np.mean(scores), 2) if scores else 0.0

def _score_lineup(lineup):
    """Average contribution score of one comma-separated lineup string."""
    return get_avg_contrib(lineup, contrib_df)


train_df['home_avg_contrib'] = train_df['home_lineup'].apply(_score_lineup)
train_df['away_avg_contrib'] = train_df['away_lineup'].apply(_score_lineup)

# ----------------------------
# Step 5. Derived features
# ----------------------------
# Home-minus-away gaps in contribution and in recent form.
train_df['contrib_diff'] = train_df['home_avg_contrib'] - train_df['away_avg_contrib']
train_df['form_diff'] = train_df['home_recent_form'] - train_df['away_recent_form']

# Home advantage is the same constant on every row.
train_df['home_advantage'] = 1.0

# ----------------------------
# Step 6. Train the improved model
# ----------------------------
# Seven features in total: the two original averages plus five new ones.
feature_columns = [
    'home_avg_contrib', 'away_avg_contrib',          # original
    'contrib_diff', 'form_diff', 'home_advantage',   # new
    'home_recent_form', 'away_recent_form',          # form used directly
]

# Missing feature values are replaced by 0.
X = train_df[feature_columns].fillna(0)
y = train_df['result']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Random forest with more trees and limits on depth and node size.
forest_settings = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=42,
)
model = RandomForestClassifier(**forest_settings)

model.fit(X_train, y_train)
joblib.dump(model, model_path)

# ----------------------------
# Step 7. Evaluate
# ----------------------------
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).T

print("✅ 개선된 모델 학습 완료!")
print("\n=== 성능 보고서 ===")
print(report_df)

# Feature importances, largest first.
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_,
})
feature_importance = importance_table.sort_values('importance', ascending=False)

print("\n=== 특성 중요도 ===")
print(feature_importance)

# Save the enriched training table.
train_df.to_csv(train_csv_path, index=False, encoding='utf-8-sig')
print(f"\n✅ 향상된 훈련 데이터 저장: {train_csv_path}")

# ----------------------------
# Step 8. 예측 함수 (테스트용)
# ----------------------------
def predict_match_improved(home_team, home_lineup, away_team, away_lineup, model, contrib_df, df):
    """Predict one fixture with the seven-feature model.

    Args:
        home_team: Home team name as it appears in df['home_team_name'].
        home_lineup: Comma-separated home player names.
        away_team: Away team name as it appears in df['away_team_name'].
        away_lineup: Comma-separated away player names.
        model: Fitted classifier exposing predict, predict_proba and classes_.
        contrib_df: Contribution table passed on to get_avg_contrib.
        df: Lineup/result table passed on to calculate_recent_form.

    Returns:
        Dict with the predicted label ('prediction'), the percentage per
        outcome ('probabilities'), both recent-form values rounded to two
        decimals, and both average contribution scores.
    """
    from datetime import timedelta

    # Contribution scores
    home_contrib = get_avg_contrib(home_lineup, contrib_df)
    away_contrib = get_avg_contrib(away_lineup, contrib_df)

    # Recent form as of "now", taken to be just after the newest match in df.
    # calculate_recent_form only counts matches strictly before the cutoff,
    # so the cutoff is the day after the latest match; using the latest date
    # itself would leave that day's matches out of the form window.
    cutoff_date = df['match_date'].max() + timedelta(days=1)
    home_form = calculate_recent_form(df, home_team, cutoff_date)
    away_form = calculate_recent_form(df, away_team, cutoff_date)

    # Feature vector, same columns as used for training.
    features = pd.DataFrame({
        'home_avg_contrib': [home_contrib],
        'away_avg_contrib': [away_contrib],
        'contrib_diff': [home_contrib - away_contrib],
        'form_diff': [home_form - away_form],
        'home_advantage': [1.0],
        'home_recent_form': [home_form],
        'away_recent_form': [away_form]
    })

    prediction = model.predict(features)[0]
    # predict_proba columns follow model.classes_. Pair them up by class
    # label instead of by position, so a model fitted on data that lacks one
    # outcome reports 0% for it rather than raising IndexError or shifting
    # the columns.
    probability_by_class = {
        int(label): value
        for label, value in zip(model.classes_, model.predict_proba(features)[0])
    }

    result_map = {0: "AWAY WIN", 1: "DRAW", 2: "HOME WIN"}

    return {
        'prediction': result_map[prediction],
        'probabilities': {
            outcome: round(probability_by_class.get(code, 0.0) * 100, 1)
            for code, outcome in result_map.items()
        },
        'home_form': round(home_form, 2),
        'away_form': round(away_form, 2),
        'home_contrib': home_contrib,
        'away_contrib': away_contrib
    }

# Print a usage reminder; this only shows the call signature and does not run a prediction.
print("\n=== 사용 예시 ===")
print("predict_match_improved(home_team, home_lineup, away_team, away_lineup, model, contrib_df, df)")

최근 폼 계산 중...
✅ 개선된 모델 학습 완료!

=== 성능 보고서 ===
              precision    recall  f1-score    support
0              0.375000  0.375000  0.375000  16.000000
1              0.428571  0.272727  0.333333  11.000000
2              0.645161  0.740741  0.689655  27.000000
accuracy       0.537037  0.537037  0.537037   0.537037
macro avg      0.482911  0.462823  0.465996  54.000000
weighted avg   0.520993  0.537037  0.523840  54.000000

=== 특성 중요도 ===
            feature  importance
1  away_avg_contrib    0.283322
0  home_avg_contrib    0.260117
2      contrib_diff    0.244563
3         form_diff    0.097630
6  away_recent_form    0.061417
5  home_recent_form    0.052952
4    home_advantage    0.000000

✅ 향상된 훈련 데이터 저장: /content/Dataset/train_contrib_based_improved.csv

=== 사용 예시 ===
predict_match_improved(home_team, home_lineup, away_team, away_lineup, model, contrib_df, df)


In [None]:
# 기존 코드에 추가할 고급 특성들

# ----------------------------
# 고급 특성 계산 함수들
# ----------------------------

def calculate_head_to_head(df, home_team, away_team, current_date, window=10):
    """Share of available points `home_team` took in recent meetings with `away_team`.

    Args:
        df: Table with 'match_date', 'home_team_name', 'away_team_name',
            'home_team_goal_count' and 'away_team_goal_count'. It may hold
            several rows per match (one per player in the lineup).
        home_team: Team from whose point of view points are counted.
        away_team: The opponent.
        current_date: Only meetings strictly before this date are considered.
        window: Maximum number of most recent meetings to use.

    Returns:
        Points won by home_team (3 win / 1 draw / 0 loss, at either venue)
        divided by 3 * number of meetings, or 0.0 when there is no earlier
        meeting.
    """
    same_venue = (df['home_team_name'] == home_team) & (df['away_team_name'] == away_team)
    reversed_venue = (df['home_team_name'] == away_team) & (df['away_team_name'] == home_team)
    h2h_games = (
        df[(same_venue | reversed_venue) & (df['match_date'] < current_date)]
        # One row per match: per-player rows would otherwise fill the whole
        # window with copies of the latest meeting.
        .drop_duplicates(subset=['match_date', 'home_team_name', 'away_team_name'])
        .sort_values('match_date')
        .tail(window)
    )

    total_games = len(h2h_games)
    if total_games == 0:
        return 0.0

    hosted = (h2h_games['home_team_name'] == home_team).to_numpy()
    home_goals = h2h_games['home_team_goal_count'].to_numpy()
    away_goals = h2h_games['away_team_goal_count'].to_numpy()

    team_won = (hosted & (home_goals > away_goals)) | (~hosted & (away_goals > home_goals))
    drawn = home_goals == away_goals
    home_points = int((3 * team_won + drawn).sum())

    return home_points / (total_games * 3)  # normalised to the 0..1 range

def calculate_home_away_form(df, team_name, current_date, is_home=True, window=5):
    """Average points per match over a team's recent home-only or away-only matches.

    Args:
        df: Table with 'match_date', 'home_team_name', 'away_team_name',
            'home_team_goal_count' and 'away_team_goal_count'. It may hold
            several rows per match (one per player in the lineup).
        team_name: Team whose form is computed.
        current_date: Only matches strictly before this date are considered.
        is_home: True to use the team's home matches, False for away matches.
        window: Maximum number of most recent matches to use.

    Returns:
        Mean of 3 (win) / 1 (draw) / 0 (loss) over the selected matches, or
        0.0 when there is no such match.
    """
    team_column = 'home_team_name' if is_home else 'away_team_name'
    team_games = (
        df[(df[team_column] == team_name) & (df['match_date'] < current_date)]
        # One row per match: per-player rows would otherwise make
        # tail(window) return copies of the latest match only.
        .drop_duplicates(subset=['match_date', 'home_team_name', 'away_team_name'])
        .sort_values('match_date')
        .tail(window)
    )

    if team_games.empty:
        return 0.0

    home_goals = team_games['home_team_goal_count'].to_numpy()
    away_goals = team_games['away_team_goal_count'].to_numpy()
    if is_home:
        goals_for, goals_against = home_goals, away_goals
    else:
        goals_for, goals_against = away_goals, home_goals

    points = 3 * (goals_for > goals_against) + (goals_for == goals_against)
    return points.mean()

def calculate_goal_stats(df, team_name, current_date, window=10):
    """Goals scored and conceded per match over a team's most recent matches.

    Args:
        df: Table with 'match_date', 'home_team_name', 'away_team_name',
            'home_team_goal_count' and 'away_team_goal_count'. It may hold
            several rows per match (one per player in the lineup).
        team_name: Team whose statistics are computed.
        current_date: Only matches strictly before this date are considered.
        window: Maximum number of most recent matches to use.

    Returns:
        Tuple (goals_for_per_match, goals_against_per_match); (0.0, 0.0) when
        the team has no earlier match.
    """
    involves_team = (df['home_team_name'] == team_name) | (df['away_team_name'] == team_name)
    team_games = (
        df[involves_team & (df['match_date'] < current_date)]
        # One row per match: per-player rows would otherwise make
        # tail(window) return copies of the latest match only.
        .drop_duplicates(subset=['match_date', 'home_team_name', 'away_team_name'])
        .sort_values('match_date')
        .tail(window)
    )

    n_games = len(team_games)
    if n_games == 0:
        return 0.0, 0.0

    hosted = (team_games['home_team_name'] == team_name).to_numpy()
    home_goals = team_games['home_team_goal_count'].to_numpy()
    away_goals = team_games['away_team_goal_count'].to_numpy()

    # The team's own goals are the home goals where it hosted and the away
    # goals where it visited; the opponent's goals are the other column.
    goals_for = home_goals[hosted].sum() + away_goals[~hosted].sum()
    goals_against = away_goals[hosted].sum() + home_goals[~hosted].sum()

    return goals_for / n_games, goals_against / n_games

def calculate_position_based_contrib(lineup_str, contrib_df):
    """Average normalized contribution score per position group of a lineup.

    Args:
        lineup_str: Comma-separated player names (blank entries skipped).
        contrib_df: Table with 'player_name', 'position' and
            'contribution_score_normalized'.

    Returns:
        Dict with 'forward_contrib', 'midfielder_contrib', 'defender_contrib'
        and 'goalkeeper_contrib'. A group with no matched player gets 0.
        Names go through convert_name and are matched case-insensitively
        (first matching row wins). The position substring test is
        case-sensitive, and a player is counted in the first group, in the
        order above, whose label appears in the position text.
    """
    # Insertion order doubles as the priority order of the checks.
    scores_by_group = {'Forward': [], 'Midfielder': [], 'Defender': [], 'Goalkeeper': []}
    name_keys = contrib_df['player_name'].str.lower()

    for entry in lineup_str.split(","):
        if not entry.strip():
            continue
        player_data = contrib_df[name_keys == convert_name(entry).lower()]
        if player_data.empty:
            continue
        position = player_data['position'].values[0]
        contrib = player_data['contribution_score_normalized'].values[0]
        for group_label, bucket in scores_by_group.items():
            if group_label in position:
                bucket.append(contrib)
                break

    return {
        f'{group_label.lower()}_contrib': np.mean(bucket) if bucket else 0
        for group_label, bucket in scores_by_group.items()
    }

# ----------------------------
# Add the advanced features to the existing train_df
# ----------------------------

print("고급 특성 계산 중...")


def _h2h(match):
    """Head-to-head share of points for the home team before this match."""
    return calculate_head_to_head(df, match['home_team'], match['away_team'], match['match_date'])


def _venue_form(team_column, is_home):
    """Build a row function computing home-only or away-only form."""
    def _form(match):
        return calculate_home_away_form(df, match[team_column], match['match_date'], is_home=is_home)
    return _form


def _goal_stats(team_column):
    """Build a row function returning (goals for, goals against) per match."""
    def _stats(match):
        return calculate_goal_stats(df, match[team_column], match['match_date'])
    return _stats


# 1. Head-to-head record
train_df['h2h_advantage'] = train_df.apply(_h2h, axis=1)

# 2. Venue-specific form: home team at home, away team away
train_df['home_home_form'] = train_df.apply(_venue_form('home_team', True), axis=1)
train_df['away_away_form'] = train_df.apply(_venue_form('away_team', False), axis=1)

# 3. Recent goals scored / conceded (the returned tuple expands to columns 0 and 1)
home_goals_stats = train_df.apply(_goal_stats('home_team'), axis=1, result_type='expand')
train_df['home_goals_per_game'] = home_goals_stats[0]
train_df['home_goals_against_per_game'] = home_goals_stats[1]

away_goals_stats = train_df.apply(_goal_stats('away_team'), axis=1, result_type='expand')
train_df['away_goals_per_game'] = away_goals_stats[0]
train_df['away_goals_against_per_game'] = away_goals_stats[1]

# 4. Contribution per position group: one dict per match, unpacked into columns
position_groups = ('forward', 'midfielder', 'defender', 'goalkeeper')
home_pos_contrib = train_df['home_lineup'].apply(
    lambda lineup: calculate_position_based_contrib(lineup, contrib_df)
)
away_pos_contrib = train_df['away_lineup'].apply(
    lambda lineup: calculate_position_based_contrib(lineup, contrib_df)
)
for side, per_match_scores in (('home', home_pos_contrib), ('away', away_pos_contrib)):
    for pos in position_groups:
        train_df[f'{side}_{pos}_contrib'] = [scores[f'{pos}_contrib'] for scores in per_match_scores]

# 5. Attack / defence balance: plain average of the two groups on each side
for side in ('home', 'away'):
    train_df[f'{side}_attack_strength'] = (
        train_df[f'{side}_forward_contrib'] + train_df[f'{side}_midfielder_contrib']
    ) / 2
    train_df[f'{side}_defense_strength'] = (
        train_df[f'{side}_defender_contrib'] + train_df[f'{side}_goalkeeper_contrib']
    ) / 2

# 6. Combined advantage indicators (home value minus away value)
train_df['attack_vs_defense'] = train_df['home_attack_strength'] - train_df['away_defense_strength']
train_df['defense_vs_attack'] = train_df['home_defense_strength'] - train_df['away_attack_strength']
train_df['goals_diff'] = train_df['home_goals_per_game'] - train_df['away_goals_against_per_game']

# ----------------------------
# Feature list (25 features in total)
# ----------------------------
advanced_feature_columns = [
    # Features from the previous model (7)
    'home_avg_contrib', 'away_avg_contrib', 'contrib_diff',
    'form_diff', 'home_advantage', 'home_recent_form', 'away_recent_form',

    # Head-to-head (1)
    'h2h_advantage',

    # Venue-specific form (2)
    'home_home_form', 'away_away_form',

    # Goals scored / conceded (4)
    'home_goals_per_game', 'home_goals_against_per_game',
    'away_goals_per_game', 'away_goals_against_per_game',

    # Contribution per position group (8)
    'home_forward_contrib', 'home_midfielder_contrib', 'home_defender_contrib', 'home_goalkeeper_contrib',
    'away_forward_contrib', 'away_midfielder_contrib', 'away_defender_contrib', 'away_goalkeeper_contrib',

    # Combined indicators (3)
    'attack_vs_defense', 'defense_vs_attack', 'goals_diff',
]

print(f"총 특성 개수: {len(advanced_feature_columns)}개")

# ----------------------------
# Train the advanced models
# ----------------------------
X_advanced = train_df[advanced_feature_columns].fillna(0)
y = train_df['result']

X_train, X_test, y_train, y_test = train_test_split(X_advanced, y, test_size=0.2, random_state=42)

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Candidate models to compare.
models = {
    'RandomForest': RandomForestClassifier(
        n_estimators=300,
        max_depth=12,
        min_samples_split=3,
        min_samples_leaf=2,
        random_state=42
    ),
    'GradientBoosting': GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        random_state=42
    )
}

best_model = None
# Start below any possible accuracy so the first candidate is always kept,
# even if every candidate scores 0.
best_score = -1.0

# The loop variable is deliberately not called `model`: that name holds the
# seven-feature model from the previous cell, which the printed usage example
# passes to predict_match_improved. Overwriting it with a 25-feature
# estimator would break that call.
# NOTE(review): the winner is chosen on the same test split that is used to
# report accuracy, so the reported best score is optimistic.
for name, candidate in models.items():
    candidate.fit(X_train, y_train)
    score = accuracy_score(y_test, candidate.predict(X_test))
    print(f"{name} 정확도: {score:.3f}")

    if score > best_score:
        best_score = score
        best_model = candidate

print(f"\n최고 성능: {best_score:.3f}")

# Feature importances of the best model, top 10.
if hasattr(best_model, 'feature_importances_'):
    feature_importance = pd.DataFrame({
        'feature': advanced_feature_columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)

    print("\n=== TOP 10 중요 특성 ===")
    print(feature_importance.head(10))

고급 특성 계산 중...
총 특성 개수: 25개
RandomForest 정확도: 0.574
GradientBoosting 정확도: 0.500

최고 성능: 0.574

=== TOP 10 중요 특성 ===
                    feature  importance
14     home_forward_contrib    0.118922
16    home_defender_contrib    0.078755
19  away_midfielder_contrib    0.059671
2              contrib_diff    0.059020
0          home_avg_contrib    0.055301
20    away_defender_contrib    0.055020
21  away_goalkeeper_contrib    0.054393
18     away_forward_contrib    0.053839
1          away_avg_contrib    0.053758
15  home_midfielder_contrib    0.053000


In [None]:
# (scratch cell: contained only the stray keystrokes "ㅌㅌㅌ", which are not valid Python)