In [1]:
import pandas as pd

# Load CSV
df = pd.read_csv(r"C:\Users\sahil\OneDrive\Desktop\cPP\capstone\fifa_players.csv")


# Remove the specified columns
cols_to_remove = ['value_euro', 'wage_euro', 'body_type', 'national_team', 'release_clause_euro','birth_date','international_reputation(1-5)','full_name','national_jersey_number']
df = df.drop(columns=cols_to_remove, errors='ignore')

# Save the new CSV
df.to_csv("cleaned_file.csv", index=False)


In [2]:
df.head()

Unnamed: 0,name,age,height_cm,weight_kgs,positions,nationality,overall_rating,potential,preferred_foot,weak_foot(1-5),...,long_shots,aggression,interceptions,positioning,vision,penalties,composure,marking,standing_tackle,sliding_tackle
0,L. Messi,31,170.18,72.1,"CF,RW,ST",Argentina,94,94,Left,4,...,94,48,22,94,94,75,96,33,28,26
1,C. Eriksen,27,154.94,76.2,"CAM,RM,CM",Denmark,88,89,Right,5,...,89,46,56,84,91,67,88,59,57,22
2,P. Pogba,25,190.5,83.9,"CM,CAM",France,88,91,Right,4,...,82,78,64,82,88,82,87,63,67,67
3,L. Insigne,27,162.56,59.0,"LW,ST",Italy,88,88,Right,4,...,84,34,26,83,87,61,83,51,24,22
4,K. Koulibaly,27,187.96,88.9,CB,Senegal,88,91,Right,3,...,15,87,88,24,49,33,80,91,88,87


In [6]:
# best_11_ai.py
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import random
import warnings
warnings.filterwarnings('ignore')

# ---------- CONFIG ----------
CSV_IN = r"C:\Users\sahil\OneDrive\Desktop\cPP\capstone\cleaned_file.csv"
CSV_OUT = r"C:\Users\sahil\OneDrive\Desktop\cPP\capstone\best_11_ai_by_country.csv"
RANDOM_SEED = 42
N_SYNTHETIC_TEAMS = 6000
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# ---------- FORMATIONS ----------
FORMATIONS = {
    "4-3-3": {"GK":1, "DF":4, "MF":3, "FW":3},
    "4-4-2": {"GK":1, "DF":4, "MF":4, "FW":2},
    "3-5-2": {"GK":1, "DF":3, "MF":5, "FW":2},
    "4-2-3-1": {"GK":1, "DF":4, "MF":5, "FW":1},
    "4-1-4-1": {"GK":1, "DF":4, "MF":5, "FW":1},
    "5-3-2": {"GK":1, "DF":5, "MF":3, "FW":2},
    "3-4-3": {"GK":1, "DF":3, "MF":4, "FW":3},
    "4-5-1": {"GK":1, "DF":4, "MF":5, "FW":1}
}

# ---------- ROLE HELPERS ----------
def normalize_position(pos):
    if not isinstance(pos, str): return ''
    return pos.split(',')[0].strip().upper()

def get_role(pos):
    if 'GK' in pos:
        return 'GK'
    if any(p in pos for p in ['CB','LB','RB','LWB','RWB']):
        return 'DF'
    if any(p in pos for p in ['CDM','CM','CAM','LM','RM']):
        return 'MF'
    if any(p in pos for p in ['ST','CF','LW','RW']):
        return 'FW'
    return 'OTHER'

# ---------- TEAM FEATURE EXTRACTION ----------
def compute_team_features(team_df):
    features = {}
    features['team_size'] = len(team_df)
    features['total_overall'] = team_df['overall_rating'].sum()
    features['mean_overall'] = team_df['overall_rating'].mean()
    features['std_overall'] = team_df['overall_rating'].std(ddof=0) if len(team_df)>1 else 0.0
    features['total_potential'] = team_df.get('potential', team_df['overall_rating']).sum()

    for role in ['GK','DF','MF','FW']:
        subset = team_df[team_df['role']==role]
        features[f'{role}_count'] = len(subset)
        features[f'{role}_mean_overall'] = subset['overall_rating'].mean() if len(subset)>0 else 0.0

    features['avg_height'] = team_df['height_cm'].mean() if 'height_cm' in team_df.columns else 0.0
    features['avg_weight'] = team_df['weight_kgs'].mean() if 'weight_kgs' in team_df.columns else 0.0

    left = sum(team_df['preferred_foot'].str.lower().fillna('').str.contains('left'))
    right = sum(team_df['preferred_foot'].str.lower().fillna('').str.contains('right'))
    features['left_count'] = left
    features['right_count'] = right
    features['foot_balance'] = 1 - abs(left-right)/11.0

    def pos_fit_score(row):
        pos = row['positions']
        role = row['role']
        if role == 'GK' and 'GK' in pos: return 1.0
        if role == 'DF' and any(p in pos for p in ['CB','LB','RB','LWB','RWB']): return 1.0
        if role == 'MF' and any(p in pos for p in ['CDM','CM','CAM','LM','RM']): return 1.0
        if role == 'FW' and any(p in pos for p in ['ST','CF','LW','RW']): return 1.0
        return 0.0

    features['pos_fit_sum'] = team_df.apply(pos_fit_score, axis=1).sum()
    features['pos_fit_frac'] = features['pos_fit_sum'] / 11.0
    return features

# ---------- HEURISTIC TEAM SCORE ----------
def heuristic_team_score(team_df):
    total_overall = team_df['overall_rating'].sum()
    pos_fit = compute_team_features(team_df)['pos_fit_sum']
    left = sum(team_df['preferred_foot'].str.lower().fillna('').str.contains('left'))
    right = sum(team_df['preferred_foot'].str.lower().fillna('').str.contains('right'))
    foot_balance = 1 - abs(left-right)/11.0
    potential_sum = team_df.get('potential', team_df['overall_rating']).sum()
    score = total_overall + 0.8 * pos_fit + 20.0 * foot_balance + 0.05 * potential_sum
    return score

# ---------- GREEDY TEAM BUILDER (with fallback + edge safety) ----------
def build_greedy_team_for_formation(pool_df, formation_roles):
    parts = []
    used_names = set()

    fallback_roles = {
        "GK": [],
        "DF": ["MF"],      # mids can drop back
        "MF": ["DF", "FW"],# can use defenders or forwards
        "FW": ["MF"]       # mids can push up
    }

    for role, count in formation_roles.items():
        candidates = pool_df[(pool_df['role'] == role) & (~pool_df['name'].isin(used_names))]

        if len(candidates) < count:
            backups = []
            for fb in fallback_roles.get(role, []):
                extra = pool_df[(pool_df['role'] == fb) & (~pool_df['name'].isin(used_names))]
                if not extra.empty:
                    extra = extra.copy()
                    extra['overall_rating'] = extra['overall_rating'] * 0.9
                    backups.append(extra)
            if backups:
                candidates = pd.concat([candidates] + backups)

        selected = candidates.nlargest(count, 'overall_rating')
        if len(selected) < count:
            missing = count - len(selected)
            dummy = pd.DataFrame({
                'name': [f"No suitable player ({role})"] * missing,
                'role': [role] * missing,
                'overall_rating': [0] * missing,
                'positions': [role] * missing
            })
            selected = pd.concat([selected, dummy])

        parts.append(selected)
        used_names.update(selected['name'].tolist())

    team = pd.concat(parts).drop_duplicates(subset=['name'])
    return team

# ---------- EDGE CASE CHECK ----------
def has_minimum_role_diversity(country_df):
    """Detect edge cases like all-GK or missing position variety."""
    role_counts = country_df['role'].value_counts().to_dict()
    if 'GK' not in role_counts:
        return False
    non_gk_roles = [r for r in ['DF', 'MF', 'FW'] if r in role_counts]
    return len(non_gk_roles) >= 2  # at least 2 other types besides GK

# ---------- MAIN ----------
def main():
    df = pd.read_csv(CSV_IN)
    df['positions'] = df['positions'].astype(str).fillna('').apply(normalize_position)
    df['role'] = df['positions'].apply(get_role)

    required_cols = ['name','nationality','positions','role','overall_rating',
                     'height_cm','weight_kgs','preferred_foot','potential']
    for c in required_cols:
        if c not in df.columns:
            if c == 'potential':
                df[c] = df.get('overall_rating', pd.Series([50]*len(df)))
            else:
                df[c] = ''

    df = df[pd.to_numeric(df['overall_rating'], errors='coerce').notnull()]
    df['overall_rating'] = df['overall_rating'].astype(float)
    if 'height_cm' in df.columns:
        df['height_cm'] = pd.to_numeric(df['height_cm'], errors='coerce').fillna(df['height_cm'].mean())
    if 'weight_kgs' in df.columns:
        df['weight_kgs'] = pd.to_numeric(df['weight_kgs'], errors='coerce').fillna(df['weight_kgs'].mean())
    df['potential'] = pd.to_numeric(df['potential'], errors='coerce').fillna(df['overall_rating'])

    # Eligible countries
    eligible_countries = []
    for country, sub in df.groupby('nationality'):
        if len(sub) >= 11 and 'GK' in sub['role'].values:
            eligible_countries.append(country)
    print(f"Eligible countries found: {len(eligible_countries)}")

    # ---------- Synthetic training ----------
    X_rows = []
    y = []
    players_pool = df.copy()
    role_groups = {r: players_pool[players_pool['role']==r] for r in ['GK','DF','MF','FW']}
    formation_names = list(FORMATIONS.keys())

    for i in range(N_SYNTHETIC_TEAMS):
        form_name = random.choice(formation_names)
        roles = FORMATIONS[form_name]
        feasible = all(len(role_groups.get(r, pd.DataFrame())) >= cnt for r,cnt in roles.items())
        if not feasible:
            continue
        team_parts = []
        for r,cnt in roles.items():
            sampled = role_groups[r].sample(n=cnt, replace=False)
            team_parts.append(sampled)
        team_df = pd.concat(team_parts)
        feats = compute_team_features(team_df)
        X_rows.append(feats)
        y.append(heuristic_team_score(team_df))

    X = pd.DataFrame(X_rows)
    y = np.array(y)
    print("Synthetic teams generated:", len(y))

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=RANDOM_SEED)
    model = RandomForestRegressor(n_estimators=200, random_state=RANDOM_SEED, n_jobs=-1)
    model.fit(X_train.fillna(0), y_train)
    y_pred = model.predict(X_val.fillna(0))
    print("Model R2 on validation:", round(r2_score(y_val, y_pred), 4))

    # ---------- Country-wise best-11 prediction ----------
    results = []
    for country in eligible_countries:
        pool = df[df['nationality'] == country].reset_index(drop=True)

        # Edge case detection
        if not has_minimum_role_diversity(pool):
            print(f"[{country}] ⚠️ Skipped — insufficient positional diversity (edge case).")
            continue

        best_score = -1e9
        best_team = None
        best_formation = None

        for form_name, roles in FORMATIONS.items():
            team = build_greedy_team_for_formation(pool, roles)
            if team is None or team.empty:
                continue
            feats = compute_team_features(team)
            pred_score = model.predict(pd.DataFrame([feats]).fillna(0))[0]
            if pred_score > best_score:
                best_score = pred_score
                best_team = team.copy()
                best_formation = form_name

        if best_team is not None:
            best_team = best_team.copy()
            best_team['chosen_formation'] = best_formation
            best_team['predicted_score'] = best_score
            best_team['country'] = country
            results.append(best_team)
            print(f"[{country}] chosen formation: {best_formation} predicted_score: {round(best_score,2)}")
        else:
            print(f"[{country}] no feasible formation found (skipped)")

    if len(results) == 0:
        print("No teams produced. Check your dataset and roles mapping.")
        return

    final_df = pd.concat(results, ignore_index=True)
    cols_order = ['country','chosen_formation','name','positions','role',
                  'overall_rating','potential','preferred_foot','height_cm','weight_kgs','predicted_score']
    existing_cols = [c for c in cols_order if c in final_df.columns]
    other_cols = [c for c in final_df.columns if c not in existing_cols]
    final_df = final_df[existing_cols + other_cols]

    final_df.to_csv(CSV_OUT, index=False)
    print("✅ Saved best-11 (AI-chosen) to:", CSV_OUT)

if __name__ == "__main__":
    main()


Eligible countries found: 82
Synthetic teams generated: 6000
Model R2 on validation: 0.9991
[Albania] chosen formation: 4-2-3-1 predicted_score: 863.63
[Algeria] chosen formation: 4-2-3-1 predicted_score: 874.43
[Argentina] chosen formation: 4-5-1 predicted_score: 874.06
[Australia] chosen formation: 4-3-3 predicted_score: 873.18
[Austria] chosen formation: 4-2-3-1 predicted_score: 874.1
[Belgium] chosen formation: 4-3-3 predicted_score: 873.37
[Benin] chosen formation: 4-3-3 predicted_score: 805.27
[Bolivia] chosen formation: 3-5-2 predicted_score: 828.73
[Bosnia Herzegovina] chosen formation: 4-3-3 predicted_score: 873.37
[Brazil] chosen formation: 4-4-2 predicted_score: 874.0
[Bulgaria] chosen formation: 4-1-4-1 predicted_score: 832.09
[Burkina Faso] chosen formation: 3-4-3 predicted_score: 835.54
[Cameroon] chosen formation: 4-2-3-1 predicted_score: 872.31
[Canada] chosen formation: 3-4-3 predicted_score: 850.03
[Chile] chosen formation: 4-2-3-1 predicted_score: 873.39
[China PR] c

In [None]:
# import pandas as pd

# # Load dataset
# df = pd.read_csv(r"C:\Users\Manish.Khurana\Downloads\formation\cleaned_file.csv")

# # Clean up position text
# df['positions'] = df['positions'].astype(str).apply(lambda x: x.split(',')[0].strip().upper())

# # Classify each player into role
# def get_role(pos):
#     if 'GK' in pos:
#         return 'GK'
#     elif any(p in pos for p in ['CB', 'LB', 'RB', 'LWB', 'RWB']):
#         return 'DF'
#     elif any(p in pos for p in ['CDM', 'CM', 'CAM', 'LM', 'RM']):
#         return 'MF'
#     elif any(p in pos for p in ['ST', 'CF', 'LW', 'RW']):
#         return 'FW'
#     else:
#         return 'Other'

# df['role'] = df['positions'].apply(get_role)

# # Define formations and required roles
# formations = {
#     "4-3-3": {"GK": 1, "DF": 4, "MF": 3, "FW": 3},
#     "4-4-2": {"GK": 1, "DF": 4, "MF": 4, "FW": 2},
#     "3-5-2": {"GK": 1, "DF": 3, "MF": 5, "FW": 2},
#     "4-2-3-1": {"GK": 1, "DF": 4, "MF": 5, "FW": 1},
#     "4-1-4-1": {"GK": 1, "DF": 4, "MF": 5, "FW": 1},
#     "5-3-2": {"GK": 1, "DF": 5, "MF": 3, "FW": 2},
#     "3-4-3": {"GK": 1, "DF": 3, "MF": 4, "FW": 3}
# }

# # Filter countries with enough players and a GK
# eligible_countries = []
# for country, sub in df.groupby('nationality'):
#     if len(sub) >= 11 and 'GK' in sub['role'].values:
#         eligible_countries.append(country)

# best_xi_list = []

# for country in eligible_countries:
#     sub = df[df['nationality'] == country]
#     best_team = None
#     best_score = -1
#     best_formation = None

#     # Try each formation
#     for form_name, roles in formations.items():
#         team_parts = []

#         valid = True
#         total_score = 0
        
#         for role, count in roles.items():
#             role_players = sub[sub['role'] == role].nlargest(count, 'overall_rating')
#             if len(role_players) < count:
#                 valid = False
#                 break
#             team_parts.append(role_players)
#             total_score += role_players['overall_rating'].sum()
        
#         if valid and total_score > best_score:
#             best_score = total_score
#             best_team = pd.concat(team_parts)
#             best_formation = form_name

#     # Save best formation for the country
#     if best_team is not None:
#         best_team['country'] = country
#         best_team['best_formation'] = best_formation
#         best_team['team_rating_sum'] = best_score
#         best_xi_list.append(best_team)

# # Combine all countries’ best XIs
# final_best_xi = pd.concat(best_xi_list)

# # Save results
# final_best_xi.to_csv(r"C:\Users\Manish.Khurana\Downloads\formation\best_11_optimal_by_country.csv", index=False)

# print("✅ Optimal Best XI for each country saved as 'best_11_optimal_by_country.csv'")


✅ Optimal Best XI for each country saved as 'best_11_optimal_by_country.csv'
