In [None]:
pip install pandas numpy scikit-learn catboost joblib tqdm requests pyyaml streamlit pulp fastapi uvicorn nest_asyncio pyngrok

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pulp
  Downloading pulp-3.3.0-py3-none-any.whl.metadata (8.4 kB)
Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pulp-3.3.0-py3-none-any.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m24.3 MB/s[0m eta [36m0:

In [None]:
import json
import pandas as pd
import requests
from io import BytesIO
from zipfile import ZipFile
from tempfile import TemporaryDirectory
import yaml
import numpy as np
from datetime import datetime
from collections import defaultdict
import os

def calculate_batting_points(player_stats, match_type):
    """Return the batting fantasy points for one player in one match.

    Args:
        player_stats: Mapping of per-match stat names to numbers. Missing
            keys are read as 0.
        match_type: 't20' or 'odi' enable the format-specific bonuses; any
            other value yields only the base run/boundary points.

    Returns:
        Base points (1 per run, +1 per four, +2 per six) plus milestone
        bonus, duck penalty and strike-rate adjustment for the format.
    """
    read = player_stats.get
    runs = read("Total Runs Scored", 0)
    faced = read("Balls Faced", 0)
    total = 0
    total += runs + read("Fours", 0) + 2 * read("Sixes", 0)

    if match_type not in ('t20', 'odi'):
        return total

    # An innings counts as a dismissal when it is flagged neither
    # "Not Played" nor "not out".
    out_in_first = (read("How Out Inning 1 (Not Played)", 0) == 0
                    and read("How Out Inning 1 (not out)", 0) == 0)
    out_in_second = (read("How Out Inning 2 (Not Played)", 0) == 0
                     and read("How Out Inning 2 (not out)", 0) == 0)
    is_duck = runs == 0 and (out_in_first or out_in_second) and faced > 0

    if match_type == 't20':
        milestones = ((100, 16), (50, 8), (30, 4))
        duck_penalty = 2
        min_balls = 10
        # (fast, quick, brisk) are the lower edges of the reward bands;
        # (slow, slower) are the lower edges of the two upper penalty bands,
        # and `ceiling` is the exclusive upper edge of the mildest penalty.
        fast, quick, brisk = 170, 150, 130
        ceiling, slow, slower = 70, 60, 50
    else:
        milestones = ((100, 8), (50, 4))
        duck_penalty = 3
        min_balls = 20
        fast, quick, brisk = 140, 120, 100
        ceiling, slow, slower = 50, 40, 30

    # Only the highest milestone reached is rewarded.
    for threshold, bonus in milestones:
        if runs >= threshold:
            total += bonus
            break

    if is_duck:
        total -= duck_penalty

    if faced >= min_balls:
        rate = read("Avg Batting S/R Per Inning", 0)
        if rate > fast:
            total += 6
        elif rate > quick:
            total += 4
        elif rate >= brisk:
            total += 2
        elif slow <= rate < ceiling:
            total -= 2
        elif slower <= rate < slow:
            total -= 4
        elif rate < slower:
            total -= 6
    return total

def calculate_bowling_points(player_stats, match_type):
    """Return the bowling fantasy points for one player in one match.

    Args:
        player_stats: Mapping of per-match stat names to numbers. Missing
            keys are read as 0.
        match_type: 't20' or 'odi' enable haul, maiden and economy-rate
            adjustments; any other value yields only wicket points.

    Returns:
        Wicket points (25 each for 't20'/'odi', 16 otherwise), +8 per
        bowled/LBW dismissal, plus the format-specific adjustments.
    """
    points = 0
    wickets = player_stats.get("Wickets", 0)
    overs_bowled = player_stats.get("Overs Bowled", 0)
    economy_rate = player_stats.get("Avg Economy Rate per inning", 0)
    bowled_or_lbw = player_stats.get("Bowled", 0) + player_stats.get("LBW", 0)

    points += wickets * (25 if match_type in ('t20', 'odi') else 16)
    points += bowled_or_lbw * 8

    if match_type == 't20':
        if wickets >= 5:
            points += 16
        elif wickets == 4:
            points += 8
        elif wickets == 3:
            points += 4
        points += player_stats.get("Maiden Overs", 0) * 12
        if overs_bowled >= 2:
            # The economy rate is a computed float, so the bands must be
            # contiguous: the previous "5-5.99" and "11.01-12" bounds left
            # rates such as 5.995 or 11.005 unscored.
            if economy_rate < 5:
                points += 6
            elif economy_rate < 6:
                points += 4
            elif economy_rate <= 7:
                points += 2
            elif 10 <= economy_rate <= 11:
                points -= 2
            elif 11 < economy_rate <= 12:
                points -= 4
            elif economy_rate > 12:
                points -= 6
    elif match_type == 'odi':
        if wickets >= 5:
            points += 8
        elif wickets == 4:
            points += 4
        points += player_stats.get("Maiden Overs", 0) * 4
        if overs_bowled >= 5:
            if economy_rate < 2.5:
                points += 6
            elif economy_rate < 3.5:
                points += 4
            elif economy_rate <= 4.5:
                points += 2
            elif 7 <= economy_rate <= 8:
                points -= 2
            elif 8 < economy_rate <= 9:
                points -= 4
            elif economy_rate > 9:
                points -= 6
    return points

def calculate_fielding_points(player_stats, match_type):
    """Return the fielding fantasy points for one player in one match.

    Catches and run-outs are worth 8 each and stumpings 12. Three or more
    catches earn a further 4 points in 't20' and 'odi' matches only.
    Missing stats are read as 0.
    """
    catches = player_stats.get("Catches Taken", 0)
    stumpings = player_stats.get("Stumped Outs Made", 0)
    run_outs = player_stats.get("Run Outs Made", 0)

    total = 0
    total += catches * 8 + stumpings * 12 + run_outs * 8

    limited_overs = match_type in ('t20', 'odi')
    if limited_overs and catches >= 3:
        total += 4
    return total

def calculate_fantasy_points(player_stats, match_type):
    """Score one player's match for the limited-overs formats.

    Returns a dict with the batting, bowling and fielding components and
    `total_points`, which is their sum plus a flat 4-point addition.
    """
    scorers = (
        ("batting_points", calculate_batting_points),
        ("bowling_points", calculate_bowling_points),
        ("fielding_points", calculate_fielding_points),
    )
    parts = {label: scorer(player_stats, match_type) for label, scorer in scorers}
    overall = parts["batting_points"] + parts["bowling_points"] + parts["fielding_points"] + 4
    return {"total_points": overall, **parts}

def calculate_fantasy_points_test(player_stats):
    """Score one player's Test match (up to four innings).

    Args:
        player_stats: Mapping of per-match stat names to numbers, including
            the per-innings keys "Innings {i} Runs", "Innings {i} Balls Faced",
            "Innings {i} Wickets" and the "How Out Inning {i} (...)" flags.
            Missing keys are read as 0.

    Returns:
        Dict with `batting_points`, `bowling_points`, `fielding_points` and
        `total_points`. The total is the three components plus a flat 4.

    The components now carry the per-innings bonuses and the duck penalty.
    Previously they were recomputed with the limited-overs helpers, which
    have no Test branch, so the breakdown did not add up to `total_points`.
    """
    runs_scored = player_stats.get("Total Runs Scored", 0)
    boundaries = player_stats.get("Fours", 0)
    sixes = player_stats.get("Sixes", 0)

    # Batting: base points plus a per-innings milestone bonus / duck penalty.
    batting_points = runs_scored + boundaries + 2 * sixes
    for i in range(1, 5):
        innings_runs = player_stats.get(f"Innings {i} Runs", 0)
        innings_balls = player_stats.get(f"Innings {i} Balls Faced", 0)
        not_out = player_stats.get(f"How Out Inning {i} (not out)", 0)
        not_played = player_stats.get(f"How Out Inning {i} (Not Played)", 0)

        if innings_runs >= 100:
            batting_points += 8
        elif innings_runs >= 50:
            batting_points += 4
        # A duck needs a dismissal after facing at least one ball.
        if innings_runs == 0 and not_played == 0 and not_out == 0 and innings_balls > 0:
            batting_points -= 4

    # Bowling: wicket points plus a per-innings haul bonus.
    wickets = player_stats.get("Wickets", 0)
    bowled_or_lbw = player_stats.get("Bowled", 0) + player_stats.get("LBW", 0)
    bowling_points = wickets * 16 + bowled_or_lbw * 8
    for i in range(1, 5):
        innings_wickets = player_stats.get(f"Innings {i} Wickets", 0)
        if innings_wickets >= 5:
            bowling_points += 8
        elif innings_wickets == 4:
            bowling_points += 4

    fielding_points = calculate_fielding_points(player_stats, 'test')

    return {
        "total_points": batting_points + bowling_points + fielding_points + 4,
        "batting_points": batting_points,
        "bowling_points": bowling_points,
        "fielding_points": fielding_points
    }

def _match_sort_key(entry):
    """Sort key for a (filename, match) pair: the first match date, or datetime.min."""
    _, match = entry
    try:
        dates = match.get('info', {}).get('dates') or []
        return datetime.strptime(str(dates[0]), '%Y-%m-%d')
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # Undated or malformed matches sort first instead of aborting the
        # whole run; process_match_data() skips matches without a date.
        return datetime.min


def download_and_extract_data(formats=('t20s', 'odis', 'tests')):
    """Download Cricsheet YAML archives and load every match they contain.

    Args:
        formats: Iterable of archive names; each is fetched from
            https://cricsheet.org/downloads/{name}.zip.

    Returns:
        List of (filename, match_dict) tuples sorted by first match date.
        Formats that fail to download or unzip, files that fail to parse and
        documents that are not YAML mappings are reported on stdout and
        skipped.
    """
    # Imported locally so the notebook's import cell does not need to change.
    from zipfile import BadZipFile

    match_data = []
    for fmt in formats:
        url = f'https://cricsheet.org/downloads/{fmt}.zip'
        print(f"Downloading {fmt}.zip from {url}...")
        try:
            resp = requests.get(url, timeout=10)
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f'Failed to download {fmt}: {e}')
            continue
        print(f"Download complete for {fmt}.")

        with TemporaryDirectory() as tmpdir:
            try:
                with ZipFile(BytesIO(resp.content)) as z:
                    z.extractall(tmpdir)
            except (BadZipFile, OSError) as e:
                print(f"Error processing {fmt}: {e}")
                continue
            print(f"Extraction complete for {fmt}.")

            # Sorted so that matches sharing a date keep a reproducible order.
            for filename in sorted(os.listdir(tmpdir)):
                if not filename.endswith('.yaml'):
                    continue
                print(f"Loading match file: {filename}")
                # One unreadable file must not discard the rest of the archive.
                try:
                    with open(os.path.join(tmpdir, filename), 'r', encoding='utf-8') as f:
                        match = yaml.safe_load(f)
                except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
                    print(f"Error processing {filename}: {e}")
                    continue
                if not isinstance(match, dict):
                    print(f"Skipping {filename}: not a YAML mapping")
                    continue
                match_data.append((filename, match))

    match_data.sort(key=_match_sort_key)
    return match_data

def _as_wicket_list(raw_wickets):
    """Normalise a delivery's `wicket` entry (dict, list or missing) to a list of dicts."""
    if isinstance(raw_wickets, dict):
        return [raw_wickets]
    if isinstance(raw_wickets, list):
        return [w for w in raw_wickets if isinstance(w, dict)]
    return []


def _fielder_name(fielder):
    """Return a fielder's name whether the entry is a plain string or a {'name': ...} dict."""
    if isinstance(fielder, dict):
        return fielder.get('name')
    return fielder


def process_match_data(match_data):
    """Aggregate ball-by-ball match data into one summary row per player.

    Args:
        match_data: Iterable of (filename, match_dict) pairs as produced by
            download_and_extract_data(), ideally in chronological order (the
            rolling 3/10-match windows keep the most recently processed
            matches).

    Returns:
        pandas.DataFrame with one row per player (career totals, derived
        rates, fantasy-point totals, per-opposition points, rolling averages,
        role and team). The same frame is written to 'player_summary.csv'.

    Fixes relative to the previous implementation:
      * a single dismissal stored as a dict was counted via len(dict), which
        is the number of keys rather than one wicket;
      * run-outs and retirements are no longer credited to the bowler;
      * "Innings Batted"/"Innings Bowled" are counted per innings, not once
        per match;
      * wides and no-balls no longer count as legal balls bowled, wides no
        longer count as balls faced, and "Overs Bowled" is derived from the
        integer ball count (summing 1/6 twelve times can fall short of 2.0);
      * every scored match is recorded so 'Variance' is computed rather than
        always being 0;
      * matches without exactly two teams no longer raise IndexError;
      * the reported 'Team' is deterministic.
    """
    player_summary = defaultdict(lambda: {
        'matches': [],
        'total_batting_points': 0,
        'total_bowling_points': 0,
        'total_fielding_points': 0,
        'total_points': 0,
        'total_points_opposition': defaultdict(int),
        'batting_points_3': [], 'bowling_points_3': [], 'fielding_points_3': [],
        'batting_points_10': [], 'bowling_points_10': [], 'fielding_points_10': [],
        'team': set(),
        'role': None,
        'stats': defaultdict(list),
        'match_dates': []
    })

    raw_cols = [
        "Games", "Won", "Drawn", "Innings Batted", "Runs", "Singles", "Fours", "Sixes",
        "Dot Balls", "Balls Faced", "Outs", "Bowled Outs", "LBW Outs", "Hitwicket Outs",
        "Caught Outs", "Stumped Outs", "Run Outs", "Caught and Bowled Outs",
        "Innings Bowled", "Runsgiven", "Singlesgiven", "Foursgiven", "Sixesgiven",
        "Wickets", "Balls Bowled", "Extras", "No Balls", "Wides", "Dot Balls Bowled",
        "Bowleds", "LBWs", "Hitwickets", "Caughts", "Stumpeds", "Caught and Bowleds",
        "Catches", "Runouts", "Stumpings", "Maiden Overs"
    ]

    # Dismissal kinds that are not credited to the bowler.
    non_bowler_dismissals = frozenset({
        'run out', 'retired hurt', 'retired out', 'retired not out',
        'obstructing the field', 'timed out', 'handled the ball',
    })
    skipped_deliveries = 0

    print("Processing match data...")
    for filename, match in match_data:
        if not isinstance(match, dict):
            continue
        match_info = match.get('info') or {}
        match_type = str(match_info.get('match_type') or '').lower()
        teams = match_info.get('teams') or []
        dates = match_info.get('dates') or [None]
        date = dates[0]
        if not date:
            continue
        try:
            date = datetime.strptime(str(date), '%Y-%m-%d')
        except ValueError:
            continue
        outcome = match_info.get('outcome') or {}
        winner = outcome.get('winner')

        players_team = {p: team for team, plist in (match_info.get('players') or {}).items() for p in plist}
        player_match_stats = defaultdict(lambda: defaultdict(float))
        player_match_stats_batted_innings = defaultdict(set)
        player_match_stats_bowled_innings = defaultdict(set)
        player_match_stats_dismissed_innings = defaultdict(set)
        # Insertion-ordered "set" of everyone seen in a delivery, so summary
        # rows keep a reproducible first-appearance order.
        match_players = {}

        for inning_dict in match.get('innings') or []:
            if not isinstance(inning_dict, dict) or not inning_dict:
                continue
            team_key = next(iter(inning_dict))
            if 'super over' in str(team_key).lower():
                continue
            try:
                inning_num = int(str(team_key).split()[0][0])
            except (IndexError, ValueError):
                continue
            inning = inning_dict[team_key]
            if not isinstance(inning, dict):
                continue

            # Fallback team inference from the innings' batting side.
            # NOTE(review): presumably needed because the Cricsheet YAML files
            # carry no info.players map — TODO confirm against the data. It
            # never overrides a team that info.players did provide.
            batting_team = inning.get('team')
            fielding_team = None
            if len(teams) == 2 and batting_team in teams:
                fielding_team = teams[1] if batting_team == teams[0] else teams[0]
            else:
                batting_team = None

            bowler_over_runs = defaultdict(int)
            bowler_over_legal = defaultdict(int)

            for delivery in inning.get('deliveries') or []:
                if not isinstance(delivery, dict):
                    continue
                for ball_str, details in delivery.items():
                    if not isinstance(details, dict):
                        continue
                    try:
                        over = int(float(ball_str))
                        batsman, bowler = details.get('batsman'), details.get('bowler')
                        non_striker = details.get('non_striker')
                        runs = details.get('runs') or {}
                        extras = details.get('extras') or {}
                        wickets = _as_wicket_list(details.get('wicket'))

                        batter_runs = runs.get('batsman', 0)
                        total_runs = runs.get('total', 0)
                        is_wide = 'wides' in extras
                        is_legal = not is_wide and 'noballs' not in extras

                        for player in (batsman, bowler, non_striker):
                            if player:
                                match_players[player] = None
                        if batting_team is not None:
                            for player in (batsman, non_striker):
                                if player:
                                    players_team.setdefault(player, batting_team)
                            if bowler:
                                players_team.setdefault(bowler, fielding_team)

                        if batsman:
                            stats = player_match_stats[batsman]
                            if inning_num not in player_match_stats_batted_innings[batsman]:
                                stats["Innings Batted"] += 1
                                player_match_stats_batted_innings[batsman].add(inning_num)
                            stats["Total Runs Scored"] += batter_runs
                            stats[f"Innings {inning_num} Runs"] += batter_runs
                            if not is_wide:
                                stats["Balls Faced"] += 1
                                stats[f"Innings {inning_num} Balls Faced"] += 1
                            stats["Runs"] += batter_runs
                            if batter_runs == 1: stats["Singles"] += 1
                            if batter_runs == 4: stats["Fours"] += 1
                            if batter_runs == 6: stats["Sixes"] += 1
                            if batter_runs == 0 and not extras: stats["Dot Balls"] += 1

                        if bowler:
                            stats = player_match_stats[bowler]
                            if inning_num not in player_match_stats_bowled_innings[bowler]:
                                stats["Innings Bowled"] += 1
                                player_match_stats_bowled_innings[bowler].add(inning_num)
                            stats["Runs Given"] += total_runs
                            stats["Runsgiven"] += total_runs
                            if is_legal:
                                stats["Balls Bowled"] += 1
                            if total_runs == 1: stats["Singlesgiven"] += 1
                            if total_runs == 4: stats["Foursgiven"] += 1
                            if total_runs == 6: stats["Sixesgiven"] += 1
                            if total_runs == 0 and not extras: stats["Dot Balls Bowled"] += 1
                            if extras:
                                stats["Extras"] += sum(extras.values())
                                if 'wides' in extras: stats["Wides"] += extras['wides']
                                if 'noballs' in extras: stats["No Balls"] += extras['noballs']
                            key = (bowler, inning_num, over)
                            bowler_over_runs[key] += total_runs
                            if is_legal:
                                bowler_over_legal[key] += 1

                        for wicket in wickets:
                            kind = wicket.get('kind', '')
                            player_out = wicket.get('player_out')
                            if player_out:
                                player_match_stats_dismissed_innings[player_out].add(inning_num)
                                stats = player_match_stats[player_out]
                                stats[f"How Out Inning {inning_num} ({kind})"] = 1
                                stats["Outs"] += 1
                                if kind == 'bowled': stats["Bowled Outs"] += 1
                                elif kind == 'lbw': stats["LBW Outs"] += 1
                                elif kind == 'hit wicket': stats["Hitwicket Outs"] += 1
                                elif kind == 'caught': stats["Caught Outs"] += 1
                                elif kind == 'stumped': stats["Stumped Outs"] += 1
                                elif kind == 'run out': stats["Run Outs"] += 1
                                elif kind == 'caught and bowled': stats["Caught and Bowled Outs"] += 1

                            for fielder in wicket.get('fielders') or []:
                                fielder_name = _fielder_name(fielder)
                                if not fielder_name:
                                    continue
                                if fielding_team is not None:
                                    players_team.setdefault(fielder_name, fielding_team)
                                f_stats = player_match_stats[fielder_name]
                                if kind == 'caught':
                                    f_stats["Catches Taken"] += 1
                                    f_stats["Catches"] += 1
                                elif kind == 'stumped':
                                    f_stats["Stumped Outs Made"] += 1
                                    f_stats["Stumpings"] += 1
                                elif kind == 'run out':
                                    f_stats["Run Outs Made"] += 1
                                    f_stats["Runouts"] += 1

                        if bowler and wickets:
                            credited = [w for w in wickets if w.get('kind', '') not in non_bowler_dismissals]
                            stats = player_match_stats[bowler]
                            stats["Wickets"] += len(credited)
                            stats[f"Innings {inning_num} Wickets"] += len(credited)
                            for wicket in credited:
                                kind = wicket.get('kind', '')
                                if kind == 'bowled':
                                    stats["Bowleds"] += 1
                                    stats["Bowled"] += 1
                                elif kind == 'lbw':
                                    stats["LBWs"] += 1
                                    stats["LBW"] += 1
                                elif kind == 'hit wicket': stats["Hitwickets"] += 1
                                elif kind == 'caught': stats["Caughts"] += 1
                                elif kind == 'stumped': stats["Stumpeds"] += 1
                                elif kind == 'caught and bowled': stats["Caught and Bowleds"] += 1
                    except (AttributeError, KeyError, TypeError, ValueError):
                        # Best effort: a malformed delivery is skipped, but it
                        # is counted so the loss is visible in the output.
                        skipped_deliveries += 1
                        continue

            # A maiden is an over of six legal balls with no runs conceded.
            for key in bowler_over_runs:
                if bowler_over_runs[key] == 0 and bowler_over_legal[key] == 6:
                    player_match_stats[key[0]]["Maiden Overs"] += 1

        max_innings = 4 if match_type == 'test' else 2
        for player, stats in player_match_stats.items():
            # Derived from the integer ball count so 12 legal balls is exactly 2.0 overs.
            stats["Overs Bowled"] = stats.get("Balls Bowled", 0) / 6
            batted_innings = player_match_stats_batted_innings[player]
            dismissed_innings = player_match_stats_dismissed_innings[player]
            for num in range(1, max_innings + 1):
                if num not in batted_innings: stats[f"How Out Inning {num} (Not Played)"] = 1
                elif num not in dismissed_innings: stats[f"How Out Inning {num} (not out)"] = 1

        # Register team and match date once per match (previously once per delivery).
        for player in list(match_players) + [p for p in player_match_stats if p not in match_players]:
            summary = player_summary[player]
            summary['team'].add(players_team.get(player) or 'Unknown')
            summary['match_dates'].append(date)

        for player, stats in player_match_stats.items():
            team = players_team.get(player)
            stats["Games"] += 1
            if winner and winner == team: stats["Won"] += 1
            elif not winner: stats["Drawn"] += 1

            stats["Avg Batting S/R Per Inning"] = (stats.get("Total Runs Scored", 0) / max(1, stats.get("Balls Faced", 0))) * 100
            stats["Avg Economy Rate per inning"] = stats.get("Runs Given", 0) / max(1, stats.get("Overs Bowled", 0))

            points = calculate_fantasy_points_test(stats) if match_type == 'test' else calculate_fantasy_points(stats, match_type)

            summary = player_summary[player]
            summary['total_batting_points'] += points['batting_points']
            summary['total_bowling_points'] += points['bowling_points']
            summary['total_fielding_points'] += points['fielding_points']
            summary['total_points'] += points['total_points']
            summary['matches'].append({
                'date': date,
                'match_type': match_type,
                'total_points': points['total_points'],
            })

            # Only attribute points to an opposition when it is unambiguous.
            if len(teams) == 2 and team in teams:
                opposition = teams[0] if team == teams[1] else teams[1]
                summary['total_points_opposition'][opposition] += points['total_points']

            for p_type in ['batting', 'bowling', 'fielding']:
                for N in [3, 10]:
                    key = f'{p_type}_points_{N}'
                    summary[key].append(points[f'{p_type}_points'])
                    if len(summary[key]) > N:
                        summary[key].pop(0)

            for col in raw_cols:
                if col in stats:
                    summary['stats'][col].append(stats[col])

    if skipped_deliveries:
        print(f"Skipped {skipped_deliveries} malformed deliveries.")

    final_player_summary = {}
    for player, data in player_summary.items():
        final_stats = {}
        for col in raw_cols:
            final_stats[col] = sum(data['stats'].get(col, [0]))

        final_stats['Win %'] = (final_stats['Won'] / max(1, final_stats['Games'])) * 100
        final_stats['Batting Avg'] = final_stats['Runs'] / max(1, final_stats['Outs'])
        final_stats['Batting S/R'] = (final_stats['Runs'] / max(1, final_stats['Balls Faced'])) * 100
        final_stats['Boundary %'] = ((final_stats['Fours'] + final_stats['Sixes']) / max(1, final_stats['Balls Faced'])) * 100
        final_stats['Dismissal Rate'] = final_stats['Outs'] / max(1, final_stats['Innings Batted'])
        final_stats['Bowling Avg'] = final_stats['Runsgiven'] / max(1, final_stats['Wickets'])
        final_stats['Bowling S/R'] = final_stats['Balls Bowled'] / max(1, final_stats['Wickets'])
        final_stats['Economy Rate'] = final_stats['Runsgiven'] / max(1, final_stats['Balls Bowled'] / 6)
        final_stats['Dot Ball %'] = (final_stats['Dot Balls'] / max(1, final_stats['Balls Faced'])) * 100
        final_stats['Dot Ball Bowled %'] = (final_stats['Dot Balls Bowled'] / max(1, final_stats['Balls Bowled'])) * 100
        final_stats['Total Batting Points'] = data['total_batting_points']
        final_stats['Total Bowling Points'] = data['total_bowling_points']
        final_stats['Total Fielding Points'] = data['total_fielding_points']
        final_stats['Total Fantasy Points'] = data['total_points']
        for opp, points in data['total_points_opposition'].items():
            final_stats[f'Points vs {opp}'] = points
        for p_type in ['batting', 'bowling', 'fielding']:
            for N in [3, 10]:
                final_stats[f'Avg {p_type.capitalize()} Points ({N})'] = np.mean(data[f'{p_type}_points_{N}']) if data[f'{p_type}_points_{N}'] else 0

        final_stats['Recent Activity'] = (final_stats['Games'] >= 5 and
                                        (final_stats['Avg Batting Points (3)'] + final_stats['Avg Bowling Points (3)'] +
                                         final_stats['Avg Fielding Points (3)']) > 0)

        final_stats['Form Score'] = np.mean(data['batting_points_3'] + data['bowling_points_3'] + data['fielding_points_3']) if data['batting_points_3'] else 0
        avg_points = data['total_points'] / max(1, final_stats['Games'])
        final_stats['Cost'] = min(10, max(5, int(5 + (avg_points / 50))))
        final_stats['Variance'] = np.var([match['total_points'] for match in data['matches']]) if data['matches'] else 0

        # Heuristic role assignment from career rates.
        role = 'allrounder'
        games = final_stats['Games']
        stumpings = final_stats['Stumpings']
        catches = final_stats['Catches']
        wickets = final_stats['Wickets']
        runs = final_stats['Runs']
        if stumpings > 0 or catches / max(1, games) > 0.5:
            role = 'wicketkeeper'
        elif wickets / max(1, games) > 0.5:
            role = 'bowler'
        elif runs / max(1, games) > 20:
            role = 'batsman'
        final_stats['Role'] = role

        # Sets have no stable order; pick deterministically and prefer a real team name.
        known_teams = sorted(t for t in data['team'] if t != 'Unknown')
        final_stats['Team'] = known_teams[0] if known_teams else 'Unknown'

        final_player_summary[player] = final_stats

    df = pd.DataFrame.from_dict(final_player_summary, orient='index').reset_index().rename(columns={'index': 'Player'})
    df = df.fillna(0)
    df.to_csv('player_summary.csv', index=False)
    print("Player summary saved to 'player_summary.csv'")
    return df

# Cell entry point: fetch the match archives, then build the per-player summary.
if __name__ == "__main__":
    # Downloads and parses the default formats ('t20s', 'odis', 'tests').
    match_data = download_and_extract_data()
    # Writes 'player_summary.csv'; the returned DataFrame is not kept here.
    process_match_data(match_data)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Loading match file: 682937.yaml
Loading match file: 1182652.yaml
Loading match file: 1422818.yaml
Loading match file: 951359.yaml
Loading match file: 1462911.yaml
Loading match file: 573020.yaml
Loading match file: 1418541.yaml
Loading match file: 1425638.yaml
Loading match file: 1343767.yaml
Loading match file: 682919.yaml
Loading match file: 1148056.yaml
Loading match file: 533291.yaml
Loading match file: 1432443.yaml
Loading match file: 1323296.yaml
Loading match file: 1464731.yaml
Loading match file: 1443543.yaml
Loading match file: 1275074.yaml
Loading match file: 566927.yaml
Loading match file: 1339604.yaml
Loading match file: 1384585.yaml
Loading match file: 567367.yaml
Loading match file: 518954.yaml
Loading match file: 1442736.yaml
Loading match file: 1310180.yaml
Loading match file: 542850.yaml
Loading match file: 1482821.yaml
Loading match file: 1335792.yaml
Loading match file: 1331386.yaml
Loading match file: 

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from catboost import CatBoostRegressor
import joblib
from tqdm import tqdm

def custom_mape(y_true, y_pred):
    """Mean absolute percentage error, ignoring samples whose true value is 0.

    Returns 0.0 when every true value is zero (or the input is empty);
    otherwise the mean of |true - pred| / true over the non-zero samples,
    as a percentage.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    usable = actual != 0
    if np.sum(usable) == 0:
        return 0.0
    relative_error = np.abs((actual[usable] - predicted[usable]) / actual[usable])
    return np.mean(relative_error) * 100

def custom_wmape(y_true, y_pred):
    """Weighted MAPE: total absolute error over total absolute truth, in percent.

    Returns 0.0 when the absolute true values sum to zero.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    denominator = np.sum(np.abs(actual))
    if denominator == 0:
        return 0.0
    total_error = np.sum(np.abs(actual - predicted))
    return total_error / denominator * 100

def extract_features(df):
    """Build the batting, bowling and fielding feature matrices.

    Each matrix holds the task's own features, every 'Points vs <team>'
    column, the label-encoded 'Role' and 'Games'; infinities and NaNs are
    replaced by 0.

    Side effect: `df['Role']` is rewritten in place so that
    'wicketkeeper-batsman' becomes 'wicketkeeper'.

    Returns:
        (X_bat, X_bowl, X_field, feature_lists, le) where `feature_lists`
        maps 'batting'/'bowling'/'fielding'/'opposition' to column names and
        `le` is the fitted LabelEncoder for roles.
    """
    print("Extracting features...")

    opposition_cols = [name for name in df.columns if name.startswith('Points vs ')]
    feature_lists = {
        'batting': ['Avg Batting Points (3)', 'Avg Batting Points (10)', 'Batting Avg',
                    'Batting S/R', 'Boundary %', 'Dismissal Rate'],
        'bowling': ['Avg Bowling Points (3)', 'Avg Bowling Points (10)', 'Bowling Avg',
                    'Bowling S/R', 'Economy Rate', 'Wickets'],
        'fielding': ['Avg Fielding Points (3)', 'Avg Fielding Points (10)', 'Win %',
                     'Catches', 'Runouts', 'Stumpings'],
        'opposition': opposition_cols,
    }

    df['Role'] = df['Role'].replace('wicketkeeper-batsman', 'wicketkeeper')
    le = LabelEncoder()
    role_codes = le.fit_transform(df['Role'])

    def build_matrix(task):
        # Task features, then opposition points, then the shared context columns.
        matrix = df[feature_lists[task] + opposition_cols + ['Role', 'Games']].copy()
        matrix['Role'] = role_codes
        matrix.replace([np.inf, -np.inf], np.nan, inplace=True)
        matrix.fillna(0, inplace=True)
        return matrix

    X_bat = build_matrix('batting')
    X_bowl = build_matrix('bowling')
    X_field = build_matrix('fielding')
    return X_bat, X_bowl, X_field, feature_lists, le

# Training cell: tune one regressor per task (batting / bowling / fielding),
# persist the best models and scalers, then predict per-opposition points.
if __name__ == "__main__":
    try:
        df = pd.read_csv('player_summary.csv')
    except FileNotFoundError:
        print("Error: 'player_summary.csv' not found.")
        # SystemExit instead of exit(): exit() shuts down the notebook kernel.
        raise SystemExit(1) from None

    # 'Recent Activity' is required because the frame is filtered on it below.
    required_columns = ['Player', 'Role', 'Games', 'Recent Activity',
                        'Total Batting Points', 'Total Bowling Points', 'Total Fielding Points',
                        'Avg Batting Points (3)', 'Avg Batting Points (10)', 'Batting Avg', 'Batting S/R', 'Boundary %', 'Dismissal Rate',
                        'Avg Bowling Points (3)', 'Avg Bowling Points (10)', 'Bowling Avg', 'Bowling S/R', 'Economy Rate', 'Wickets',
                        'Avg Fielding Points (3)', 'Avg Fielding Points (10)', 'Win %', 'Catches', 'Runouts', 'Stumpings']
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print(f"Error: Missing columns: {missing_cols}")
        raise SystemExit(1)

    df = df[df['Recent Activity'].eq(True)].reset_index(drop=True)
    if df.empty:
        print("Error: no players with recent activity to train on.")
        raise SystemExit(1)

    X_bat, X_bowl, X_field, feature_lists, le = extract_features(df)
    joblib.dump(feature_lists, 'feature_lists.joblib')
    joblib.dump(le, 'role_encoder.joblib')

    # Targets: average points per game for each discipline.
    y_bat = df['Total Batting Points'] / df['Games']
    y_bowl = df['Total Bowling Points'] / df['Games']
    y_field = df['Total Fielding Points'] / df['Games']

    # Split BEFORE scaling so the scalers are fitted on training rows only;
    # fitting on the full frame leaks test-set statistics into evaluation.
    X_train_bat_raw, X_test_bat_raw, y_train_bat, y_test_bat = train_test_split(X_bat, y_bat, test_size=0.2, random_state=42)
    X_train_bowl_raw, X_test_bowl_raw, y_train_bowl, y_test_bowl = train_test_split(X_bowl, y_bowl, test_size=0.2, random_state=42)
    X_train_field_raw, X_test_field_raw, y_train_field, y_test_field = train_test_split(X_field, y_field, test_size=0.2, random_state=42)

    scaler_bat, scaler_bowl, scaler_field = StandardScaler(), StandardScaler(), StandardScaler()
    X_train_bat, X_test_bat = scaler_bat.fit_transform(X_train_bat_raw), scaler_bat.transform(X_test_bat_raw)
    X_train_bowl, X_test_bowl = scaler_bowl.fit_transform(X_train_bowl_raw), scaler_bowl.transform(X_test_bowl_raw)
    X_train_field, X_test_field = scaler_field.fit_transform(X_train_field_raw), scaler_field.transform(X_test_field_raw)

    # Full scaled matrices, kept under their original names for any later cell.
    X_bat_scaled = scaler_bat.transform(X_bat)
    X_bowl_scaled = scaler_bowl.transform(X_bowl)
    X_field_scaled = scaler_field.transform(X_field)

    param_grid_rf = {
        'n_estimators': [100, 150], 'max_depth': [None, 10], 'min_samples_split': [2, 5], 'min_samples_leaf': [1, 2], 'max_features': ['sqrt']
    }
    param_grid_cb = {
        'iterations': [300, 500], 'learning_rate': [0.01, 0.05], 'depth': [4, 6], 'l2_leaf_reg': [3, 5]
    }

    models = {
        'RandomForest': (RandomForestRegressor(random_state=42, n_jobs=-1), param_grid_rf),
        'CatBoost': (CatBoostRegressor(random_seed=42, verbose=0), param_grid_cb)
    }

    results, feature_importances_data = [], []
    task_data = {
        'Batting': (X_train_bat, y_train_bat, X_test_bat, y_test_bat, X_bat.columns),
        'Bowling': (X_train_bowl, y_train_bowl, X_test_bowl, y_test_bowl, X_bowl.columns),
        'Fielding': (X_train_field, y_train_field, X_test_field, y_test_field, X_field.columns)
    }

    best_models = {}
    for task_name, (X_train, y_train, X_test, y_test, feature_names) in task_data.items():
        best_cv_mae = float('inf')
        best_mae = float('inf')  # test MAE of the model selected for this task
        for model_name, (model, param_grid) in models.items():
            print(f"Tuning {model_name} for {task_name}...")
            grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
            grid_search.fit(X_train, y_train)
            tuned_model = grid_search.best_estimator_
            # Scoring is negated MAE, so flip the sign back.
            cv_mae = -grid_search.best_score_
            y_test_pred = tuned_model.predict(X_test)
            test_mae = mean_absolute_error(y_test, y_test_pred)
            results.append({
                'Task': task_name,
                'Model': model_name,
                'Test MAE': test_mae,
                'Test MAPE (%)': custom_mape(y_test, y_test_pred),
                'Test WMAPE (%)': custom_wmape(y_test, y_test_pred),
                'Test R2': r2_score(y_test, y_test_pred),
                'CV MAE': cv_mae,
                'Best Params': grid_search.best_params_
            })
            # Select on cross-validated error; choosing on the test set would
            # make the reported test metrics optimistically biased.
            if cv_mae < best_cv_mae:
                best_cv_mae = cv_mae
                best_mae = test_mae
                best_models[task_name] = (tuned_model, model_name.lower())
            importances = tuned_model.feature_importances_ if hasattr(tuned_model, 'feature_importances_') else [0] * len(feature_names)
            for feature, importance in zip(feature_names, importances):
                feature_importances_data.append({
                    'Task': task_name, 'Model': model_name, 'Feature': feature, 'Importance': importance
                })
        joblib.dump(best_models[task_name][0], f'{task_name.lower()}_best_model.joblib')
        print(f"Saved best model ({best_models[task_name][1]}) for {task_name} with MAE: {best_mae:.4f}")

    joblib.dump(scaler_bat, 'scaler_bat.joblib')
    joblib.dump(scaler_bowl, 'scaler_bowl.joblib')
    joblib.dump(scaler_field, 'scaler_field.joblib')

    print("Generating predictions for all players...")
    predictions = []
    opp_cols = feature_lists['opposition']
    opposition_teams = [col.replace('Points vs ', '') for col in opp_cols]
    n_opp = len(opp_cols)
    role_codes = dict(zip(le.classes_, le.transform(le.classes_)))
    model_used = f"{best_models['Batting'][1]},{best_models['Bowling'][1]},{best_models['Fielding'][1]}"
    prediction_tasks = {
        'Batting': (scaler_bat, list(X_bat.columns)),
        'Bowling': (scaler_bowl, list(X_bowl.columns)),
        'Fielding': (scaler_field, list(X_field.columns)),
    }

    # NOTE(review): each prediction row keeps only ONE 'Points vs <team>'
    # column and zeroes the rest, whereas the models were trained with all of
    # them populated — confirm this train/predict mismatch is intended.
    player_rows = df.iterrows() if n_opp else iter(())
    for _, player_stats in tqdm(player_rows, total=len(df) if n_opp else 0, desc="Generating Predictions"):
        player_name, player_role = player_stats['Player'], player_stats['Role'].replace('wicketkeeper-batsman', 'wicketkeeper')
        role_code = role_codes[player_role]
        # Row i keeps only the player's points against opposition i.
        opp_block = np.diag(np.asarray([player_stats.get(col, 0) for col in opp_cols], dtype=float))

        # One batched predict per task covers every opposition for this
        # player, instead of one single-row predict per opposition.
        task_preds = {}
        for task_name, (scaler, columns) in prediction_tasks.items():
            base = np.asarray([player_stats.get(f, 0) for f in feature_lists[task_name.lower()]], dtype=float)
            X_player = pd.DataFrame(
                np.hstack([
                    np.tile(base, (n_opp, 1)),
                    opp_block,
                    np.full((n_opp, 1), role_code, dtype=float),
                    np.full((n_opp, 1), player_stats['Games'], dtype=float),
                ]),
                columns=columns,
            )
            raw_pred = best_models[task_name][0].predict(scaler.transform(X_player))
            task_preds[task_name] = np.maximum(0, np.asarray(raw_pred, dtype=float))

        for i, opp_team in enumerate(opposition_teams):
            pred_bat = task_preds['Batting'][i]
            pred_bowl = task_preds['Bowling'][i]
            pred_field = task_preds['Fielding'][i]
            total = (pred_bat + pred_field + 4) if player_role == 'batsman' else \
                    (pred_bowl + pred_field + 4) if player_role == 'bowler' else \
                    (pred_bat + pred_bowl + pred_field + 4) if player_role == 'allrounder' else \
                    (pred_bat + pred_field + 4) if player_role == 'wicketkeeper' else 0
            predictions.append({
                'Player': player_name,
                'Role': player_role,
                'Opposition': opp_team,
                'Predicted_Batting_Points': pred_bat,
                'Predicted_Bowling_Points': pred_bowl,
                'Predicted_Fielding_Points': pred_field,
                'Predicted_Total_Points': total,
                'Model_Used': model_used
            })
    pd.DataFrame(predictions).to_csv('player_predictions.csv', index=False)
    print("Saved predictions to 'player_predictions.csv'")

    results_df = pd.DataFrame(results)
    results_df.to_csv("training_metrics.csv", index=False)
    print("\nModel Performance Metrics:\n", results_df.to_string(index=False))

    if feature_importances_data:
        importances_df = pd.DataFrame(feature_importances_data)
        importances_df['Abs_Importance'] = importances_df['Importance'].abs()
        importances_df = importances_df.sort_values(by=['Task', 'Model', 'Abs_Importance'], ascending=[True, True, False]).drop(columns=['Abs_Importance'])
        importances_df.to_csv("feature_importances.csv", index=False)
        print("\nFeature importances saved to 'feature_importances.csv'")

Extracting features...
Tuning RandomForest for Batting...
Tuning CatBoost for Batting...
Saved best model (catboost) for Batting with MAE: 1.1380
Tuning RandomForest for Bowling...
Tuning CatBoost for Bowling...
Saved best model (catboost) for Bowling with MAE: 3.8025
Tuning RandomForest for Fielding...
Tuning CatBoost for Fielding...
Saved best model (catboost) for Fielding with MAE: 0.2780
Generating predictions for all players...


Generating Predictions: 100%|██████████| 4819/4819 [2:41:19<00:00,  2.01s/it]


Saved predictions to 'player_predictions.csv'

Model Performance Metrics:
     Task        Model  Test MAE  Test MAPE (%)  Test WMAPE (%)  Test R2                                                                                                     Best Params
 Batting RandomForest  2.219303      65.458458       17.142560 0.933433 {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 150}
 Batting     CatBoost  1.137967      29.005623        8.790000 0.979877                                        {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
 Bowling RandomForest  7.521835      91.471455       15.166054 0.925099 {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
 Bowling     CatBoost  3.802497      25.581478        7.666862 0.979633                                        {'depth': 6, 'iterations': 500, 'l2_leaf_reg': 3, 'learning_rate': 0.05}
Field

In [None]:
!pip install pulp

Collecting pulp
  Downloading pulp-3.3.0-py3-none-any.whl.metadata (8.4 kB)
Downloading pulp-3.3.0-py3-none-any.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m93.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pulp
Successfully installed pulp-3.3.0


In [None]:
!pip install groq

Collecting groq
  Downloading groq-0.32.0-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.32.0-py3-none-any.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.32.0


In [None]:
import re

In [None]:
import json

In [None]:
!pip install pyngrok


In [None]:
import pandas as pd
import numpy as np
import json
import re
import logging
import pulp

# Set up logging: configure the root logger at INFO level and create the
# module-level `logger` used by FantasySolver, ExplainabilityAgent and the
# script section of this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class FantasySolver:
    """
    Encapsulates the player data and the optimization logic to find the best team.

    The player table is loaded once at construction time and kept in
    ``self.player_df``; every call to ``solve`` builds and solves a fresh PuLP
    binary program on a filtered copy of that table.
    """
    def __init__(self, player_csv='player_summary.csv', prediction_csv='player_predictions.csv'):
        """
        Args:
            player_csv: Path of the per-player summary CSV.
            prediction_csv: Path of the per-player/per-opposition prediction CSV.
        """
        self.player_df = self._load_player_data(player_csv, prediction_csv)
        logger.info("Player database loaded.")

    def _load_player_data(self, player_csv, prediction_csv):
        """Loads player data from CSV files, falling back to synthetic data if files are missing.

        Steps:
          1. Read ``player_csv``; if it does not exist, return 100 synthetic players.
          2. Normalise the 'wicketkeeper-batsman' role to 'wicketkeeper' and keep
             only rows whose 'Recent Activity' is True.
          3. Compute 'Fantasy Points' (points per game) and
             'Weighted Score' = 0.7 * Fantasy Points + 0.3 * Form Score.
          4. If ``prediction_csv`` exists, overwrite both scores with the model
             prediction for players that have one.
          5. Add any missing required column as 0 and replace inf/NaN with 0.

        Returns:
            pandas.DataFrame with one row per active player.
        """
        try:
            df = pd.read_csv(player_csv)
        except FileNotFoundError:
            logger.warning(f"{player_csv} not found. Generating synthetic data.")
            return self._generate_synthetic_data(100)

        df['Role'] = df['Role'].replace('wicketkeeper-batsman', 'wicketkeeper')
        df = df[df['Recent Activity'] == True].copy()

        # Calculate base fantasy points and weighted score.
        # A player with 0 games yields inf here; it is zeroed out at the end.
        df['Fantasy Points'] = df['Total Fantasy Points'] / df['Games']
        df['Weighted Score'] = 0.7 * df['Fantasy Points'] + 0.3 * df['Form Score']

        # Incorporate predictions if available
        try:
            pred_df = pd.read_csv(prediction_csv)
        except FileNotFoundError:
            # Report the path that was actually requested, not a hard-coded name.
            logger.warning(f"{prediction_csv} not found. Using historical averages.")
        else:
            self._apply_predictions(df, pred_df)

        required_cols = ['Player', 'Fantasy Points', 'Form Score', 'Role', 'Cost', 'Team', 'Variance', 'Weighted Score']
        for col in required_cols:
            if col not in df.columns:
                df[col] = 0
        return df.replace([np.inf, -np.inf], np.nan).fillna(0)

    @staticmethod
    def _apply_predictions(df, pred_df):
        """Overwrites 'Fantasy Points' / 'Weighted Score' in ``df`` with predictions.

        A single dictionary keyed by (Player, Opposition) replaces the former
        per-row scan of the whole prediction table. When the same key occurs
        several times the first row wins, as before.

        NOTE(review): the lookup key pairs each player with their OWN 'Team' as
        the 'Opposition', exactly as the previous implementation did - confirm
        this is the intended key.
        """
        usable = pred_df[pred_df['Player'].notna() & pred_df['Opposition'].notna()]
        first_prediction = {}
        for player, opposition, points in zip(
                usable['Player'], usable['Opposition'], usable['Predicted_Total_Points']):
            first_prediction.setdefault((player, opposition), points)

        keys = list(zip(df['Player'], df['Team']))
        has_prediction = np.array([key in first_prediction for key in keys], dtype=bool)
        if not has_prediction.any():
            return

        df.loc[has_prediction, 'Fantasy Points'] = [
            first_prediction[key] for key, hit in zip(keys, has_prediction) if hit
        ]
        df.loc[has_prediction, 'Weighted Score'] = (
            0.7 * df.loc[has_prediction, 'Fantasy Points']
            + 0.3 * df.loc[has_prediction, 'Form Score']
        )

    def _generate_synthetic_data(self, num_players):
        """Generates synthetic player data as a fallback.

        Returns a DataFrame of ``num_players`` random players split between
        'India' and 'Australia'. It carries no 'Games' or 'Points vs ...' columns.
        """
        players_data = []
        for i in range(num_players):
            fantasy_points = np.random.randint(20, 50)
            form_scores = np.random.randint(180, 321)
            weighted_score = 0.7 * fantasy_points + 0.3 * form_scores
            players_data.append({
                'Player': f'player_{i+1}',
                'Fantasy Points': fantasy_points,
                'Form Score': form_scores,
                'Role': np.random.choice(['batsman', 'bowler', 'wicketkeeper', 'allrounder']),
                'Cost': np.random.randint(5, 11),
                'Team': np.random.choice(['India', 'Australia']),  # Match example teams
                'Variance': np.random.randint(50, 501),
                'Weighted Score': weighted_score,
                'Recent Activity': True
            })
        return pd.DataFrame(players_data)

    def solve(self, params):
        """
        Runs the PuLP solver based on the provided strategy parameters.
        Returns the selected team DataFrame and an analysis summary dictionary.

        Args:
            params: dict with the keys 'total_players', 'budget',
                'role_constraints' (role -> (min, max)), 'team1_name',
                'team2_name', 'num_team1_players', 'num_team2_players',
                'risk' and 'format'. 'format' is validated and echoed in the
                summary but does not influence the optimisation. A variance
                constraint is added only when 'risk' is 'stable' or 'risky'.

        Returns:
            (team_df, summary) when an optimal team exists, otherwise (None, None).

        Raises:
            ValueError: if a required key is missing or 'format' is not one of
                't20', 'odi', 'test'.
        """
        required_params = ['total_players', 'budget', 'role_constraints', 'team1_name', 'team2_name', 'num_team1_players', 'num_team2_players', 'risk', 'format']
        missing = set(required_params) - set(params.keys())
        if missing:
            raise ValueError(f"Missing required params: {missing}")

        match_format = params['format'].lower()  # avoid shadowing the builtin `format`
        if match_format not in ['t20', 'odi', 'test']:
            raise ValueError("Invalid format. Choose 't20', 'odi', or 'test'.")

        team1_name, team2_name = params['team1_name'], params['team2_name']

        # Filter players for selected teams
        filtered_df = self.player_df[self.player_df['Team'].isin([team1_name, team2_name])].copy()
        if filtered_df.empty:
            logger.error(f"No active players found for teams {params['team1_name']} and {params['team2_name']}.")
            return None, None

        # Adjust weighted score using the player's historical points against the
        # actual opponent in this match-up (when that column exists).
        # NOTE(review): this replaces the prediction-based score computed at load time.
        for index, row in filtered_df.iterrows():
            opp_team = team2_name if row['Team'] == team1_name else team1_name
            opp_points_col = f'Points vs {opp_team}'
            if opp_points_col in filtered_df.columns:
                filtered_df.loc[index, 'Weighted Score'] = 0.7 * row[opp_points_col] / max(1, row['Games']) + 0.3 * row['Form Score']

        prob = pulp.LpProblem("FantasyTeamSelection", pulp.LpMaximize)
        player_indices = filtered_df.index
        player_vars = pulp.LpVariable.dicts("Player", player_indices, 0, 1, cat='Binary')
        captain_vars = pulp.LpVariable.dicts("Captain", player_indices, 0, 1, cat='Binary')
        vc_vars = pulp.LpVariable.dicts("ViceCaptain", player_indices, 0, 1, cat='Binary')

        # Objective: Maximize weighted score. The captain's score counts twice
        # (player term + captain term) and the vice-captain's 1.5 times.
        objective_key = 'Weighted Score'
        prob += pulp.lpSum(
            [player_vars[i] * filtered_df.loc[i, objective_key] for i in player_indices] +
            [captain_vars[i] * filtered_df.loc[i, objective_key] for i in player_indices] +
            [vc_vars[i] * 0.5 * filtered_df.loc[i, objective_key] for i in player_indices]
        ), "TotalObjectiveScore"

        # Constraints: squad size, budget, per-role bounds, per-team counts.
        prob += pulp.lpSum([player_vars[p] for p in player_indices]) == params['total_players']
        prob += pulp.lpSum([filtered_df.loc[i, 'Cost'] * player_vars[i] for i in player_indices]) <= params['budget']
        for role, (min_val, max_val) in params['role_constraints'].items():
            role_players = filtered_df[filtered_df['Role'] == role].index
            prob += pulp.lpSum([player_vars[p] for p in role_players]) >= min_val
            prob += pulp.lpSum([player_vars[p] for p in role_players]) <= max_val
        team1_players = filtered_df[filtered_df['Team'] == team1_name].index
        team2_players = filtered_df[filtered_df['Team'] == team2_name].index
        prob += pulp.lpSum([player_vars[p] for p in team1_players]) == params['num_team1_players']
        prob += pulp.lpSum([player_vars[p] for p in team2_players]) == params['num_team2_players']

        # Exactly one captain and one vice-captain, both selected, never the same player.
        prob += pulp.lpSum([captain_vars[p] for p in player_indices]) == 1
        prob += pulp.lpSum([vc_vars[p] for p in player_indices]) == 1
        for i in player_indices:
            prob += captain_vars[i] <= player_vars[i]
            prob += vc_vars[i] <= player_vars[i]
            prob += captain_vars[i] + vc_vars[i] <= 1

        # Risk profile: bound the squad's total variance relative to
        # (mean variance of the candidate pool) * squad size.
        if params['risk'] in ['stable', 'risky']:
            avg_variance = filtered_df['Variance'].mean()
            baseline_total_variance = avg_variance * params['total_players']
            total_variance = pulp.lpSum([filtered_df.loc[i, 'Variance'] * player_vars[i] for i in player_indices])
            if params['risk'] == 'stable':
                prob += total_variance <= baseline_total_variance * 0.9
            else:
                prob += total_variance >= baseline_total_variance * 1.1

        prob.solve(pulp.PULP_CBC_CMD(msg=0))
        status = pulp.LpStatus[prob.status]
        if status != "Optimal":
            logger.error(f"Solver status: {status}")
            return None, None

        def is_chosen(variable):
            # Binary values come back as floats (or None when unset); compare
            # against 0.5 instead of 0 to be robust to solver tolerance.
            value = variable.varValue
            return value is not None and value > 0.5

        selected_indices = [i for i in player_indices if is_chosen(player_vars[i])]
        captain_index = [i for i in player_indices if is_chosen(captain_vars[i])][0]
        vc_index = [i for i in player_indices if is_chosen(vc_vars[i])][0]

        team_df = filtered_df.loc[selected_indices].copy()
        team_df['Team Role'] = 'Player'
        team_df.loc[captain_index, 'Team Role'] = 'Captain'
        team_df.loc[vc_index, 'Team Role'] = 'Vice-Captain'
        summary = {
            'total_objective_score': float(round(pulp.value(prob.objective), 2)),
            'total_cost': int(team_df['Cost'].sum()),
            'total_variance': int(team_df['Variance'].sum()),
            'team1_count': len(team_df[team_df['Team'] == team1_name]),
            'team2_count': len(team_df[team_df['Team'] == team2_name]),
            'format': params['format']
        }
        return team_df, summary

class ExplainabilityAgent:
    """
    Holds the solver's context and generates explanations for the selected team.

    The context is a dict with the keys 'solver_parameters', 'analysis_summary'
    and 'selected_team' (a DataFrame). Responses are produced by a rule-based
    mock; the JSON prompt is built and returned but not sent to any LLM here.
    """
    def __init__(self, solver_context):
        self.context = solver_context
        self.chat_history = []
        self.system_prompt = """
You are "Opti-Scout," an expert fantasy sports analyst and strategic AI assistant. Your purpose is to explain the results of a fantasy team optimization algorithm to the user. You are friendly, insightful, and you explain complex topics in a simple, strategic way.
You will be given a JSON object containing the `solver_parameters` that the user set, and the `selected_team` that the algorithm generated. Your entire analysis MUST be based SOLELY on the data provided in this context.
- **Data-Driven:** Always reference specific stats from the provided context.
- **Persona:** Be encouraging and act like a co-pilot.
- **Clarity:** Use formatting like lists and bold text.
- **Do not hallucinate:** Do not invent any information or stats not present in the provided JSON context.
"""
        logger.info("Explainability Agent initialized.")

    def update_context(self, new_context):
        """Updates the agent's knowledge with a new team and resets history."""
        self.context = new_context
        self.chat_history = []
        logger.info("Agent context updated with new team.")

    def _build_prompt(self, user_query):
        """Constructs the full JSON context for the LLM.

        Every column of the selected-team DataFrame is serialised, so the
        prompt grows with the width of that frame.
        """
        team_list = self.context['selected_team'].to_dict(orient='records')
        full_prompt_data = {
            "solver_parameters": self.context['solver_parameters'],
            "analysis_summary": self.context['analysis_summary'],
            "selected_team": team_list,
            "chat_history": self.chat_history,
            "user_query": user_query
        }
        return json.dumps(full_prompt_data, indent=2)

    def get_explanation(self, user_query):
        """Generates a mock response based on the query and context.

        Returns:
            (prompt_json, response_text). The prompt reflects the chat history
            BEFORE this exchange; the exchange is appended afterwards.
        """
        final_prompt_for_llm = self._build_prompt(user_query)
        response_text = self._generate_mock_response(user_query)
        self.chat_history.append({"role": "user", "content": user_query})
        self.chat_history.append({"role": "assistant", "content": response_text})
        return final_prompt_for_llm, response_text

    @staticmethod
    def _find_mentioned_player(user_query, team_df):
        """Returns the team row of the player named in the query, or None.

        Names are matched case-insensitively as whole words; when several team
        players match, the longest name wins.
        """
        query_lower = user_query.lower()
        best_name, best_length = None, 0
        for name in team_df['Player'].dropna().astype(str).unique():
            candidate = name.strip().lower()
            if len(candidate) <= best_length:
                continue
            if re.search(r'(?<!\w)' + re.escape(candidate) + r'(?!\w)', query_lower):
                best_name, best_length = name, len(candidate)
        if best_name is None:
            return None
        return team_df[team_df['Player'].astype(str) == best_name].iloc[0]

    @staticmethod
    def _guess_unknown_name(user_query, params):
        """Heuristically extracts a proper name the user asked about, or None.

        A candidate is a run of capitalised words that does not start the
        query. Single letters (e.g. "I"), the match format and the two team
        names are ignored.
        """
        ignored = {str(params.get(key, '')).lower() for key in ('format', 'team1_name', 'team2_name')}
        for match in re.finditer(r"(?<=\s)[A-Z][\w'\-]*(?:\s+[A-Z][\w'\-]*)*", user_query):
            candidate = match.group(0)
            if len(candidate) > 1 and candidate.lower() not in ignored:
                return candidate
        return None

    def _generate_mock_response(self, user_query):
        """Generates a dynamic mock response based on the query and context.

        Resolution order:
          1. The query names a player of the selected team -> player explanation.
          2. Keyword branches: "why ... selected", "captain", "performance".
          3. The query seems to name someone not in the team -> apology.
          4. Generic hint.

        The previous implementation took the first one or two words of the
        query as the "player name", so real players were never found and the
        generic hint was unreachable.
        """
        team_df = self.context['selected_team']
        params = self.context['solver_parameters']
        summary = self.context['analysis_summary']
        query_lower = user_query.lower()

        player_data = self._find_mentioned_player(user_query, team_df)
        if player_data is not None:
            return (
                f"**{player_data['Player']}** ({player_data['Role'].capitalize()}) was selected because:\n"
                f"- **Weighted Score**: {player_data['Weighted Score']:.2f} (high performance in {params['format'].upper()}).\n"
                f"- **Cost**: {player_data['Cost']} (fits within budget).\n"
                f"- **Variance**: {player_data['Variance']} (matches {params['risk']} risk profile).\n"
                f"- **Team Role**: {player_data['Team Role']}."
            )

        if "why" in query_lower and "selected" in query_lower:
            return (
                f"The team was selected to maximize the **weighted score** ({summary['total_objective_score']:.2f}) "
                f"within a **budget** of {params['budget']} for {params['format'].upper()} format.\n"
                f"- **Teams**: {params['team1_name']} ({summary['team1_count']} players) and {params['team2_name']} ({summary['team2_count']} players).\n"
                f"- **Role Constraints**: {params['role_constraints']}.\n"
                f"- **Risk Profile**: {params['risk']}.\n"
                f"The solver prioritized players with high predicted fantasy points and form scores."
            )

        if "captain" in query_lower:
            captain = team_df[team_df['Team Role'] == 'Captain']
            if captain.empty:
                return "No captain was selected due to solver constraints."
            captain_data = captain.iloc[0]
            return (
                f"**{captain_data['Player']}** was chosen as captain due to their high **weighted score** ({captain_data['Weighted Score']:.2f}) "
                f"and strong recent performance in {params['format'].upper()} format."
            )

        if "performance" in query_lower:
            return (
                f"The selected team has a **total weighted score** of {summary['total_objective_score']:.2f}, "
                f"with a **total cost** of {summary['total_cost']} and **total variance** of {summary['total_variance']:.2f}.\n"
                f"- **Team Composition**: {summary['team1_count']} players from {params['team1_name']}, "
                f"{summary['team2_count']} players from {params['team2_name']}.\n"
                f"- **Format**: {params['format'].upper()}."
            )

        player_name = self._guess_unknown_name(user_query, params)
        if player_name:
            return f"Sorry, **{player_name}** is not in the selected team. Ask about another player!"

        return (
            f"I've analyzed your team for {params['format'].upper()} format. "
            "Ask about specific players, the captain, or the team's performance!"
        )

if __name__ == "__main__":
    # Demo driver: build a solver, pick a team for one fixture, then let the
    # rule-based Opti-Scout agent answer a few canned questions followed by an
    # interactive chat loop.
    #
    # Failures stop the cell by raising SystemExit. `exit()` is an interactive
    # helper injected by `site`/IPython and is not a dependable way to halt a
    # running notebook cell.
    logger.info("Starting fantasy cricket team selection...")

    try:
        solver = FantasySolver(player_csv='player_summary.csv', prediction_csv='player_predictions.csv')
    except Exception as e:
        logger.error(f"Failed to initialize solver: {str(e)}")
        print(f"❌ Error: Could not initialize solver. Check if 'player_summary.csv' and 'player_predictions.csv' exist.")
        raise SystemExit(1) from e

    # Define initial strategy
    initial_params = {
        'budget': 90,
        'risk': 'risky',  # 'stable', 'risky'
        'total_players': 11,
        'num_team1_players': 6,
        'num_team2_players': 5,
        'team1_name': 'India',  # Replace with valid team from player_summary.csv
        'team2_name': 'Australia',  # Replace with valid team from player_summary.csv
        'format': 't20',  # Added for format selection
        'role_constraints': {
            'batsman': (3, 5),
            'bowler': (3, 5),
            'allrounder': (1, 3),
            'wicketkeeper': (1, 2)
        }
    }

    def _print_suggestions():
        """Prints the remediation hints shared by both failure paths."""
        print("Suggestions:")
        print(f"- Check if team names ('{initial_params['team1_name']}', '{initial_params['team2_name']}') exist in player_summary.csv")
        print("- Increase budget (e.g., 100 or 150)")
        print("- Relax role constraints (e.g., 'batsman': [1, 7])")
        print("- Ensure num_team1_players + num_team2_players = total_players")

    def _print_exchange(prompt, response):
        """Prints the JSON prompt that would go to the LLM and the mock answer."""
        print("\n" + "="*20 + " LLM PROMPT (for Groq) " + "="*20)
        print(prompt)
        print("="*64)

        print(f"\n🤖 Opti-Scout says:\n{response}")

    # Validate team names against the data the solver actually loaded. This
    # also covers the synthetic fallback used when player_summary.csv is
    # missing, instead of re-reading the CSV and aborting.
    available_teams = solver.player_df['Team'].unique()
    if initial_params['team1_name'] not in available_teams or initial_params['team2_name'] not in available_teams:
        logger.error(f"Invalid team names: {initial_params['team1_name']}, {initial_params['team2_name']}")
        print(f"❌ Error: Teams must be in {available_teams.tolist()}")
        raise SystemExit(1)

    print("\n🚀 Running solver with initial parameters...")
    logger.info(f"Initial parameters: {initial_params}")

    try:
        team_df, summary = solver.solve(initial_params)
    except Exception as e:
        logger.error(f"Solver failed: {str(e)}")
        print(f"❌ Error: Could not solve with given parameters. {str(e)}")
        _print_suggestions()
        raise SystemExit(1) from e

    if team_df is not None:
        print("✅ Optimal Team Found!")
        print(team_df[['Player', 'Role', 'Cost', 'Weighted Score', 'Team Role']].round(2))

        # Create context and initialize agent
        solver_context = {
            'solver_parameters': initial_params,
            'analysis_summary': summary,
            'selected_team': team_df
        }
        agent = ExplainabilityAgent(solver_context)

        # Define questions to be asked automatically
        questions_to_ask = [
            f"Why was {team_df.iloc[0]['Player']} selected?",
            "Can you explain the team's overall strategy?",
            f"Tell me more about {team_df.iloc[1]['Player']}."
        ]

        print(f"\n--- 🤖 Opti-Scout will now answer {len(questions_to_ask)} predefined questions. ---")

        for i, query in enumerate(questions_to_ask):
            print(f"\n--- Question {i+1}: {query} ---")
            prompt, response = agent.get_explanation(query)
            _print_exchange(prompt, response)

        print("\n--- 🤖 Opti-Scout is ready. Ask a question about the team or type 'quit' to exit. ---")

        # Interactive chat loop
        while True:
            query = input("\nYour question: ")
            if query.lower() == 'quit':
                break
            prompt, response = agent.get_explanation(query)
            _print_exchange(prompt, response)
    else:
        logger.error("No feasible team found with given constraints.")
        print("❌ Could not find an optimal team with the given constraints.")
        _print_suggestions()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
      "Points vs Isle of Man": 0.0,
      "Points vs Estonia": 0.0,
      "Points vs Philippines": 0.0,
      "Points vs Mexico": 0.0,
      "Points vs Turks and Caicos Island": 0.0,
      "Points vs Samoa": 0.0,
      "Points vs Luxembourg": 0.0,
      "Points vs Turkey": 0.0,
      "Points vs Slovenia": 0.0,
      "Points vs Switzerland": 0.0,
      "Points vs Sierra Leone": 0.0,
      "Points vs Fiji": 0.0,
      "Points vs Cook Islands": 0.0,
      "Points vs Eswatini": 0.0,
      "Points vs South Korea": 0.0,
      "Points vs Ivory Coast": 0.0,
      "Points vs Swaziland": 0.0,
      "Points vs Bulgaria": 0.0,
      "Points vs Serbia": 0.0,
      "Points vs Iran": 0.0,
      "Points vs Suriname": 0.0,
      "Points vs Chile": 0.0,
      "Points vs Costa Rica": 0.0,
      "Fantasy Points": 43.05000205781303,
      "Weighted Score": 45.849999999999994,
      "Team Role": "Player"
    },
    {
      "Player": "T Nataraj

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib
from fastapi import FastAPI, HTTPException
from contextlib import asynccontextmanager
import uvicorn
import nest_asyncio
from pyngrok import ngrok
import threading
import os
from google.colab import userdata
from pydantic import BaseModel
from typing import Dict
import logging

# Set up logging: configure the root logger at INFO level and create the
# module-level `logger` used by the API handlers in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Process-wide store for everything the API needs at request time. It is filled
# by `lifespan` at startup ('df', 'feature_lists', 'scalers', 'models',
# 'role_encoder', 'model_names') and cleared at shutdown. Handlers treat
# app_data['df'] being None/missing as "service not ready".
app_data = {}

# Request schema describing a team-selection strategy. Only `team1_name` and
# `team2_name` are required; every other field has a default.
# NOTE(review): the field names mirror the keys FantasySolver.solve() requires;
# the endpoint consuming this model is not visible in this part of the notebook.
class TeamSelectionParams(BaseModel):
    total_players: int = 11  # squad size
    budget: int = 100  # upper bound on the summed player cost
    # role name -> (min, max) number of players with that role
    role_constraints: Dict[str, tuple] = {
        'batsman': (3, 5),
        'bowler': (3, 5),
        'wicketkeeper': (1, 2),
        'allrounder': (1, 3)
    }
    team1_name: str  # required
    team2_name: str  # required
    # The two defaults below add up to the default total_players (11).
    num_team1_players: int = 5
    num_team2_players: int = 6
    risk: str = 'stable'
    format: str = 't20'  # Format for validation and response

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Loads player data and model artifacts at startup and clears them at shutdown.

    On success ``app_data`` receives 'df', 'feature_lists', 'scalers',
    'models', 'role_encoder' and 'model_names' in one step. On any failure
    (missing file, missing column, load error) only ``app_data['df'] = None``
    is set, which the endpoints interpret as "service unavailable"; the app
    still starts.

    Cleanup runs in a ``finally`` block so ``app_data`` is cleared even when an
    exception is thrown into the context manager.
    """
    logger.info("FastAPI app starting up...")
    required_files = [
        'player_summary.csv', 'feature_lists.joblib', 'role_encoder.joblib',
        'scaler_bat.joblib', 'scaler_bowl.joblib', 'scaler_field.joblib',
        'batting_best_model.joblib', 'bowling_best_model.joblib', 'fielding_best_model.joblib',
        'training_metrics.csv'
    ]
    missing_files = [f for f in required_files if not os.path.exists(f)]
    if missing_files:
        logger.error(f"Missing files: {', '.join(missing_files)}")
        app_data['df'] = None
    else:
        try:
            players = pd.read_csv('player_summary.csv')
            players['Role'] = players['Role'].replace('wicketkeeper-batsman', 'wicketkeeper')
            players = players[players['Recent Activity'] == True].copy()

            # NOTE: joblib.load unpickles arbitrary objects; only load artifacts
            # produced by this project's own training step.
            feature_lists = joblib.load('feature_lists.joblib')
            required_columns = ['Player', 'Role', 'Games'] + feature_lists['batting'] + \
                              feature_lists['bowling'] + feature_lists['fielding']
            missing_cols = [col for col in required_columns if col not in players.columns]
            if missing_cols:
                logger.error(f"Missing columns in player_summary.csv: {missing_cols}")
                app_data['df'] = None
            else:
                # Build everything locally first so a failure half-way through
                # never leaves a partially populated app_data behind.
                loaded = {
                    'df': players,
                    'feature_lists': feature_lists,
                    'scalers': {
                        'Batting': joblib.load('scaler_bat.joblib'),
                        'Bowling': joblib.load('scaler_bowl.joblib'),
                        'Fielding': joblib.load('scaler_field.joblib')
                    },
                    'models': {
                        'Batting': joblib.load('batting_best_model.joblib'),
                        'Bowling': joblib.load('bowling_best_model.joblib'),
                        'Fielding': joblib.load('fielding_best_model.joblib')
                    },
                    'role_encoder': joblib.load('role_encoder.joblib'),
                    'model_names': {},
                }
                # The reported model name per task is the one with the lowest test MAE.
                metrics_df = pd.read_csv('training_metrics.csv')
                for task in ['Batting', 'Bowling', 'Fielding']:
                    task_metrics = metrics_df[metrics_df['Task'] == task]
                    best_model_name = task_metrics.loc[task_metrics['Test MAE'].idxmin()]['Model'].lower()
                    loaded['model_names'][task] = best_model_name
                app_data.update(loaded)
                logger.info("Successfully loaded all required files and models.")
        except Exception as e:
            # Deliberately broad: the API should still come up (answering 503)
            # when an artifact is unreadable. Log the traceback for diagnosis.
            logger.exception(f"Error during startup: {str(e)}")
            app_data['df'] = None
    try:
        yield
    finally:
        logger.info("FastAPI app shutting down...")
        app_data.clear()

# The ASGI application. `lifespan` is registered so the player data and models
# are loaded into `app_data` at startup and cleared at shutdown.
app = FastAPI(
    title="Cricket Fantasy Points Predictor API",
    description="API for predicting player performance and selecting optimal fantasy teams.",
    version="2.2.2",
    lifespan=lifespan
)

@app.get("/", tags=["Welcome"])
async def read_root():
    # Landing endpoint: a greeting plus the path of the interactive docs.
    welcome_payload = dict(
        message="Welcome to the Cricket Fantasy Points Predictor API!",
        documentation_url="/docs",
    )
    return welcome_payload

@app.get("/predict", tags=["Prediction"])
async def get_prediction(player_name: str, opposition_team: str, scoring_rule: str = "default"):
    """Predict one player's batting, bowling, fielding and total fantasy points.

    Args:
        player_name: Exact (case-sensitive) value of the 'Player' column.
        opposition_team: Opponent name; only the matching 'Points vs <team>'
            feature keeps the player's value, every other opposition column is 0.
        scoring_rule: 'simple_sum' adds the three predictions; any other value
            uses the role-based total.

    Raises:
        HTTPException 503: player data or models are not loaded.
        HTTPException 404: the player is not in the loaded data.
        HTTPException 500: the player's role cannot be encoded.
    """
    if app_data.get('df') is None:
        logger.error("Player data or models not loaded.")
        raise HTTPException(status_code=503, detail="Service Unavailable: Player data or models not loaded.")

    matches = app_data['df'][app_data['df']['Player'] == player_name]
    if matches.empty:
        logger.warning(f"Player '{player_name}' not found.")
        raise HTTPException(status_code=404, detail=f"Player '{player_name}' not found.")
    player_stats = matches.iloc[0]
    player_role = player_stats['Role']

    # Encode the role once; it is identical for all three tasks.
    # presumably a scikit-learn LabelEncoder, whose transform() raises
    # ValueError for labels it was not fitted on - TODO confirm.
    try:
        encoded_role = app_data['role_encoder'].transform([player_role])[0]
    except ValueError:
        logger.error(f"Role '{player_role}' of player '{player_name}' is unknown to the role encoder.")
        raise HTTPException(
            status_code=500,
            detail=f"Role '{player_role}' of player '{player_name}' is unknown to the role encoder."
        )

    opp_cols = app_data['feature_lists']['opposition']
    active_opp_col = f"Points vs {opposition_team}"
    predicted = {}
    for task_name in ['Batting', 'Bowling', 'Fielding']:
        features = app_data['feature_lists'][task_name.lower()]
        # Column order matters for the scaler: task features, opposition
        # columns, then 'Role' and 'Games'.
        X_dict = {f: player_stats.get(f, 0) for f in features}
        for col in opp_cols:
            X_dict[col] = player_stats.get(col, 0) if col == active_opp_col else 0
        X_dict['Role'] = encoded_role
        X_dict['Games'] = player_stats['Games']
        scaled = app_data['scalers'][task_name].transform(pd.DataFrame([X_dict]))
        raw_prediction = app_data['models'][task_name].predict(scaled)[0]
        # Clamp at zero and convert to a built-in float so the JSON response
        # never depends on NumPy scalar serialisation.
        predicted[task_name] = max(0.0, float(raw_prediction))

    pred_bat, pred_bowl, pred_field = predicted['Batting'], predicted['Bowling'], predicted['Fielding']
    if scoring_rule == 'simple_sum':
        total = pred_bat + pred_bowl + pred_field
    else:
        # Which predictions count towards the total for each role; roles not
        # listed here score 0.
        role_components = {
            'batsman': (pred_bat, pred_field),
            'bowler': (pred_bowl, pred_field),
            'allrounder': (pred_bat, pred_bowl, pred_field),
            'wicketkeeper': (pred_bat, pred_field),
        }
        components = role_components.get(player_role)
        # The flat +4 is presumably an appearance bonus - TODO confirm.
        total = sum(components) + 4 if components is not None else 0

    return {
        'player_info': {'name': player_name, 'role': player_role, 'opposition': opposition_team},
        'prediction': {
            'model_used': f"{app_data['model_names']['Batting']},{app_data['model_names']['Bowling']},{app_data['model_names']['Fielding']}",
            'predicted_batting_points': round(pred_bat, 2),
            'predicted_bowling_points': round(pred_bowl, 2),
            'predicted_fielding_points': round(pred_field, 2),
            'predicted_total_fantasy_points': round(total, 2)
        }
    }

@app.get("/predict_all", tags=["Prediction"])
async def get_all_predictions(opposition_team: str, scoring_rule: str = "default"):
    """Predict batting, bowling and fielding points for every row of ``app_data['df']``.

    Args:
        opposition_team: Team name used to pick the ``"Points vs <team>"`` column;
            every other opposition column is fed to the models as 0.
        scoring_rule: ``"simple_sum"`` adds the three predictions together; any
            other value applies the role-based combination below.

    Returns:
        A list with one dict per player, holding ``player_info`` and ``prediction``.

    Raises:
        HTTPException: 503 when the player table or the models are not loaded.
    """
    # The message has always mentioned models, so check them too instead of
    # failing later with a KeyError that surfaces as an opaque 500.
    if app_data.get('df') is None or not app_data.get('models'):
        logger.error("Player data or models not loaded.")
        raise HTTPException(status_code=503, detail="Service Unavailable: Player data or models not loaded.")

    # Everything below is identical for every player, so look it up once.
    tasks = ('Batting', 'Bowling', 'Fielding')
    feature_lists = app_data['feature_lists']
    opp_cols = feature_lists['opposition']
    target_opp_col = f"Points vs {opposition_team}"
    models = app_data['models']
    scalers = app_data['scalers']
    role_encoder = app_data['role_encoder']
    model_names = app_data['model_names']
    model_used = f"{model_names['Batting']},{model_names['Bowling']},{model_names['Fielding']}"

    predictions = []
    for _, player_stats in app_data['df'].iterrows():
        player_name, player_role = player_stats['Player'], player_stats['Role']
        # The encoded role is the same for all three tasks; encode it once per player.
        encoded_role = role_encoder.transform([player_role])[0]

        points = {}
        for task_name in tasks:
            X_dict = {f: player_stats.get(f, 0) for f in feature_lists[task_name.lower()]}
            for col in opp_cols:
                X_dict[col] = player_stats.get(col, 0) if col == target_opp_col else 0
            X_dict['Role'] = encoded_role
            X_dict['Games'] = player_stats['Games']
            scaled = scalers[task_name].transform(pd.DataFrame([X_dict]))
            # float() turns numpy scalars (e.g. float32) into plain Python floats so
            # the response is always JSON-serialisable; negatives are clamped to 0.
            points[task_name] = max(0, float(models[task_name].predict(scaled)[0]))

        pred_bat, pred_bowl, pred_field = points['Batting'], points['Bowling'], points['Fielding']

        # NOTE(review): the +4 looks like a fixed per-player bonus — confirm its meaning.
        if scoring_rule == 'simple_sum':
            total = pred_bat + pred_bowl + pred_field
        elif player_role in ('batsman', 'wicketkeeper'):
            total = pred_bat + pred_field + 4
        elif player_role == 'bowler':
            total = pred_bowl + pred_field + 4
        elif player_role == 'allrounder':
            total = pred_bat + pred_bowl + pred_field + 4
        else:
            # Unrecognised roles score 0, as before.
            total = 0

        predictions.append({
            'player_info': {'name': player_name, 'role': player_role, 'opposition': opposition_team},
            'prediction': {
                'model_used': model_used,
                'predicted_batting_points': round(pred_bat, 2),
                'predicted_bowling_points': round(pred_bowl, 2),
                'predicted_fielding_points': round(pred_field, 2),
                'predicted_total_fantasy_points': round(total, 2)
            }
        })
    return predictions

@app.post("/select_team", tags=["Team Selection"])
async def select_team(params: TeamSelectionParams):
    """Select a fantasy team for a match using ``FantasySolver``.

    Args:
        params: Request body. This handler reads ``format``, ``team1_name``,
            ``team2_name``, ``num_team1_players``, ``num_team2_players`` and
            ``total_players``; the remaining fields are passed to the solver.

    Returns:
        A dict with ``team`` (list of selected player records) and ``summary``
        (the solver's summary with the requested format added).

    Raises:
        HTTPException: 400 for an invalid format, unknown team names, inconsistent
            player counts or an infeasible problem; 503 when player data is not
            loaded; 500 for any unexpected failure.
    """
    # Named match_format to avoid shadowing the built-in format().
    match_format = params.format.lower()
    if match_format not in ['t20', 'odi', 'test']:
        logger.warning(f"Invalid format: {match_format}")
        raise HTTPException(status_code=400, detail="Invalid format. Choose 't20', 'odi', or 'test'.")

    logger.info(f"Processing team selection for teams {params.team1_name} vs {params.team2_name}, format: {match_format}")

    try:
        from fantasy_solver import FantasySolver
        solver = FantasySolver(player_csv='player_summary.csv', prediction_csv='player_predictions.csv')

        # Remove format from params since original solver doesn't use it
        params_dict = params.dict()
        params_dict.pop('format', None)

        # Validate team names
        if app_data.get('df') is None:
            logger.error("Player data not loaded.")
            raise HTTPException(status_code=503, detail="Service Unavailable: Player data not loaded.")

        available_teams = app_data['df']['Team'].unique()
        if params.team1_name not in available_teams or params.team2_name not in available_teams:
            logger.warning(f"Invalid team names: {params.team1_name}, {params.team2_name}")
            raise HTTPException(status_code=400, detail=f"Teams must be in {available_teams.tolist()}")

        # Validate constraints
        if params.num_team1_players + params.num_team2_players != params.total_players:
            logger.warning(f"Invalid team counts: {params.num_team1_players} + {params.num_team2_players} != {params.total_players}")
            raise HTTPException(status_code=400, detail="Total players must equal num_team1_players + num_team2_players")

        team_df, summary = solver.solve(params_dict)
        if team_df is None:
            logger.error("No feasible team found.")
            raise HTTPException(status_code=400, detail=f"No feasible team found for {match_format}. Adjust constraints (budget, roles, or team counts).")

        # Add format to summary for response
        summary['format'] = match_format
        logger.info(f"Team selection successful: {len(team_df)} players selected")

        return {
            'team': team_df[['Player', 'Team', 'Role', 'Team Role', 'Fantasy Points', 'Cost', 'Variance']].to_dict(orient='records'),
            'summary': summary
        }
    except HTTPException:
        # The 400/503 responses raised above are deliberate. Without this clause the
        # generic handler below would catch them and turn every one into a 500.
        raise
    except Exception as e:
        logger.error(f"Error in team selection: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Internal Server Error: {str(e)}")

@app.get("/players", tags=["Player Data"])
async def get_players():
    """Return Player, Role, Games, Cost and Team for every row of the player table.

    Raises:
        HTTPException: 503 when ``app_data['df']`` has not been loaded.
    """
    player_table = app_data.get('df')
    if player_table is None:
        logger.error("Player data not loaded.")
        raise HTTPException(
            status_code=503,
            detail="Service Unavailable: Player data not loaded.",
        )

    exposed = player_table[['Player', 'Role', 'Games', 'Cost', 'Team']]
    return exposed.to_dict(orient='records')

@app.get("/metrics", tags=["Evaluation"])
async def get_metrics():
    """Return the rows of ``training_metrics.csv`` as a list of records.

    Raises:
        HTTPException: 404 when the CSV file does not exist.
    """
    try:
        records = pd.read_csv("training_metrics.csv").to_dict(orient='records')
    except FileNotFoundError:
        logger.error("Metrics file not found.")
        raise HTTPException(status_code=404, detail="Metrics file not found.")
    else:
        return records

@app.get("/feature_importances", tags=["Evaluation"])
async def get_feature_importances():
    """Return the rows of ``feature_importances.csv`` as a list of records.

    Raises:
        HTTPException: 404 when the CSV file does not exist.
    """
    try:
        records = pd.read_csv("feature_importances.csv").to_dict(orient='records')
    except FileNotFoundError:
        logger.error("Feature importances file not found.")
        raise HTTPException(status_code=404, detail="Feature importances file not found.")
    else:
        return records

if __name__ == "__main__":
    # Launch sequence: patch the running event loop, authenticate ngrok, open a
    # tunnel to port 8000, then serve the app from a background thread.
    nest_asyncio.apply()
    try:
        ngrok.set_auth_token(userdata.get('NGROK_AUTH_TOKEN'))
    except Exception as e:
        logger.error(f"Error setting NGROK_AUTH_TOKEN: {str(e)}")
        exit()
    public_url = ngrok.connect(8000)
    # Recent pyngrok returns an NgrokTunnel object whose string form is not a URL,
    # so read its public_url attribute. Older versions returned the URL string
    # itself, which the getattr fallback keeps working.
    tunnel_url = getattr(public_url, "public_url", public_url)
    logger.info(f"FastAPI app running on: {tunnel_url}")
    logger.info(f"API Docs: {tunnel_url}/docs")
    thread = threading.Thread(target=uvicorn.run, kwargs={"app": app, "host": "0.0.0.0", "port": 8000})
    thread.start()