In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import re
import warnings
warnings.filterwarnings('ignore')

# Load the data from uploaded files
print("Loading data...")
player_data = pd.read_csv(r"C:\Users\nanhs\Downloads\Player Data.csv")
matchups = pd.read_csv(r"C:\Users\nanhs\Downloads\Player Matchups.csv")
current_form_bowlers = pd.read_csv(r"C:\Users\nanhs\Downloads\Current From Bowler.csv")
current_form_batters = pd.read_csv(r"C:\Users\nanhs\Downloads\Current From Batter.csv")
pitch_report = pd.read_csv(r"C:\Users\nanhs\Downloads\Pitch Report.csv", header=None)
head_to_head = pd.read_csv(r"C:\Users\nanhs\Downloads\Head To Head.csv")

print("Data loaded successfully!")

# Data preprocessing
print("\n----- Preprocessing Data -----")

# Clean player data
player_data.columns = player_data.columns.str.strip()
player_data['Wicket Keeper'] = player_data['Wicket Keeper'].fillna('')
player_data['Bowl Avg'] = player_data['Bowl Avg'].fillna(0)
player_data['Bat Avg'] = player_data['Bat Avg'].fillna(0)

# Clean current form data
current_form_batters['% Team Runs'] = current_form_batters['% Team Runs'].str.strip()
current_form_batters['% Team Runs'] = current_form_batters['% Team Runs'].str.replace('%', '').astype(float)
current_form_batters['S/R'] = pd.to_numeric(current_form_batters['S/R'], errors='coerce').fillna(0)
current_form_batters['Avg'] = pd.to_numeric(current_form_batters['Avg'], errors='coerce').fillna(0)

# Fix format issues in bowling data
current_form_bowlers['Avg'] = pd.to_numeric(current_form_bowlers['Avg'], errors='coerce').fillna(0)
current_form_bowlers['S/R'] = pd.to_numeric(current_form_bowlers['S/R'], errors='coerce').fillna(0)
current_form_bowlers['E/R'] = pd.to_numeric(current_form_bowlers['E/R'], errors='coerce').fillna(0)

# Extract pitch and weather information
pitch_info = ' '.join(pitch_report[0].astype(str).tolist())
batting_friendly = 1 if 'favorable surface for batsmen' in pitch_info else 0
spin_friendly = 1 if 'Spinners get some turn' in pitch_info else 0
rain_expected = 1 if 'rainy' in pitch_info.lower() else 0

print(f"Pitch conditions - Batting friendly: {batting_friendly}, Spin friendly: {spin_friendly}, Rain expected: {rain_expected}")

# Process head to head data
teams = head_to_head.columns[1:-1].tolist()  # Extract team names from columns
home_team = teams[0]  # First team is considered home team
away_team = teams[1]  # Second team is considered away team

home_ground = head_to_head['Ground'].iloc[0]
home_advantage = head_to_head[head_to_head['Ground'] == home_ground]
home_win_ratio = float(home_advantage[home_team].values[0]) / float(home_advantage['Total'].values[0])
print(f"Home team win ratio at {home_ground}: {home_win_ratio:.2f}")

# Create player feature dataset
print("\n----- Creating Player Features -----")

def create_player_features():
    # Start with all players from player_data
    players = player_data.copy()
    
    # Add current form for batters
    players = pd.merge(
        players, 
        current_form_batters[['Player', 'S/R', 'Avg', '% Team Runs', 'Ca', 'St']],
        left_on='Name', 
        right_on='Player', 
        how='left'
    )
    players.rename(columns={'S/R': 'Current_SR_Bat', 'Avg': 'Current_Bat_Avg'}, inplace=True)
    players.drop('Player', axis=1, inplace=True)
    
    # Add current form for bowlers
    players = pd.merge(
        players, 
        current_form_bowlers[['Player', 'Avg', 'S/R', 'E/R', 'W']],
        left_on='Name', 
        right_on='Player', 
        how='left'
    )
    players.rename(columns={
        'Avg': 'Current_Bowl_Avg', 
        'S/R': 'Current_Bowl_SR', 
        'E/R': 'Current_ER',
        'W': 'Recent_Wickets'
    }, inplace=True)
    players.drop('Player', axis=1, inplace=True)
    
    # Fill missing values
    players['Current_SR_Bat'] = players['Current_SR_Bat'].fillna(0)
    players['Current_Bat_Avg'] = players['Current_Bat_Avg'].fillna(0)
    players['% Team Runs'] = players['% Team Runs'].fillna(0)
    players['Ca'] = players['Ca'].fillna(0)
    players['St'] = players['St'].fillna(0)
    players['Current_Bowl_Avg'] = players['Current_Bowl_Avg'].fillna(0)
    players['Current_Bowl_SR'] = players['Current_Bowl_SR'].fillna(0)
    players['Current_ER'] = players['Current_ER'].fillna(0)
    players['Recent_Wickets'] = players['Recent_Wickets'].fillna(0)
    
    # Add player role features
    players['is_batsman'] = ((players['Bat Avg'] > 20) & (players['Bowl Avg'] == 0)).astype(int)
    players['is_bowler'] = ((players['Bowl Avg'] > 0) & (players['Bat Avg'] < 15)).astype(int)
    players['is_allrounder'] = ((players['Bowl Avg'] > 0) & (players['Bat Avg'] >= 15)).astype(int)
    players['is_wk'] = players['Wicket Keeper'].str.contains('WK', case=False, na=False).astype(int)
    
    # Pitch condition features
    players['batting_friendly_factor'] = batting_friendly * players['Bat Avg'] / 100
    
    # Add benefit for spinners if pitch is spin-friendly - using bowling style data instead of names
    players['spin_friendly_factor'] = 0
    # Create a column to identify spinner type bowlers based on bowling data
    is_spinner = (players['Bowl Avg'] > 0) & (player_data['Bowling Style'].str.contains('spin|leg|off', case=False, na=False))
    players.loc[is_spinner, 'spin_friendly_factor'] = spin_friendly * 0.2
    
    # Add rain impact - typically helps pace bowlers - using bowling style data instead of names
    players['rain_factor'] = 0
    # Create a column to identify pace bowler types based on bowling data
    is_pacer = (players['Bowl Avg'] > 0) & (player_data['Bowling Style'].str.contains('fast|medium|pace|seam', case=False, na=False))
    players.loc[is_pacer, 'rain_factor'] = rain_expected * 0.25
    
    # Home advantage
    players['home_advantage'] = 0
    players.loc[players['Team(s)'] == home_team, 'home_advantage'] = home_win_ratio * 0.15
    
    return players

players_with_features = create_player_features()

# Extract Dream11 points from matchups data for training
def get_dream11_points():
    # Group by player name and calculate average Dream11 points
    dream11_stats = matchups.groupby('Name')['Dream11'].mean().reset_index()
    return dream11_stats

dream11_points = get_dream11_points()

# Merge Dream11 points with player features
players_with_points = pd.merge(
    players_with_features,
    dream11_points,
    left_on='Name',
    right_on='Name',
    how='left'
)
players_with_points['Dream11'] = players_with_points['Dream11'].fillna(0)

print(f"Created features for {len(players_with_features)} players")

# Train linear regression model
print("\n----- Training Dream11 Prediction Model -----")

def train_model(players_df):
    # Filter players with Dream11 points (our training data)
    train_data = players_df[players_df['Dream11'] > 0].copy()
    
    # Define features
    features = [
        'Matches', 'Bat Avg', 'Bowl Avg', 'Current_SR_Bat', 'Current_Bat_Avg', 
        '% Team Runs', 'Ca', 'Current_Bowl_Avg', 'Current_Bowl_SR', 'Current_ER', 
        'Recent_Wickets', 'is_batsman', 'is_bowler', 'is_allrounder', 'is_wk',
        'batting_friendly_factor', 'spin_friendly_factor', 'rain_factor', 'home_advantage'
    ]
    
    # Handle missing values
    for feature in features:
        if feature in train_data.columns:
            train_data[feature] = train_data[feature].fillna(0)
    
    # Select only features in our dataframe
    available_features = [f for f in features if f in train_data.columns]
    
    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(train_data[available_features])
    y = train_data['Dream11']
    
    # Train model
    model = LinearRegression()
    model.fit(X, y)
    
    # Get feature importance
    coefficients = pd.DataFrame({
        'Feature': available_features,
        'Importance': model.coef_
    })
    coefficients = coefficients.sort_values('Importance', ascending=False)
    
    print("Feature importance:")
    print(coefficients.head(5))
    
    return model, scaler, available_features

model, scaler, features = train_model(players_with_points)

# Predict Dream11 points for all players
print("\n----- Predicting Dream11 Points -----")

def predict_points(players_df, model, scaler, features):
    # Prepare features for prediction
    pred_data = players_df.copy()
    
    # Handle missing values
    for feature in features:
        if feature in pred_data.columns:
            pred_data[feature] = pred_data[feature].fillna(0)
    
    # Standardize features
    X_pred = scaler.transform(pred_data[features])
    
    # Predict points
    pred_points = model.predict(X_pred)
    
    # Add predictions to dataframe
    pred_data['Predicted_Dream11_Points'] = pred_points
    
    # Ensure no negative points
    pred_data['Predicted_Dream11_Points'] = pred_data['Predicted_Dream11_Points'].clip(lower=0)
    
    return pred_data

players_with_predictions = predict_points(players_with_features, model, scaler, features)

# Select best Dream11 team
print("\n----- Selecting Best Dream11 Team -----")

def select_dream11_team(players_df):
    # Sort players by predicted points
    sorted_players = players_df.sort_values('Predicted_Dream11_Points', ascending=False)
    
    # Define Dream11 rules
    max_players = 11
    max_per_team = 7
    min_batsmen = 3
    min_bowlers = 3
    min_allrounders = 1
    min_wk = 1
    
    # Initialize team
    dream_team = pd.DataFrame(columns=sorted_players.columns)
    
    # Keep track of team composition
    team_count = {home_team: 0, away_team: 0}
    role_count = {'batsman': 0, 'bowler': 0, 'allrounder': 0, 'wk': 0}
    
    # First select mandatory wicketkeeper
    wk_candidates = sorted_players[sorted_players['is_wk'] == 1]
    if not wk_candidates.empty:
        best_wk = wk_candidates.iloc[0]
        dream_team = pd.concat([dream_team, pd.DataFrame([best_wk])], ignore_index=True)
        team = best_wk['Team(s)']
        team_count[team] = team_count.get(team, 0) + 1
        role_count['wk'] = 1
        sorted_players = sorted_players[sorted_players['Name'] != best_wk['Name']]
    
    # Then fill remaining positions
    for _, player in sorted_players.iterrows():
        team = player['Team(s)']
        
        # Check team limit
        if team_count.get(team, 0) >= max_per_team:
            continue
        
        # Determine player role
        if player['is_wk'] == 1:
            role = 'wk'
        elif player['is_batsman'] == 1:
            role = 'batsman'
        elif player['is_bowler'] == 1:
            role = 'bowler'
        else:
            role = 'allrounder'
        
        # Add player if needed for minimum role requirements or if team not full
        can_add = False
        
        if role == 'batsman' and role_count['batsman'] < min_batsmen:
            can_add = True
        elif role == 'bowler' and role_count['bowler'] < min_bowlers:
            can_add = True
        elif role == 'allrounder' and role_count['allrounder'] < min_allrounders:
            can_add = True
        elif (role_count['batsman'] >= min_batsmen and 
              role_count['bowler'] >= min_bowlers and 
              role_count['allrounder'] >= min_allrounders and
              role_count['wk'] >= min_wk and
              len(dream_team) < max_players):
            can_add = True
        
        if can_add:
            dream_team = pd.concat([dream_team, pd.DataFrame([player])], ignore_index=True)
            team_count[team] = team_count.get(team, 0) + 1
            role_count[role] = role_count.get(role, 0) + 1
        
        if len(dream_team) >= max_players:
            break
    
    # Select captain and vice-captain (top 2 predicted point scorers)
    dream_team = dream_team.sort_values('Predicted_Dream11_Points', ascending=False)
    
    if len(dream_team) >= 2:
        dream_team.iloc[0, dream_team.columns.get_loc('Name')] = dream_team.iloc[0]['Name'] + ' (C)'
        dream_team.iloc[1, dream_team.columns.get_loc('Name')] = dream_team.iloc[1]['Name'] + ' (VC)'
    
    return dream_team

dream11_team = select_dream11_team(players_with_predictions)

# Organize team by roles for display
def organize_team_by_roles(team_df):
    # Create role-based team
    wk = team_df[team_df['is_wk'] == 1]
    batsmen = team_df[(team_df['is_batsman'] == 1) & (team_df['is_wk'] == 0)]
    all_rounders = team_df[team_df['is_allrounder'] == 1]
    bowlers = team_df[(team_df['is_bowler'] == 1) & (team_df['is_wk'] == 0)]
    
    # Sort each category by predicted points
    wk = wk.sort_values('Predicted_Dream11_Points', ascending=False)
    batsmen = batsmen.sort_values('Predicted_Dream11_Points', ascending=False)
    all_rounders = all_rounders.sort_values('Predicted_Dream11_Points', ascending=False)
    bowlers = bowlers.sort_values('Predicted_Dream11_Points', ascending=False)
    
    # Combine sorted categories
    organized_team = pd.concat([wk, batsmen, all_rounders, bowlers])
    
    return organized_team

organized_dream11_team = organize_team_by_roles(dream11_team)

# Define teams of 12 players (playing XI + substitute)
def select_team_players(players_df, team_name):
    team_players = players_df[players_df['Team(s)'] == team_name].copy()
    team_players = team_players.sort_values('Predicted_Dream11_Points', ascending=False)
    
    # Ensure team has at least one wicketkeeper
    has_wk = any(team_players['is_wk'] == 1)
    if not has_wk:
        # Find wicketkeepers and add the best one
        wks = players_df[(players_df['is_wk'] == 1) & (players_df['Team(s)'] != team_name)]
        if not wks.empty:
            best_wk = wks.iloc[0]
            team_players = pd.concat([team_players, pd.DataFrame([best_wk])], ignore_index=True)
    
    # Ensure minimum number of players per role
    min_batsmen = 4
    min_bowlers = 4
    min_allrounders = 1
    
    # Count roles
    batsmen_count = sum(team_players['is_batsman'] == 1)
    bowlers_count = sum(team_players['is_bowler'] == 1)
    allrounders_count = sum(team_players['is_allrounder'] == 1)
    
    # Add players from other teams if needed
    other_players = players_df[players_df['Team(s)'] != team_name].sort_values('Predicted_Dream11_Points', ascending=False)
    
    if batsmen_count < min_batsmen:
        needed = min_batsmen - batsmen_count
        extra_batsmen = other_players[other_players['is_batsman'] == 1].head(needed)
        team_players = pd.concat([team_players, extra_batsmen], ignore_index=True)
    
    if bowlers_count < min_bowlers:
        needed = min_bowlers - bowlers_count
        extra_bowlers = other_players[other_players['is_bowler'] == 1].head(needed)
        team_players = pd.concat([team_players, extra_bowlers], ignore_index=True)
    
    if allrounders_count < min_allrounders:
        needed = min_allrounders - allrounders_count
        extra_allrounders = other_players[other_players['is_allrounder'] == 1].head(needed)
        team_players = pd.concat([team_players, extra_allrounders], ignore_index=True)
    
    # Ensure exactly 12 players total (11 + 1 substitute)
    if len(team_players) > 12:
        team_players = team_players.head(12)
    elif len(team_players) < 12:
        needed = 12 - len(team_players)
        extra_players = other_players[~other_players['Name'].isin(team_players['Name'])].head(needed)
        team_players = pd.concat([team_players, extra_players], ignore_index=True)
    
    return team_players.head(12)

home_playing_12 = select_team_players(players_with_predictions, home_team)
away_playing_12 = select_team_players(players_with_predictions, away_team)

# Print results
print("\n============= TEAM PREDICTIONS =============")
print(f"\n{home_team} Playing 12:")
for idx, player in home_playing_12.iterrows():
    role = "WK" if player['is_wk'] == 1 else "BAT" if player['is_batsman'] == 1 else "BOWL" if player['is_bowler'] == 1 else "AR"
    print(f"{player['Name']} ({role}) - Predicted Points: {player['Predicted_Dream11_Points']:.2f}")

print(f"\n{away_team} Playing 12:")
for idx, player in away_playing_12.iterrows():
    role = "WK" if player['is_wk'] == 1 else "BAT" if player['is_batsman'] == 1 else "BOWL" if player['is_bowler'] == 1 else "AR"
    print(f"{player['Name']} ({role}) - Predicted Points: {player['Predicted_Dream11_Points']:.2f}")

print("\n============= DREAM 11 PREDICTION =============")
print("\nRecommended Dream11 Team:")
for idx, player in organized_dream11_team.iterrows():
    role = "WK" if player['is_wk'] == 1 else "BAT" if player['is_batsman'] == 1 else "BOWL" if player['is_bowler'] == 1 else "AR"
    team = player['Team(s)']
    print(f"{player['Name']} ({team}, {role}) - Predicted Points: {player['Predicted_Dream11_Points']:.2f}")

# Calculate team distribution
home_count = sum(organized_dream11_team['Team(s)'] == home_team)
away_count = sum(organized_dream11_team['Team(s)'] == away_team)
print(f"\nTeam Distribution: {home_team} {home_count} - {away_count} {away_team}")

# Match prediction
home_team_strength = home_playing_12['Predicted_Dream11_Points'].sum()
away_team_strength = away_playing_12['Predicted_Dream11_Points'].sum()
home_win_prob = home_team_strength / (home_team_strength + away_team_strength)
away_win_prob = 1 - home_win_prob

# Adjust for home advantage
home_win_prob = home_win_prob * (1 + home_win_ratio * 0.1)
away_win_prob = 1 - home_win_prob

print("\n============= MATCH PREDICTION =============")
print(f"{home_team} Win Probability: {home_win_prob * 100:.1f}%")
print(f"{away_team} Win Probability: {away_win_prob * 100:.1f}%")

# Key matchups analysis
print("\n============= KEY MATCHUPS =============")
# Find top batsman vs top bowler matchups
top_batsmen = matchups.groupby('Name')['Dream11'].mean().sort_values(ascending=False).head(5).index
top_bowlers = current_form_bowlers.sort_values('W', ascending=False).head(5)['Player'].tolist()

key_matchups = matchups[matchups['Name'].isin(top_batsmen) & matchups['Bowler'].isin(top_bowlers)]
if not key_matchups.empty:
    for _, matchup in key_matchups.iterrows():
        print(f"{matchup['Name']} vs {matchup['Bowler']}: {matchup['Boundaries']} boundaries in {matchup['B']} balls")

Loading data...


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 33: invalid start byte