In [1]:
import pandas as pd
import joblib
import plotly.express as px
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.ensemble import AdaBoostRegressor
import plotly.graph_objs as go
import ipywidgets as widgets
from ipywidgets import GridBox, Layout, HTML
from IPython.display import display
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load the trained match-outcome model.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model artefacts that come from a trusted source.
model = joblib.load('models/model.pkl')

# Load the match data the model is scored on
future_data1 = pd.read_csv('data/afcon.csv')

# Feature columns passed to model.predict, in this order
prediction_features = ['home_score', 'away_score', 'goal_difference', 'neutral']

# Take an explicit copy: a 'winner' column is added below, and writing to a
# bare column-slice of future_data1 is a chained assignment (the resulting
# SettingWithCopyWarning is hidden by the blanket warnings filter above).
future_data = future_data1[prediction_features].copy()

# Perform necessary data preprocessing steps on the future data (if necessary)
# ...

# Make predictions on the future data
predictions = model.predict(future_data)

# Match identifiers; they share future_data's index, so rows line up one-to-one
team_names = future_data1[['date', 'home_team', 'away_team']].copy()

# Decode the predicted codes into a label: 2 -> away team, 1 -> home team,
# 0 -> 'Draw'. Any other value is left as the raw prediction.
# The column is built as object dtype up front, so no strings are written into
# a numeric column (which recent pandas versions warn about).
predicted_codes = pd.Series(predictions, index=future_data.index)
future_data['winner'] = (
    predicted_codes.astype(object)
    .mask(predicted_codes == 2, team_names['away_team'])
    .mask(predicted_codes == 1, team_names['home_team'])
    .mask(predicted_codes == 0, 'Draw')
)

# Put features, decoded winner and match identifiers side by side
output = pd.concat([future_data, team_names], axis=1)

# Keep only the columns shown to the reader, in display order
column_order = ['date', 'home_team', 'home_score', 'away_score', 'away_team', 'winner']
output = output[column_order]

# Preview the predicted winners along with the other columns
output.head(5)

Unnamed: 0,date,home_team,home_score,away_score,away_team,winner
0,1957-02-10,Sudan,1.0,2.0,Egypt,Egypt
1,1957-02-16,Egypt,4.0,0.0,Ethiopia,Egypt
2,1959-05-22,Egypt,4.0,0.0,Ethiopia,Egypt
3,1959-05-25,Ethiopia,0.0,1.0,Sudan,Sudan
4,1959-05-29,Egypt,2.0,1.0,Sudan,Egypt


In [2]:
# Count how often each team is the predicted winner; 'Draw' rows are skipped
winner_counts = future_data.loc[future_data['winner'] != 'Draw', 'winner'].value_counts()

# Every distinct value of the 'winner' column (note: this includes 'Draw')
all_teams = pd.unique(future_data['winner'])

# One row per winning team with its number of wins
team_counts = pd.DataFrame({'Team': winner_counts.index, 'Wins': winner_counts.values})

# Sort the DataFrame in descending order based on the number of wins
team_counts = team_counts.sort_values('Wins', ascending=False)

# Number the rows from 1
team_counts.index = range(1, len(team_counts) + 1)

# Total matches per team = home appearances + away appearances.
# fill_value=0 matters: a plain `+` aligns on the index and yields NaN for any
# team that appears only as home side or only as away side.
total_appearances = (
    output['home_team'].value_counts()
    .add(output['away_team'].value_counts(), fill_value=0)
    .astype(int)
)

# Share of a team's matches that it is predicted to win
team_counts['Appearances'] = total_appearances.loc[team_counts['Team']].to_numpy()
win_ratio = team_counts['Wins'] / team_counts['Appearances']
team_counts['Win Rate (%)'] = (win_ratio * 100).round(2)

# Scale the win rate by how often the team played relative to the most
# frequent participant, so teams with few matches rank lower
max_appearances = team_counts['Appearances'].max()
team_counts['Weighted Win Rate (%)'] = (
    win_ratio * (team_counts['Appearances'] / max_appearances) * 100
).round(2)

# Sort the DataFrame in descending order based on the Weighted Win Rate (%)
team_counts = team_counts.sort_values('Weighted Win Rate (%)', ascending=False)

# Number the rows from 1 again after the re-sort
team_counts.index = range(1, len(team_counts) + 1)

team_counts.head(5)

Unnamed: 0,Team,Wins,Appearances,Win Rate (%),Weighted Win Rate (%)
1,Egypt,60,111,54.05,54.05
2,Nigeria,57,103,55.34,51.35
3,Ghana,54,105,51.43,48.65
4,Ivory Coast,48,106,45.28,43.24
5,Cameroon,46,95,48.42,41.44


In [3]:
# Seed for the per-team AdaBoost fits, so that Restart & Run All reproduces
# the same forecast instead of a different one on every run.
FORECAST_SEED = 42

# Forecast horizon: number of 365-day years past the last match in `output`
years = 30

# Forecast starts at midnight on 1 January of next year. Building the date
# directly avoids two problems with `now().replace(day=1, month=1) + 365 days`:
# it carried the current time of day into every forecast timestamp, and it
# landed on 31 December whenever the current year is a leap year.
start_date = datetime(datetime.now().year + 1, 1, 1)
end_date = datetime.strptime(output['date'].max(), '%Y-%m-%d') + timedelta(days=365 * years)

# Month-start timestamps between the start date and the end date
date_range = pd.date_range(start=start_date, end=end_date, freq='MS')

# One row per forecast month; one column per team is added in the loop below
forecast = pd.DataFrame(index=date_range)

# Fit and forecast each team in team_counts independently
for team in team_counts['Team']:
    # All matches the team took part in, home or away
    team_matches = output.loc[(output['home_team'] == team) | (output['away_team'] == team)]

    # Monthly number of matches whose 'winner' is this team. `assign` returns a
    # new frame, so the date conversion never writes into a slice of `output`.
    monthly_wins = (
        team_matches
        .assign(date=pd.to_datetime(team_matches['date']))
        .set_index('date')
        .resample('MS')['winner']
        .apply(lambda winners: (winners == team).sum())
    )

    # Need at least two monthly observations to fit anything
    if len(monthly_wins) > 1:
        # Features: a constant column plus the month position 0..n-1
        X = pd.DataFrame({'ones': 1, 'x': range(len(monthly_wins))})
        y = monthly_wins.values

        # Fit an AdaBoost regressor to this team's monthly win counts. A
        # dedicated name is used so the estimator loaded into `model` earlier
        # in the notebook is not overwritten.
        team_model = AdaBoostRegressor(random_state=FORECAST_SEED)
        team_model.fit(X, y)

        # NOTE(review): 'x' restarts at 0 here, so the model is evaluated over
        # the same positions as the start of its training data rather than
        # positions that follow the last historical month -- confirm whether
        # that is the intended forecast.
        X_new = pd.DataFrame({'ones': 1, 'x': range(len(date_range))})

        # Predicted number of wins for each month in the date range
        forecast[team] = team_model.predict(X_new)
    else:
        # Set all forecasted values to zero if there are not enough observations
        forecast[team] = 0

In [4]:
# Upper bound on how many years the slider offers
MAX_SLIDER_YEARS = 27

# The slider is derived from the forecast, so fail early with a clear message
assert not forecast.empty, 'forecast has no rows; run the forecasting cell first'

# Slider years start at the forecast's start year instead of a hard-coded 2024,
# so the labels stay in step with `start_date` whatever year the notebook runs
# in. The last year never exceeds what was actually forecast.
start_year = start_date.year
end_year = min(start_year + MAX_SLIDER_YEARS - 1, forecast.index.max().year)
years = list(range(start_year, end_year + 1))

# Create a slider widget for the year input
year_slider = widgets.SelectionSlider(
    options=years,
    value=years[0],
    description='Year:',
    continuous_update=False
)

# Empty figure that the dashboard callback fills with the forecast lines
forecast_fig = go.FigureWidget()

# Empty figure that the dashboard callback fills with the bubble map
team_wins_fig = go.FigureWidget()

# Create an HTML widget for the top 5 countries
top_countries_html = HTML()

# Create a function to update the forecast chart, bubble map, and top countries table based on the selected year
def update_dashboard(year):
    """Refresh the forecast chart, bubble map and top-5 table for ``year``.

    Keeps the rows of the module-level ``forecast`` frame dated on or before
    ``start_date`` shifted forward by ``year - start_year`` calendar years,
    sums them per team, and redraws ``forecast_fig``, ``team_wins_fig`` and
    ``top_countries_html`` in place. Teams listed in ``excluded_teams`` are
    left out of all three views.

    Parameters
    ----------
    year : int
        Year selected on the slider.

    Returns
    -------
    None
        The three widgets are updated as a side effect.
    """
    excluded_teams = ['Rwanda', 'Equatorial Guinea', 'Zimbabwe']

    # Calendar-year arithmetic: multiplying 365-day years drifts one day per
    # leap year, which pushed the cut-off before 1 January for later years.
    end_date = start_date + pd.DateOffset(years=int(year - start_year))

    # Filter the forecast DataFrame based on the end date
    forecast_filtered = forecast.loc[forecast.index <= end_date]

    # Sum the forecasted wins for each team
    total_wins = forecast_filtered.sum()
    team_wins = pd.DataFrame({'Team': total_wins.index, 'Wins': total_wins.values})

    # Drop excluded teams BEFORE numbering, so the ranks shown in the table are
    # consecutive (1..5) instead of skipping the positions of excluded teams.
    team_wins = (
        team_wins.loc[~team_wins['Team'].isin(excluded_teams)]
        .sort_values('Wins', ascending=False)
        .reset_index(drop=True)
    )
    team_wins.index += 1

    # Redraw the line chart: one trace per non-excluded team
    forecast_fig.data = []
    for col in forecast_filtered.columns:
        if col not in excluded_teams:
            forecast_fig.add_scatter(x=forecast_filtered.index, y=forecast_filtered[col], name=col)
    forecast_fig.update_layout(title='Forecasted Wins for Each Team')

    # Redraw the bubble map; marker size and colour both encode summed wins
    team_wins_fig.data = []
    team_wins_fig.add_scattergeo(
        locations=team_wins['Team'],
        locationmode='country names',
        marker=dict(size=team_wins['Wins'], sizemode='diameter',
                    color=team_wins['Wins'], colorscale='Viridis', showscale=True),
        text=team_wins['Team'] + ': ' + team_wins['Wins'].astype(str) + ' Wins',
        hoverinfo='text'
    )

    # Customize the layout (height/width are layout options, not geo options)
    team_wins_fig.update_layout(
        title='African Teams and Their Forecasted Wins',
        geo=dict(showframe=False, showcoastlines=False, projection_type='orthographic',
                 showcountries=True, showland=True, landcolor='rgb(243, 243, 243)',
                 showocean=True, oceancolor='rgb(10, 200, 255)'),
        height=600,
        width=800
    )

    # Rebuild the top-5 table; the frame index is the rank
    table_header = '<h3>Top 5 Countries</h3><table style="width:100%;margin:auto;text-align:center"><tr style="background-color:darkblue;color:white"><th style="text-align:center">Rank</th><th style="text-align:center">Team</th><th style="text-align:center">Wins</th></tr>'
    row_template = '<tr><td style="text-align:center">{}</td><td style="text-align:center">{}</td><td style="text-align:center">{}</td></tr>'
    table_rows = ''.join(
        row_template.format(rank, row['Team'], round(row['Wins']))
        for rank, row in team_wins.head(5).iterrows()
    )
    top_countries_html.value = table_header + table_rows + '</table>'

# Bind the slider to the callback: update_dashboard runs with the slider's
# value as `year`, and the returned Output widget captures whatever it displays
dashboard = widgets.interactive_output(update_dashboard, {'year': year_slider})

# Show the slider together with the callback's output area
display(year_slider, dashboard)

# Stack the table, the map and the line chart in a single-column grid
single_column = Layout(grid_template_columns='repeat(1, minmax(250px, 1fr))')
GridBox(children=[top_countries_html, team_wins_fig, forecast_fig], layout=single_column)

SelectionSlider(continuous_update=False, description='Year:', options=(2024, 2025, 2026, 2027, 2028, 2029, 203…

Output()

GridBox(children=(HTML(value='<h3>Top 5 Countries</h3><table style="width:100%;margin:auto;text-align:center">…