In [None]:
# Import all libraries and dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker
import matplotlib.ticker as plticker
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the two source CSV files: the World Cup 2023 data and the ODI match results

world_cup, results = (pd.read_csv(csv_name) for csv_name in ('WC-2023.csv', 'ODI.csv'))

In [None]:
# Preview the first five rows of the World Cup frame

world_cup.head(n=5)

In [None]:
# Preview the first five rows of the ODI results frame

results.head(n=5)

In [None]:
# Keep only the matches in which India is one of the two sides

df = results[(results['Team_1'] == 'India') | (results['Team_2'] == 'India')]

# Take an independent copy rather than a slice: a later cell adds a 'match_year'
# column to `india`, and writing into a slice of another frame triggers pandas'
# SettingWithCopyWarning.
india = df.copy()
india.head()

In [None]:
# Add a 'match_year' column to India's matches, then keep the ones from 2019 onwards

# The year is the text after the first comma of 'date'. Extract it for the whole
# column at once instead of looping row by row.
# NOTE(review): assumes every 'date' value contains a comma followed by the year — confirm against ODI.csv
year = india['date'].str.split(',').str[1].str.strip().astype(int).tolist()

# assign() returns a new frame, so nothing is written into a slice of `results`
# (no SettingWithCopyWarning) and the cell gives the same result when re-run.
india = india.assign(match_year=year)

# Matches played in 2019 or later
india_2019 = india[india.match_year >= 2019]
india_2019.count()


In [None]:
# Collect every match that involves at least one World Cup team

# Names must match the 'Team_1' / 'Team_2' values exactly: a leading space or a
# misspelling makes isin() silently skip that team.
worldcup_teams = ['England', 'South Africa', 'Netherlands',
            'Pakistan', 'New Zealand', 'Sri Lanka', 'Afghanistan',
            'Australia', 'Bangladesh', 'India']

in_team_1 = results['Team_1'].isin(worldcup_teams)
in_team_2 = results['Team_2'].isin(worldcup_teams)
df_teams_1 = results[in_team_1]
df_teams_2 = results[in_team_2]

# Select with a single OR-mask so a match between two World Cup teams appears
# once. Concatenating the two frames above listed such matches twice, and the
# result of drop_duplicates() was never assigned back.
df_teams = results[in_team_1 | in_team_2]
df_teams.count()

In [None]:
# Preview the first five rows of the World Cup teams frame

df_teams.head(n=5)

In [None]:
# Remove the date, margin and ground columns, which are not used as model inputs

df_teams_2019 = df_teams.drop(columns=['date', 'Margin', 'Ground'])
df_teams_2019.head(n=5)

In [None]:
# Prepare the training frame

# Renumber the rows 0..n-1 so the index no longer carries the row labels
# inherited from `results`.
df_teams_2019 = df_teams_2019.reset_index(drop=True)

# NOTE(review): this cell used to add a numeric 'winning_team' label (1 = Team_1 won,
# 2 = Team_2 won) and then drop that same column again before anything could read it.
# That dead round-trip is removed; the frame produced here is unchanged. The model
# further down is trained on the 'Winner' column (team names). If a binary
# Team_1/Team_2 target is wanted instead, every later cell that references 'Winner'
# has to change together with this one.

df_teams_2019.head()

In [None]:
# One-hot encode the Team_1 and Team_2 columns so the model receives numeric inputs

final = pd.get_dummies(df_teams_2019, columns=['Team_1', 'Team_2'], prefix=['Team_1', 'Team_2'])

# The label is the 'Winner' column; the features are every other column

y = final['Winner']
X = final.drop(columns=['Winner'])


# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.30
)

In [None]:
# Preview the first five rows of the encoded frame
final.head(n=5)

In [None]:
# Fit a logistic regression on the training split (fit() returns the estimator itself)
logreg = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the rows the model was fitted on, and on the held-out rows
score = logreg.score(X_train, y_train)
score2 = logreg.score(X_test, y_test)

print("Training set accuracy: ", format(score, '.3f'))
print("Test set accuracy: ", format(score2, '.3f'))

In [None]:
# Bring in the ICC rankings
# For every fixture, the side with the better ICC ranking position is treated as the
# "favourite" and is placed in the "Team_1" column of the prediction set

# Read the ranking table and the World Cup schedule

ranking, fixtures = map(
    pd.read_csv,
    ('icc_rankings.csv', 'ICC-Cricket-World-Cup-2023-Schedule-Excel.csv'),
)

# Container for the group stage games

pred_set = list()

In [None]:
# Attach each side's ICC ranking position to the group stage fixtures

# Build the Team -> Position lookup once and reuse it for both sides
position_by_team = ranking.set_index('Team')['Position']

# Only the first 45 rows of the schedule are kept (the group stage games).
# Any position columns left over from a previous run are dropped first:
# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second execution.
fixtures = (
    fixtures
    .iloc[:45, :]
    .drop(columns=['first_position', 'second_position'], errors='ignore')
)

# Teams whose name is not found in the ranking table get NaN here
fixtures.insert(1, 'first_position', fixtures['Team_1'].map(position_by_team))
fixtures.insert(2, 'second_position', fixtures['Team_2'].map(position_by_team))

# Display the last 5 rows
fixtures.tail()

In [None]:
# Build the prediction frame: the better-ranked side of each fixture goes in 'Team_1'

# Collect the rows in a list owned by this cell. Appending to the shared `pred_set`
# list made a re-run either duplicate every fixture or fail once `pred_set` had
# been turned into a DataFrame.
pred_rows = []
for team_a, team_b, position_a, position_b in zip(
    fixtures['Team_1'], fixtures['Team_2'],
    fixtures['first_position'], fixtures['second_position'],
):
    # A smaller position number means a better ranking; on a tie (or a NaN position)
    # the comparison is False and the schedule order is swapped.
    if position_a < position_b:
        favourite, underdog = team_a, team_b
    else:
        favourite, underdog = team_b, team_a
    pred_rows.append({'Team_1': favourite, 'Team_2': underdog, 'winning_team': None})

pred_set = pd.DataFrame(pred_rows)

# Keep a real copy of the team names; the later cells read them back by position
# (column 0 = 'Team_1', column 1 = 'Team_2') after `pred_set` has been encoded.
backup_pred_set = pred_set.copy()
pred_set.head()

In [None]:
# One-hot encode the fixtures and give them the same columns as the training frame

# Encode from backup_pred_set (the frame that still has the plain 'Team_1' /
# 'Team_2' columns) instead of from pred_set itself, so this cell can be re-run:
# encoding an already-encoded pred_set raises KeyError.
encoded = pd.get_dummies(backup_pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])

# Columns the training frame has but the fixtures do not
missing_cols = set(final.columns) - set(encoded.columns)

# Take exactly the training columns, in training order. Missing ones are filled
# with 0; columns that exist only in the fixtures (e.g. 'winning_team') are left out.
pred_set = encoded.reindex(columns=final.columns, fill_value=0)


# 'Winner' is the label, not a feature
pred_set = pred_set.drop(['Winner'], axis=1)
pred_set.head()

In [None]:
# Print the predicted winner of every group stage fixture

predictions = logreg.predict(pred_set)
for i in range(len(predictions)):
    favourite = backup_pred_set.iloc[i, 0]   # 'Team_1' column: the better-ranked side
    underdog = backup_pred_set.iloc[i, 1]    # 'Team_2' column
    print(underdog + " and " + favourite)

    # The model is fitted on the 'Winner' column, so it predicts team names.
    # Comparing only against the number 1 never matched a name and always reported
    # the 'Team_1' side. The underdog wins when the model names it (the numeric
    # label 1 is still honoured); any other label falls back to the favourite.
    if predictions[i] == underdog or predictions[i] == 1:
        print("Winner: " + underdog)
    else:
        print("Winner: " + favourite)
    print("")


In [None]:
# Collect the predicted winner of every group stage fixture
predictions = logreg.predict(pred_set)
winners = []

for i in range(len(predictions)):
    favourite = backup_pred_set.iloc[i, 0]   # 'Team_1' column: the better-ranked side
    underdog = backup_pred_set.iloc[i, 1]    # 'Team_2' column

    # The model is fitted on the 'Winner' column, so it predicts team names.
    # Comparing only against the number 1 never matched a name and always picked
    # the 'Team_1' side. The underdog wins when the model names it (the numeric
    # label 1 is still honoured); any other label falls back to the favourite.
    if predictions[i] == underdog or predictions[i] == 1:
        winner = underdog
    else:
        winner = favourite
    winners.append(winner)

# Now the 'winners' list contains the names of the predicted winners for each match
print(winners)


In [None]:
# Tally how many predicted wins each team collected
team_wins = {}
for winning_side in winners:
    team_wins[winning_side] = team_wins.get(winning_side, 0) + 1

# Most wins first; teams on the same number of wins keep their first-appearance order
sorted_team_wins = dict(sorted(team_wins.items(), key=lambda pair: -pair[1]))

for team, wins in sorted_team_wins.items():
    print(f"{team}: {wins} wins")

In [None]:
# Team names in table order (the dict is already sorted by wins, most first)
teams = [*sorted_team_wins]

# Semi-final pairings: first against fourth, and second against third
semi_finals = [(teams[upper], teams[lower]) for upper, lower in ((0, 3), (1, 2))]

print(semi_finals)

In [None]:
def clean_and_predict(matches, ranking, final, logreg):
    """Predict, print and return the winner of each match.

    For every match the side with the better (numerically smaller) ranking
    position is placed in 'Team_1' and the other side in 'Team_2'. The pairs are
    one-hot encoded, aligned to the columns of ``final`` and passed to
    ``logreg.predict``.

    Parameters
    ----------
    matches : sequence of pairs
        Each item holds the two team names at index 0 and index 1.
    ranking : pandas.DataFrame
        Must contain a 'Team' and a 'Position' column.
    final : pandas.DataFrame
        Frame whose columns define the feature layout. It must contain a
        'Winner' column, which is removed before predicting.
    logreg : object
        Fitted model exposing ``predict``.

    Returns
    -------
    tuple
        The predicted winner of each match, in match order. Empty when
        ``matches`` is empty.

    Raises
    ------
    IndexError
        If a team name is not present in ``ranking['Team']``.
    """
    # Nothing to predict: avoid encoding an empty frame, which has no team columns
    if len(matches) == 0:
        return ()

    def position_of(team):
        # .iloc[0] raises IndexError when the team is missing from the ranking
        return ranking.loc[ranking['Team'] == team, 'Position'].iloc[0]

    # Order each pair so that the better-ranked side is 'Team_1'.
    # On equal positions the pair is swapped, because the comparison is strict.
    rows = []
    for match in matches:
        first, second = match[0], match[1]
        if position_of(first) < position_of(second):
            rows.append({'Team_1': first, 'Team_2': second})
        else:
            rows.append({'Team_1': second, 'Team_2': first})

    # Plain team names, kept for reading the sides back after encoding
    pairs = pd.DataFrame(rows)

    # One-hot encode, then take exactly the columns of `final` in the same order:
    # columns missing from the pairs are filled with 0, extra ones are left out.
    encoded = pd.get_dummies(pairs, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'])
    encoded = encoded.reindex(columns=final.columns, fill_value=0)

    # 'Winner' is the label, not a feature
    encoded = encoded.drop(['Winner'], axis=1)

    # Predict!
    predictions = logreg.predict(encoded)
    final_results = []

    for i in range(len(encoded)):
        favourite = pairs.iloc[i, 0]   # 'Team_1' column
        underdog = pairs.iloc[i, 1]    # 'Team_2' column

        # A model fitted on winner names predicts a team name, which a comparison
        # against the number 1 alone can never match. The underdog wins when the
        # model names it (the numeric label 1 is still honoured); any other label
        # falls back to the favourite.
        if predictions[i] == underdog or predictions[i] == 1:
            winner = underdog
        else:
            winner = favourite
        final_results.append(winner)

        # Print the match details and winner
        print(underdog + " and " + favourite)
        print("Winner: " + winner)
        print("")

    # Return the winners as a tuple, one entry per match
    return tuple(final_results)


In [None]:
# Predict both semi-finals; the result holds one winner per pairing
semi_finals_results = clean_and_predict(
    matches=semi_finals,
    ranking=ranking,
    final=final,
    logreg=logreg,
)



In [None]:
# The two semi-final winners meet in the final (unpacking requires exactly two results)
winner1, winner2 = semi_finals_results
finals = [(winner1, winner2)]

# Show the pairing of the final
print("Finals:", finals, sep="\n")

In [None]:
def clean_and_predict_final(matches, ranking, final, logreg):
    """Predict and print the winner of each given match, returning nothing.

    This function used to be a line-for-line copy of ``clean_and_predict`` that
    built a list of winners and never used it. It now delegates to
    ``clean_and_predict``, which prints the same match and winner lines, and
    discards the returned winners.

    Parameters
    ----------
    matches : sequence of pairs
        Each item holds the two team names at index 0 and index 1.
    ranking : pandas.DataFrame
        Must contain a 'Team' and a 'Position' column.
    final : pandas.DataFrame
        Frame whose columns define the feature layout; must contain 'Winner'.
    logreg : object
        Fitted model exposing ``predict``.

    Returns
    -------
    None
    """
    # Only the printed output is wanted here; the winners tuple is dropped on purpose
    clean_and_predict(matches, ranking, final, logreg)


In [None]:
# Predict and print the winner of the final
clean_and_predict_final(
    matches=finals,
    ranking=ranking,
    final=final,
    logreg=logreg,
)