# Prediction of the World Cup 2023 Winner

# Importing Libraries

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Loading Datasets

In [51]:
world_cup = pd.read_csv("World_cup_2023.csv")
world_cup.head()

Unnamed: 0,Team_name,Team_ranking,Titles,Win_percentage_ODI,WC_matches,WC_match_won,Win_percent_WC,WC_match_loss,Loss_percent_WC,Tied,No_result,World_cup_winner,Recent_points,Rating
0,Australia,1,5,60.73,94,69,73.4,23,24.46,1,1,Yes,2714,118
1,Pakistan,2,1,52.78,79,45,56.96,32,40.5,0,2,Yes,2316,116
2,India,3,2,52.38,84,53,63.09,29,34.52,1,1,Yes,3807,115
3,New Zealand,4,0,45.89,89,54,60.67,33,37.07,1,1,No,2806,104
4,England,5,1,50.32,83,48,57.83,32,38.55,2,1,Yes,2426,101


In [52]:
results = pd.read_csv("results.csv")
results.head()

Unnamed: 0,date,team1,team2,winner,margin,Ground
0,17-04-2015,Bangladesh,Pakistan,Bangladesh,won by 79 runs,Shere Bangla National Stadium
1,19-04-2015,Bangladesh,Pakistan,Bangladesh,won by 7 wickets,Shere Bangla National Stadium
2,22-04-2015,Bangladesh,Pakistan,Bangladesh,won by 8 wickets,Shere Bangla National Stadium
3,08-05-2015,Ireland,England,No result,No result,The Village
4,26-05-2015,Pakistan,Zimbabwe,Pakistan,won by 41 runs,Gaddafi Stadium


In [53]:
latest = pd.read_csv('latest_data.csv')
latest.head()

Unnamed: 0,date,team1,team2,winner,margin,venue
0,05-10-2023,England,New Zealand,New Zealand,9 wickets,"Narendra Modi Stadium, Ahmedabad"
1,06-10-2023,Netherlands,Pakistan,Pakistan,81 runs,"Rajiv Gandhi International Stadium, Hyderabad"
2,07-10-2023,Afghanistan,Bangladesh,Bangladesh,6 wickets,"Himachal Pradesh Cricket Association Stadium, ..."
3,07-10-2023,South Africa,Sri Lanka,South Africa,102 runs,"Arun Jaitley Stadium, Delhi"
4,08-10-2023,India,Australia,India,6 wickets,"MA Chidambaram Stadium, Chennai"


In [54]:
print(f'World_cup data shape  : {world_cup.shape}')
print(f'Results data shape  : {results.shape}')
print(f'Latest world cup data  shape  : {latest.shape}')

World_cup data shape  : (10, 14)
Results data shape  : (764, 6)
Latest world cup data  shape  : (42, 6)


In [55]:
results = pd.concat([results, latest], axis=0)

results = results.reset_index(drop = True)
results.tail(5)

Unnamed: 0,date,team1,team2,winner,margin,Ground,venue
801,06-11-2023,Bangladesh,Sri Lanka,Bangladesh,3 wickets,,"Arun Jaitley Stadium, Delhi"
802,07-11-2023,Afghanistan,Australia,Australia,3 wickets,,"Wankhede Stadium, Mumbai"
803,08-11-2023,England,Netherlands,England,160 runs,,"Maharashtra Cricket Association Stadium, Pune"
804,09-11-2023,New Zealand,Sri Lanka,New Zealand,5 wickets,,"M Chinnaswamy Stadium, Bengaluru, Bangalore"
805,10-11-2023,Afghanistan,South Africa,South Africa,5 wickets,,"Narendra Modi Stadium, Ahmedabad"


In [56]:
results.shape

(806, 7)

In [57]:
results.columns

Index(['date', 'team1', 'team2', 'winner', 'margin', 'Ground', 'venue'], dtype='object')

In [58]:
results.drop(columns=['date','margin', 'Ground','venue'], axis=1, inplace=True)

results.head()

Unnamed: 0,team1,team2,winner
0,Bangladesh,Pakistan,Bangladesh
1,Bangladesh,Pakistan,Bangladesh
2,Bangladesh,Pakistan,Bangladesh
3,Ireland,England,No result
4,Pakistan,Zimbabwe,Pakistan


# Filtering WC Teams from overall data

In [59]:
world_cup_teams = ['England' , 'South Africa' , 'Netherlands' , 'Pakistan', 'New Zealand' ,
                   'Sri Lanka', 'Afganistan', 'Australia', 'Bangladesh', 'India']

In [60]:
df_teams_1 = results[results['team1'].isin(world_cup_teams)]
df_teams_2 = results[results['team2'].isin(world_cup_teams)]
df_winners = results[results['winner'].isin(world_cup_teams)]

df_team = pd.concat((df_teams_1, df_teams_2, df_winners), axis=0)

df_team.head()

Unnamed: 0,team1,team2,winner
0,Bangladesh,Pakistan,Bangladesh
1,Bangladesh,Pakistan,Bangladesh
2,Bangladesh,Pakistan,Bangladesh
4,Pakistan,Zimbabwe,Pakistan
5,Pakistan,Zimbabwe,Pakistan


# Categorical Encoding

In [61]:
df_team.loc[:,'Winning'] = np.where(df_team['winner']==df_team['team1'],1,2)
df_team.head()

Unnamed: 0,team1,team2,winner,Winning
0,Bangladesh,Pakistan,Bangladesh,1
1,Bangladesh,Pakistan,Bangladesh,1
2,Bangladesh,Pakistan,Bangladesh,1
4,Pakistan,Zimbabwe,Pakistan,1
5,Pakistan,Zimbabwe,Pakistan,1


In [62]:
df_team.drop(columns=['winner'], axis=1, inplace=True)
df_team.head()

Unnamed: 0,team1,team2,Winning
0,Bangladesh,Pakistan,1
1,Bangladesh,Pakistan,1
2,Bangladesh,Pakistan,1
4,Pakistan,Zimbabwe,1
5,Pakistan,Zimbabwe,1


In [63]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

#Apply the encoding
df_team= pd.get_dummies(df_team, prefix=['team1', 'team2'], columns = ['team1', 'team2'], dtype=int, sparse=False)
df_team.head()

Unnamed: 0,Winning,team1_Afghanistan,team1_Australia,team1_Bangladesh,team1_England,team1_Hong Kong,team1_India,team1_Ireland,team1_Nepal,team1_Netherlands,...,team2_Netherlands,team2_New Zealand,team2_Oman,team2_Pakistan,team2_Scotland,team2_South Africa,team2_Sri Lanka,team2_United Arab Emirates,team2_West Indies,team2_Zimbabwe
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Choosing the best Classification Algorithm

In [64]:
x = df_team.drop(columns=['Winning'], axis=1)
y = df_team['Winning']
# Splitting the data in the training and testing set
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=34)

In [65]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

#Define classifiers

classifiers = {
    'Random Forest' : RandomForestClassifier(),
    'Logistic Regression' : LogisticRegression(),
    'Decision Tree' : DecisionTreeClassifier(),
    'Support Vector Classifier' : SVC()
}

for name, clf in classifiers.items():
  pipeline = Pipeline([('classifier',clf)])

  pipeline.fit(x_train, y_train)

  #Make Predictions
  y_pred = pipeline.predict(x_test)

  #Calculate the accuracy
  acc = accuracy_score(y_test, y_pred)

  print(f'{name}: ')
  print(f"Accuracy : {acc:.4f}")

Random Forest: 
Accuracy : 0.7317
Logistic Regression: 
Accuracy : 0.6802
Decision Tree: 
Accuracy : 0.7317
Support Vector Classifier: 
Accuracy : 0.7344


# Random Forest Algorithm

In [66]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)

In [67]:
predictions = rf.predict(x_test)

In [68]:
predictions

array([2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1,
       1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1,
       2, 2, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2,
       1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1,
       2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 2, 1,
       1, 2, 1, 1, 2, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 2, 1,
       1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 1, 1, 2,
       2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 2, 1, 1, 2,
       2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2,
       1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1,

In [69]:
label_to_team = {1: 'team1' , 2:'team2'}

Winner =[label_to_team[label] for label in predictions]

print(Winner)

['team2', 'team1', 'team1', 'team1', 'team2', 'team2', 'team1', 'team1', 'team2', 'team2', 'team1', 'team1', 'team2', 'team2', 'team1', 'team1', 'team1', 'team1', 'team2', 'team1', 'team2', 'team1', 'team1', 'team1', 'team2', 'team1', 'team1', 'team2', 'team2', 'team1', 'team1', 'team2', 'team1', 'team1', 'team1', 'team1', 'team2', 'team1', 'team2', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team2', 'team2', 'team1', 'team1', 'team1', 'team1', 'team2', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team2', 'team2', 'team2', 'team2', 'team1', 'team2', 'team2', 'team2', 'team1', 'team2', 'team1', 'team1', 'team1', 'team2', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team2', 'team1', 'team2', 'team1', 'team2', 'team1', 'team2', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team1', 'team2', 'team2', 'team1', 'team1', 'team1', 'team1', 'team1', 'team2', 'team1', 'team1', 'team1', 'team2', 

# Predicting for current Fixtures

In [70]:
rankings = pd.read_csv('Icc_ranking.csv')
rankings.head()

Unnamed: 0,Team_ranking,Team_name,Rating
0,1,Australia,118
1,2,Pakistan,116
2,3,India,115
3,4,New Zealand,104
4,5,England,101


In [71]:
fixtures = pd.read_csv("Fixtures.csv")
fixtures.head()

Unnamed: 0,Round_number,Team_1,Team_2,Date,Location,Group,Result
0,1,England,New Zealand,5/10/2023,"Narendra Modi Stadium, Ahmedabad",Group A,
1,1,Pakistan,Netherlands,6/10/2023,"Rajiv Gandhi International Stadium, Hyderabad",Group A,
2,1,Bangladesh,Afghanistan,7/10/2023,"Himachal Pradesh Cricket Association Stadium, ...",Group A,
3,1,South Africa,Sri Lanka,7/10/2023,"Arun Jaitley Stadium, Delhi",Group A,
4,1,India,Australia,8/10/2023,"MA Chidambaram Stadium, Chennai",Group A,


In [72]:
pred_set=[]

fixtures.insert(1, 'first_position', fixtures['Team_1'].map(rankings.set_index('Team_name')['Team_ranking']))
fixtures.insert(2, 'second_position', fixtures['Team_2'].map(rankings.set_index('Team_name')['Team_ranking']))

fixtures = fixtures.iloc[:80, :]
fixtures.head()

Unnamed: 0,Round_number,first_position,second_position,Team_1,Team_2,Date,Location,Group,Result
0,1,5.0,,England,New Zealand,5/10/2023,"Narendra Modi Stadium, Ahmedabad",Group A,
1,1,,10.0,Pakistan,Netherlands,6/10/2023,"Rajiv Gandhi International Stadium, Hyderabad",Group A,
2,1,7.0,8.0,Bangladesh,Afghanistan,7/10/2023,"Himachal Pradesh Cricket Association Stadium, ...",Group A,
3,1,6.0,9.0,South Africa,Sri Lanka,7/10/2023,"Arun Jaitley Stadium, Delhi",Group A,
4,1,3.0,1.0,India,Australia,8/10/2023,"MA Chidambaram Stadium, Chennai",Group A,


In [73]:
for index, row in fixtures.iterrows():
  if row['first_position'] < row['second_position']:
    pred_set.append({'Team_1' :row['Team_1'] , 'Team_2' :row['Team_2'] , 'Winning_team' : None })
  else:
    pred_set.append({'Team_1' :row['Team_2'] , 'Team_2' :row['Team_1'] , 'Winning_team' : None })

pred_set = pd.DataFrame(pred_set)

pred_set.head()

Unnamed: 0,Team_1,Team_2,Winning_team
0,New Zealand,England,
1,Netherlands,Pakistan,
2,Bangladesh,Afghanistan,
3,South Africa,Sri Lanka,
4,Australia,India,


In [74]:
backup_pred_set = pred_set

In [75]:
pred_set = pd.get_dummies(pred_set, prefix=['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'], dtype=int)

missing_cols = set(df_team.columns) - set(pred_set.columns)

In [76]:
for cols in missing_cols :
  pred_set[cols] = 0

pred_set = pred_set[df_team.columns]

pred_set = pred_set.drop(['Winning'], axis=1)
pred_set.head()

Unnamed: 0,team1_Afghanistan,team1_Australia,team1_Bangladesh,team1_England,team1_Hong Kong,team1_India,team1_Ireland,team1_Nepal,team1_Netherlands,team1_New Zealand,...,team2_Netherlands,team2_New Zealand,team2_Oman,team2_Pakistan,team2_Scotland,team2_South Africa,team2_Sri Lanka,team2_United Arab Emirates,team2_West Indies,team2_Zimbabwe
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Group Stage matches

In [77]:
predictions = rf.predict(pred_set)
for i in range(fixtures.shape[0]):
  print(backup_pred_set.iloc[i,1] + " Vs " + backup_pred_set.iloc[i,0])
  if predictions[i]==1 :
    print('Winner : ' + backup_pred_set.iloc[i,0])
  else :
    print('Winner : ' + backup_pred_set.iloc[i,1])
  print("")

England Vs New Zealand
Winner : New Zealand

Pakistan  Vs Netherlands
Winner : Netherlands

Afghanistan Vs Bangladesh
Winner : Bangladesh

Sri Lanka Vs South Africa
Winner : South Africa

India Vs Australia
Winner : Australia

New Zealand Vs Netherlands
Winner : Netherlands

Bangladesh Vs England
Winner : England

Afghanistan Vs India
Winner : India

Pakistan  Vs Sri Lanka
Winner : Sri Lanka

South Africa Vs Australia
Winner : Australia

New Zealand Vs Bangladesh
Winner : Bangladesh

Afghanistan Vs England
Winner : England

India Vs Pakistan
Winner : Pakistan

Sri Lanka Vs Australia
Winner : Australia

Netherlands Vs South Africa
Winner : South Africa

New Zealand Vs Afghanistan
Winner : Afghanistan

Bangladesh Vs India
Winner : India

Pakistan Vs Australia
Winner : Australia

Netherlands Vs Sri Lanka
Winner : Sri Lanka

South Africa Vs England
Winner : England

India Vs New Zealand
Winner : New Zealand

Pakistan  Vs Afghanistan
Winner : Afghanistan

Bangladesh Vs South Africa
Winner :

In [78]:
latest.head()

Unnamed: 0,date,team1,team2,winner,margin,venue
0,05-10-2023,England,New Zealand,New Zealand,9 wickets,"Narendra Modi Stadium, Ahmedabad"
1,06-10-2023,Netherlands,Pakistan,Pakistan,81 runs,"Rajiv Gandhi International Stadium, Hyderabad"
2,07-10-2023,Afghanistan,Bangladesh,Bangladesh,6 wickets,"Himachal Pradesh Cricket Association Stadium, ..."
3,07-10-2023,South Africa,Sri Lanka,South Africa,102 runs,"Arun Jaitley Stadium, Delhi"
4,08-10-2023,India,Australia,India,6 wickets,"MA Chidambaram Stadium, Chennai"


## Semi-Finalists

In [79]:
top_winners = latest['winner'].value_counts().head(4).index.tolist()

print(f"Top 4 teams : {top_winners}")

Top 4 teams : ['India', 'South Africa', 'Australia', 'New Zealand']


In [80]:
# Predict the single match results of future

def predict_single_match(model, rankings, team_1, team_2):
  single_match_data = pd.DataFrame({
      'Team_1': [team_1],
      'Team_2':[team_2]
  })

  #Insert the team ranking data
  single_match_data.insert(1, 'first_position',single_match_data['Team_1'].map(rankings.set_index("Team_name")['Team_ranking']))
  single_match_data.insert(2, 'second_position',single_match_data['Team_2'].map(rankings.set_index("Team_name")['Team_ranking']))

  # Apply one hot encoding
  single_match_data = pd.get_dummies(single_match_data, prefix = ['Team_1', 'Team_2'], columns=['Team_1', 'Team_2'], dtype=int)

  #Find the missing columns
  missing_cols = set(df_team.columns) - set(single_match_data.columns)

  #Set the missing columns to 0 and then keep only th ecolumns present

  for col in missing_cols:
    single_match_data[col] = 0

  single_match_data = single_match_data[df_team.columns]

  #Drop the winning column
  single_match_data = single_match_data.drop(['Winning'], axis=1)

  #Making the prediction
  prediction = model.predict(single_match_data)

  #print the result
  print(f"{team_1} vs {team_2}")

  if prediction[0] ==1 :
    print(f"Winner: {team_1}")
  else:
    print(f"Winner: {team_2}")

  print((""))


## Finalists

In [81]:
predict_single_match(rf, rankings, "India", "New Zealand")

India vs New Zealand
Winner: India



In [82]:
predict_single_match(rf, rankings, "Australia", "South Africa")

Australia vs South Africa
Winner: Australia



## The Last Show

## Decision Tree

In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create the dataset
data = {
    'Year': [1975, 1979, 1983, 1987, 1992, 1996, 1999, 2003, 2007, 2011, 2015, 2019],
    'Winner': ['West Indies', 'West Indies', 'India', 'Australia', 'Pakistan', 'Sri Lanka', 'Australia', 'Australia', 'Australia', 'India', 'Australia', 'England'],
    'Main_Hosting_Country': ['England', 'England', 'England', 'India', 'Australia', 'Pakistan', 'England', 'South Africa', 'West Indies', 'India', 'Australia', 'England'],
    'team1': ['Australia', 'England', 'West Indies', 'England', 'Pakistan', 'Australia', 'Pakistan', 'India', 'Sri Lanka', 'Sri Lanka', 'New Zealand', 'New Zealand'],
    'team2': ['West Indies', 'West Indies', 'India', 'Australia', 'England', 'Sri Lanka', 'Australia', 'Australia', 'Australia', 'India', 'Australia', 'New Zealand']
}

df = pd.DataFrame(data)

# Convert categorical variables to numerical labels
df['Main_Hosting_Country'] = pd.Categorical(df['Main_Hosting_Country']).codes
df['team1'] = pd.Categorical(df['team1']).codes
df['team2'] = pd.Categorical(df['team2']).codes

# Create feature matrix (X) and target variable (y)
X = df[['Year', 'Main_Hosting_Country', 'team1', 'team2']]
y = df['Winner']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Decision Tree Classifier
clf = DecisionTreeClassifier(random_state=42)

# Train the model
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_rep}")

Accuracy: 0.6666666666666666
Classification Report:
              precision    recall  f1-score   support

   Australia       1.00      1.00      1.00         1
     England       0.00      0.00      0.00         0
       India       0.00      0.00      0.00         1
 West Indies       1.00      1.00      1.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.50      0.50         3
weighted avg       0.67      0.67      0.67         3



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Create the dataset
data = {
    'Year': [1975, 1979, 1983, 1987, 1992, 1996, 1999, 2003, 2007, 2011, 2015, 2019],
    'Winner': ['West Indies', 'West Indies', 'India', 'Australia', 'Pakistan', 'Sri Lanka', 'Australia', 'Australia', 'Australia', 'India', 'Australia', 'England'],
    'Main_Hosting_Country': ['England', 'England', 'England', 'India', 'Australia', 'Pakistan', 'England', 'South Africa', 'West Indies', 'India', 'Australia', 'England'],
    'team1': ['Australia', 'England', 'West Indies', 'England', 'Pakistan', 'Australia', 'Pakistan', 'India', 'Sri Lanka', 'Sri Lanka', 'New Zealand', 'New Zealand'],
    'team2': ['West Indies', 'West Indies', 'India', 'Australia', 'England', 'Sri Lanka', 'Australia', 'Australia', 'Australia', 'India', 'Australia', 'New Zealand']
}

df = pd.DataFrame(data)

# Convert categorical variables to numerical labels
df['Main_Hosting_Country'] = pd.Categorical(df['Main_Hosting_Country']).codes
df['team1'] = pd.Categorical(df['team1']).codes
df['team2'] = pd.Categorical(df['team2']).codes

# Create feature matrix (X) and target variable (y)
X = df[['Year', 'Main_Hosting_Country', 'team1', 'team2']]
y = df['Winner']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Classifier
clf_rf = RandomForestClassifier(random_state=42)

# Train the model
clf_rf.fit(X_train, y_train)

# Make predictions on the test set
y_pred_rf = clf_rf.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
classification_rep_rf = classification_report(y_test, y_pred_rf)
confusion_mat_rf = confusion_matrix(y_test, y_pred_rf, labels=df['Winner'].unique())

print("Results using Random Forest:")
print(f"Accuracy: {accuracy_rf}")
print(f"Classification Report:\n{classification_rep_rf}")

Results using Random Forest:
Accuracy: 0.6666666666666666
Classification Report:
              precision    recall  f1-score   support

   Australia       0.50      1.00      0.67         1
       India       0.00      0.00      0.00         1
 West Indies       1.00      1.00      1.00         1

    accuracy                           0.67         3
   macro avg       0.50      0.67      0.56         3
weighted avg       0.50      0.67      0.56         3

Confusion Matrix:
[[1 0 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [45]:
# Assuming you've already trained the model (clf) as per the previous example

# Create a DataFrame for the new match
new_match = pd.DataFrame({
    'Year': [2023],  # Replace with the actual year of the match
    'Main_Hosting_Country': ['India'],
    'team1': ['India'],
    'team2': ['Australia']
})

# Convert categorical variables to numerical labels
new_match['Main_Hosting_Country'] = pd.Categorical(new_match['Main_Hosting_Country']).codes
new_match['team1'] = pd.Categorical(new_match['team1']).codes
new_match['team2'] = pd.Categorical(new_match['team2']).codes

# Make predictions for the new match
prediction = clf.predict(new_match[['Year', 'Main_Hosting_Country', 'team1', 'team2']])

# Display the prediction
print(f"Prediction: {prediction[0]}")


Prediction: Australia


## Logistic Regression

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Create the dataset
data = {
    'Year': [1975, 1979, 1983, 1987, 1992, 1996, 1999, 2003, 2007, 2011, 2015, 2019],
    'Winner': ['West Indies', 'West Indies', 'India', 'Australia', 'Pakistan', 'Sri Lanka', 'Australia', 'Australia', 'Australia', 'India', 'Australia', 'England'],
    'Main_Hosting_Country': ['England', 'England', 'England', 'India', 'Australia', 'Pakistan', 'England', 'South Africa', 'West Indies', 'India', 'Australia', 'England'],
    'team1': ['Australia', 'England', 'West Indies', 'England', 'Pakistan', 'Australia', 'Pakistan', 'India', 'Sri Lanka', 'Sri Lanka', 'New Zealand', 'New Zealand'],
    'team2': ['West Indies', 'West Indies', 'India', 'Australia', 'England', 'Sri Lanka', 'Australia', 'Australia', 'Australia', 'India', 'Australia', 'New Zealand']
}

df = pd.DataFrame(data)

# Convert categorical variables to numerical labels
df['Main_Hosting_Country'] = pd.Categorical(df['Main_Hosting_Country']).codes
df['team1'] = pd.Categorical(df['team1']).codes
df['team2'] = pd.Categorical(df['team2']).codes

# Create feature matrix (X) and target variable (y)
X = df[['Year', 'Main_Hosting_Country', 'team1', 'team2']]
y = df['Winner']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a regularized Logistic Regression model
# You can adjust the value of C for regularization strength
reg_logistic = LogisticRegression(penalty='l2', C=1.0, random_state=42)

# Train the model
reg_logistic.fit(X_train, y_train)

# Make predictions on the test set
y_pred = reg_logistic.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
confusion_mat = confusion_matrix(y_test, y_pred, labels=df['Winner'].unique())

print("Results using Regularized Logistic Regression:")
print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_rep}")
print(f"Confusion Matrix:\n{confusion_mat}")


Results using Regularized Logistic Regression:
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

   Australia       1.00      1.00      1.00         1
       India       1.00      1.00      1.00         1
 West Indies       1.00      1.00      1.00         1

    accuracy                           1.00         3
   macro avg       1.00      1.00      1.00         3
weighted avg       1.00      1.00      1.00         3

Confusion Matrix:
[[1 0 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [85]:

# Create a DataFrame for the new match
new_match = pd.DataFrame({
    'Year': [2023],  # Replace with the actual year of the match
    'Main_Hosting_Country': ['India'],
    'team1': ['India'],
    'team2': ['Australia']
})

# Convert categorical variables to numerical labels
new_match['Main_Hosting_Country'] = pd.Categorical(new_match['Main_Hosting_Country']).codes
new_match['team1'] = pd.Categorical(new_match['team1']).codes
new_match['team2'] = pd.Categorical(new_match['team2']).codes

# Make predictions for the new match
prediction = reg_logistic.predict(new_match[['Year', 'Main_Hosting_Country', 'team1', 'team2']])

# Display the prediction
print(f"Prediction: {prediction[0]}")


Prediction: Australia


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Target variable (y) and feature matrix (X)
X = df.drop('winner', axis=1)  # Exclude the target variable
y = df['winner']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naive Bayes
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
nb_predictions = nb_classifier.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_classification_report = classification_report(y_test, nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Naive Bayes Classification Report:\n", nb_classification_report)

# Support Vector Machine
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_classification_report = classification_report(y_test, svm_predictions)
print("\nSupport Vector Machine Accuracy:", svm_accuracy)
print("Support Vector Machine Classification Report:\n", svm_classification_report)

# Random Forest
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_classification_report = classification_report(y_test, rf_predictions)
print("\nRandom Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_classification_report)


Naive Bayes Accuracy: 0.4444444444444444
Naive Bayes Classification Report:
               precision    recall  f1-score   support

 Afghanistan       0.00      0.00      0.00         1
   Australia       0.00      0.00      0.00         2
     England       0.00      0.00      0.00         1
       India       0.50      1.00      0.67         2
 New Zealand       0.00      0.00      0.00         0
    Pakistan       0.00      0.00      0.00         1
South Africa       0.50      1.00      0.67         2

    accuracy                           0.44         9
   macro avg       0.14      0.29      0.19         9
weighted avg       0.22      0.44      0.30         9


Support Vector Machine Accuracy: 0.4444444444444444
Support Vector Machine Classification Report:
               precision    recall  f1-score   support

 Afghanistan       0.00      0.00      0.00         1
   Australia       0.00      0.00      0.00         2
     England       0.00      0.00      0.00         1
       In

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
# end of the notebook