<h1>ML Task 1: Team A vs Team B; who wins</h1>

In [1]:
import pandas as pd

df = pd.read_csv('ml_task1_table.csv')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


# Drop unnecessary columns
df = df.drop(columns=['game_id'])

# Define features and target variable
X = df.drop(columns=['win'])
y = df['win']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('encoder', OneHotEncoder(), ['home_team_name', 'away_team_name']),
    ],
    remainder='passthrough'
)

# Define the model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9994044073853484


In [9]:
# Prompt user to input team names
input_teams = [input("Enter the home team name: "), input("Enter the away team name: ")]

# Function to fetch historical goal information
def get_goal_information(team1, team2, historical_data):
    match = historical_data[((historical_data['home_team_name'] == team1) & (historical_data['away_team_name'] == team2)) |
                             ((historical_data['home_team_name'] == team2) & (historical_data['away_team_name'] == team1))]
    
    if not match.empty:
        own_goals = match['own_goals'].values[0]
        opponent_goals = match['opponent_goals'].values[0]
        return own_goals, opponent_goals
    else:
        # Handle the case where there's no historical data for the given pair of teams
        return 0, 0

# Get historical goal information
own_goals, opponent_goals = get_goal_information(input_teams[0], input_teams[1], df)

# Create a DataFrame with the input data
input_data = pd.DataFrame({
    'home_team_name': [input_teams[0]],
    'away_team_name': [input_teams[1]],
    'own_goals': [own_goals],
    'opponent_goals': [opponent_goals],
    'hosting': [0],  # You need to determine whether it's a home or away game
})

# Use the preprocessor and model for prediction
input_features = model.named_steps['preprocessor'].transform(input_data)
prediction = model.named_steps['classifier'].predict(input_features)

# Convert the numeric prediction to a human-readable result
result = f'{input_teams[0]} wins' if prediction[0] == 1 else f'{input_teams[1]} wins' if prediction[0] == 2 else 'It\'s a draw'

print(f'Prediction for {input_teams[0]} vs {input_teams[1]}: {result}')

Enter the home team name: Bayern Munich
Enter the away team name: Ajax Amsterdam
Prediction for Bayern Munich vs Ajax Amsterdam: It's a draw


<h1>ML Task 2: Top scorer prediction</h1>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

df = pd.read_csv('ml_task2_table.csv')

# Preprocessing
df['average_goals'] = df.loc[:, '2012':'2023'].mean(axis=1)
df = df[df['Team'] != 'retired']

# Handle missing values
features = ['average_goals', 'Age'] + [str(year) for year in range(2013, 2024)]
X = df[features]
y = df['2023']  # Target variable, goals scored in 2023

# Impute missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Model Training
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)

# Model Evaluation (Optional)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

# Impute missing values in the prediction dataset
X_pred = imputer.transform(df[features])



Mean Squared Error: 4.127501155536134e-29




In [5]:
# Prediction for 2024
df['predicted_goals_2024'] = model.predict(X_pred)

# Identify Top 3 Scorers in 2024
top_scorers_2024 = df.nlargest(3, 'predicted_goals_2024')

print("Top 3 Scorers in 2024:")
print(" ")
for index, row in top_scorers_2024.iterrows():
    print(f"{row['Name']} with {int(row['predicted_goals_2024'])} goals.")


Top 3 Scorers in 2024:
 
Erling Haaland with 36 goals.
Harry Kane with 30 goals.
Kylian Mbappe with 29 goals.
