In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the adjusted Premier League match data into the working DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="premier_league_adjusted.csv")

#### The top referees in terms of the number of matches officiated

In [3]:
# Tally how many rows (matches) each referee appears in; value_counts()
# returns the tallies sorted from most to fewest matches.
referee_matches_count = df['Referee'].value_counts()

# Keep the ten busiest referees (widen or narrow the slice as needed).
top_referees = referee_matches_count.iloc[:10]

# Emit the heading and the ranking, one after the other.
print("Top Referees in Terms of Matches Officiated:", top_referees, sep="\n")

Top Referees in Terms of Matches Officiated:
Anthony Taylor    150
Michael Oliver    150
Paul Tierney      150
Simon Hooper      145
Robert Jones      130
Stuart Attwell    125
Andy Madley       120
Peter Bankes      105
Craig Pawson      105
David Coote       105
Name: Referee, dtype: int64


#### The correlation between the referee and the number of goals scored in a match

In [10]:
# Normalise the score separator: scores are written with an en dash ("2–1"),
# so swap it for an ASCII hyphen. regex=False forces a literal replacement
# regardless of the pandas version's default.
df['Score'] = df['Score'].str.replace('–', '-', regex=False)

# Total goals per match = sum of the numbers on both sides of the hyphen.
df['Total_Goals'] = df['Score'].apply(lambda x: sum(map(int, x.split('-'))))

# A referee's name is categorical, so it cannot be correlated with goals
# directly. The numeric proxy used here is the number of matches each referee
# has in the dataset, broadcast back onto every match row.
referee_workload = df.groupby('Referee')['Referee'].transform('count')

# Pearson correlation between per-match total goals and that match's
# referee workload (off-diagonal entry of the 2x2 correlation matrix).
correlation = np.corrcoef(df['Total_Goals'], referee_workload)[0, 1]

# Report exactly what was measured: the coefficient relates goals to how many
# matches the referee officiated, not to the referee's identity.
print(f"The correlation between a referee's number of matches officiated and the number of goals scored in a match is {correlation:.2f}")

The correlation between the referee and the number of goals scored in a match is 0.09


#### Referees associated with more red cards or penalties

In [11]:
# Per-referee "red card" and "penalty" totals, and the referee with the highest of each.
#
# NOTE(review): both columns below are parsed from 'Score', the same column the
# goals cell splits and sums as home + away goals. 'Red_Cards' is therefore the
# first number of the score (presumably the home side's goals) and 'Penalties'
# the second (presumably the away side's goals). Neither is disciplinary data,
# so the figures printed at the end are goal totals under the wrong label.
# TODO: derive these from real red-card / penalty columns if the dataset has
# them; otherwise drop or relabel this section.

# Labelled "red cards": actually the integer before the hyphen in 'Score'
# (0 when the score contains no hyphen).
df['Red_Cards'] = df['Score'].apply(lambda x: int(x.split('-')[0]) if '-' in x else 0)

# Labelled "penalties": actually the integer after the hyphen in 'Score'
# (0 when the score contains no hyphen).
df['Penalties'] = df['Score'].apply(lambda x: int(x.split('-')[1]) if '-' in x else 0)

# Sum each derived column over all matches officiated by the same referee.
referee_red_cards = df.groupby('Referee')['Red_Cards'].sum()
referee_penalties = df.groupby('Referee')['Penalties'].sum()

# idxmax() gives the referee (index label) with the largest total; max() gives that total.
referee_with_most_red_cards = referee_red_cards.idxmax()
most_red_cards_count = referee_red_cards.max()

referee_with_most_penalties = referee_penalties.idxmax()
most_penalties_count = referee_penalties.max()

# NOTE(review): the wording below inherits the mislabelling described at the top of this cell.
print(f"Referee with the most red cards: {referee_with_most_red_cards} ({most_red_cards_count} red cards)")
print(f"Referee with the most penalties: {referee_with_most_penalties} ({most_penalties_count} penalties)")

Referee with the most red cards: Simon Hooper (305 red cards)
Referee with the most penalties: Anthony Taylor (250 penalties)
