In [585]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import FunctionTransformer

# 1.a


### Loading Datasets


In [586]:
attendance = pd.read_csv("data/attendance.csv")
referees = pd.read_csv("data/referees.csv")
matches = pd.read_csv("data/fjelstul/matches.csv")
stadiums = pd.read_csv("data/fjelstul/stadiums.csv")
teams = pd.read_csv("data/fjelstul/teams.csv")


## Exploring Each Dataframe


In [587]:
# print("attendance df shape: ", attendance.shape)
# print("attendance df columns: ", attendance.columns)
# print("Nan count:")
# attendance.isnull().sum()


In [588]:
# print("referees df shape: ", referees.shape)
# print("referees df columns: ", referees.columns)
# print("Nan count:")
# referees.isnull().sum()


In [589]:
# print("matches df shape: ", matches.shape)
# print("matches df columns: ", matches.columns)
# print("Nan count:")
# matches.isnull().sum()


In [590]:
# print("stadiums df shape: ", stadiums.shape)
# print("stadiums df columns: ", stadiums.columns)
# print("Nan count:")
# stadiums.isnull().sum()


In [591]:
# print("teams df shape: ", teams.shape)
# print("teams df columns: ", teams.columns)
# print("Nan count:")
# teams.isnull().sum()


### Adding Stadium Capacity To Matches Dataframe based on `stadium_id`


In [592]:
# Attach each stadium's capacity to its matches via the shared stadium_id key.
stadium_capacity = stadiums[["stadium_id", "stadium_capacity"]]
matches_std = matches.merge(stadium_capacity, on="stadium_id")


### Setting MultiIndex For Attendance and Referees as Shared Index


In [593]:
attendance = attendance.set_index(["home_team", "away_team", "Date"])
referees = referees.set_index(["home_team", "away_team", "Date"])


### Checking if all indices are mutual


In [594]:
shared_indices = (set(attendance.index.to_list())
                  & set(referees.index.to_list()))
len(shared_indices) - attendance.shape[0]


0

### Concatenating Attendance And Referees Dataframes


In [595]:
attendance_referees = pd.concat([attendance, referees], axis=1)
attendance_referees = attendance_referees.reset_index()
attendance_referees.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 964 entries, 0 to 963
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   home_team   964 non-null    object
 1   away_team   964 non-null    object
 2   Date        964 non-null    object
 3   Attendance  964 non-null    int64 
 4   Officials   709 non-null    object
dtypes: int64(1), object(4)
memory usage: 37.8+ KB


### Checking if all team names in attendance dataframe are correct


In [596]:
# Right-join the attendance/referee rows onto the teams table by home team
# name; rows whose home_team has no match in `teams` end up with a NaN
# team_code, which the next cell uses to list unknown names.
# NOTE(review): only home_team is joined here, so a name that occurs solely
# in away_team would not be reported by this check — confirm that is intended.
atnd_team_df = pd.merge(
    teams[["team_name", "team_code"]],
    attendance_referees,
    left_on=["team_name"],
    right_on=["home_team"],
    how='right'
)


In [597]:
print("Teams that are not exist in teams tabel:")
atnd_team_df.loc[atnd_team_df["team_code"].isna(), "home_team"].unique()


Teams that are not exist in teams tabel:


array(['Korea Republic', 'IR Iran', "Côte d'Ivoire", 'Korea DPR',
       'Türkiye', 'China PR', 'FR Yugoslavia', 'Germany DR'], dtype=object)

### Normalizing all team names


In [598]:
# Names used by the attendance data -> names used by the teams table.
team_name_map = {
    "Türkiye": "Turkey",
    "Korea Republic": "South Korea",
    "IR Iran": "Iran",
    "Côte d'Ivoire": "Ivory Coast",
    "Korea DPR": "North Korea",
    "China PR": "China",
    "FR Yugoslavia": "Yugoslavia",
    "Germany DR": "East Germany"
}


def mapper(team):
    """Return the normalised name for `team`, or `team` itself when no alias is known."""
    return team_name_map.get(team, team)


In [599]:
attendance_referees["home_team"] = attendance_referees["home_team"].map(mapper)
attendance_referees["away_team"] = attendance_referees["away_team"].map(mapper)


### Setting a unique code name `sorted(home_team + away_team)` for each row because home_team and away_team are commutative


In [600]:
# Order-independent pairing key: the two team names are concatenated and the
# resulting characters are sorted, so "A vs B" and "B vs A" give the same key.
# NOTE(review): the key is only the sorted characters, so two different
# pairings made of the same letters would collide — the later merge also
# matches on the date, which presumably makes this harmless; confirm.
attendance_referees["teams_code"] = ["".join(sorted(
    home+away)) for home, away in attendance_referees[["home_team", "away_team"]].values.tolist()]
# Same key for the matches table, built from its own team-name columns.
matches_std["teams_code"] = ["".join(sorted(
    home+away)) for home, away in matches_std[["home_team_name", "away_team_name"]].values.tolist()]


### Merging Matches and Attendance


In [601]:
merged_df = pd.merge(
    matches_std,
    attendance_referees,
    left_on=["teams_code", "match_date"],
    right_on=["teams_code", "Date"],
    how='outer'
)


### Checking If all rows are merged properly


In [602]:
mask = merged_df[["Attendance", "match_id"]].isna().any(axis=1)
merged_df[["home_team", "away_team", "home_team_name",
           "away_team_name", "Date", "match_date", "teams_code"]][mask]


Unnamed: 0,home_team,away_team,home_team_name,away_team_name,Date,match_date,teams_code
118,,,West Germany,Yugoslavia,,1954-06-27,GWYaaaeegilmnorsstuvy
127,,,West Germany,Turkey,,1954-06-23,GTWaeeekmnrrstuyy
130,,,West Germany,Turkey,,1954-06-17,GTWaeeekmnrrstuyy
133,,,West Germany,Hungary,,1954-07-04,GHWaaeegmnnrrstuyy
136,,,Hungary,West Germany,,1954-06-20,GHWaaeegmnnrrstuyy
139,,,West Germany,Austria,,1954-06-30,AGWaaeeimnrrssttuy
964,Germany,Hungary,,,1954-07-04,,GHaaegmnnrruyy
965,Germany,Austria,,,1954-06-30,,AGaaeimnrrstuy
966,Germany,Yugoslavia,,,1954-06-27,,GYaaaegilmnorsuvy
967,Germany,Turkey,,,1954-06-23,,GTaeekmnrruyy


### It seems that there are some mistakes in the Germany team name in the attendance data. Let's correct them by replacing Germany with West Germany, because the Germany team did not exist at that time (`1954`)


In [603]:
mask = merged_df[["match_id"]].isna().any(axis=1)
wrong_names = merged_df[["home_team", "away_team", "Date", "teams_code"]][mask]
wrong_names


Unnamed: 0,home_team,away_team,Date,teams_code
964,Germany,Hungary,1954-07-04,GHaaegmnnrruyy
965,Germany,Austria,1954-06-30,AGaaeimnrrstuy
966,Germany,Yugoslavia,1954-06-27,GYaaaegilmnorsuvy
967,Germany,Turkey,1954-06-23,GTaeekmnrruyy
968,Hungary,Germany,1954-06-20,GHaaegmnnrruyy
969,Germany,Turkey,1954-06-17,GTaeekmnrruyy


Replace the wrong name with the correct one


In [604]:
# Select the attendance rows that failed to merge: their Date and their
# teams_code both appear among the unmatched rows collected in wrong_names.
mask = attendance_referees["Date"].isin(
    wrong_names["Date"]) & attendance_referees["teams_code"].isin(wrong_names["teams_code"])

# In those rows only, rename "Germany" to "West Germany" in both team columns.
attendance_referees.loc[mask, ["home_team", "away_team"]] = attendance_referees.loc[mask, [
    "home_team", "away_team"]].replace("Germany", "West Germany")
# Rebuild the pairing key for every row so it reflects the corrected names.
attendance_referees["teams_code"] = ["".join(sorted(
    a+b)) for a, b in attendance_referees[["home_team", "away_team"]].values.tolist()]


### Let's Merge again


In [605]:
final_df = pd.merge(
    matches_std,
    attendance_referees,
    left_on=["teams_code", "match_date"],
    right_on=["teams_code", "Date"],
    how='outer'
)


In [606]:
mask = final_df[["Attendance", "match_id"]].isna().any(axis=1)
final_df[["home_team", "away_team", "home_team_name",
          "away_team_name", "Date", "match_date", "teams_code"]][mask]


Unnamed: 0,home_team,away_team,home_team_name,away_team_name,Date,match_date,teams_code


In [607]:
# Drop the helper columns that came from the attendance side of the merge;
# the matches table already carries the team names and the match date.
final_df = final_df.drop(
    columns=["home_team", "away_team", "teams_code", "Date"])
# Only "Officials" is renamed. The previous mapping also contained the no-op
# entry "attendance" -> "attendance"; it is removed rather than changed to
# "Attendance" -> "attendance" because the later feature-engineering cells
# read the column as matches["Attendance"].
final_df = final_df.rename(columns={"Officials": "officials"})
final_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 964 entries, 0 to 963
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   key_id                     964 non-null    int64 
 1   tournament_id              964 non-null    object
 2   tournament_name            964 non-null    object
 3   match_id                   964 non-null    object
 4   match_name                 964 non-null    object
 5   stage_name                 964 non-null    object
 6   group_name                 964 non-null    object
 7   group_stage                964 non-null    int64 
 8   knockout_stage             964 non-null    int64 
 9   replayed                   964 non-null    int64 
 10  replay                     964 non-null    int64 
 11  match_date                 964 non-null    object
 12  match_time                 964 non-null    object
 13  stadium_id                 964 non-null    object
 14  stadium_na

In [608]:
final_df.to_csv("data/matches_combined_data.csv", index=False)


# Done.


# 1.b


In [609]:
players = pd.read_csv('data/fjelstul/players.csv')
squads = pd.read_csv('data/fjelstul/squads.csv')


In [610]:
players.head()


Unnamed: 0,key_id,player_id,family_name,given_name,birth_date,goal_keeper,defender,midfielder,forward,count_tournaments,list_tournaments,player_wikipedia_link
0,1,P-08891,A'Court,Alan,1934-09-30,0,0,0,1,1,1958,https://en.wikipedia.org/wiki/Alan_A%27Court
1,2,P-08589,Aaronson,Brenden,2000-10-22,0,0,0,1,1,2022,https://en.wikipedia.org/wiki/Brenden_Aaronson
2,3,P-04897,Abadzhiev,Stefan,1934-07-03,0,0,0,1,1,1966,https://en.wikipedia.org/wiki/Stefan_Abadzhiev
3,4,P-05556,Abalo,Jean-Paul,1975-06-26,0,1,0,0,1,2006,https://en.wikipedia.org/wiki/Jean-Paul_Abalo
4,5,P-08163,Abanda,Patrice,1978-08-03,0,1,0,0,1,1998,https://en.wikipedia.org/wiki/Patrice_Abanda


In [611]:
players.shape


(8485, 12)

In [612]:
squads.head()


Unnamed: 0,key_id,tournament_id,tournament_name,team_id,team_name,team_code,player_id,family_name,given_name,shirt_number,position_name,position_code
0,1,WC-1930,1930 FIFA World Cup,T-03,Argentina,ARG,P-06987,Bossio,Ángel,0,goal keeper,GK
1,2,WC-1930,1930 FIFA World Cup,T-03,Argentina,ARG,P-00287,Botasso,Juan,0,goal keeper,GK
2,3,WC-1930,1930 FIFA World Cup,T-03,Argentina,ARG,P-01435,Cherro,Roberto,0,forward,FW
3,4,WC-1930,1930 FIFA World Cup,T-03,Argentina,ARG,P-01321,Chividini,Alberto,0,defender,DF
4,5,WC-1930,1930 FIFA World Cup,T-03,Argentina,ARG,P-08552,Della Torre,José,0,defender,DF


In [613]:
squads.shape


(10973, 12)

In [614]:
players.isnull().sum()


key_id                   0
player_id                0
family_name              0
given_name               0
birth_date               1
goal_keeper              0
defender                 0
midfielder               0
forward                  0
count_tournaments        0
list_tournaments         0
player_wikipedia_link    0
dtype: int64

In [615]:
squads.isnull().sum()


key_id             0
tournament_id      0
tournament_name    0
team_id            0
team_name          0
team_code          0
player_id          0
family_name        0
given_name         0
shirt_number       0
position_name      0
position_code      0
dtype: int64

In [616]:
len(squads.player_id.unique())


8485

Both tables contain the same number of players, so we can take the player info from either table.


-----------

get columns from player dataframe


In [617]:
players_data = players[['player_id', 'given_name',
                        'family_name', 'count_tournaments', 'list_tournaments']]

get columns from squads dataframe


In [618]:
teams_names_codes = squads[['player_id', 'team_name', 'team_code']]


number of teams for each player


In [619]:
# Number of distinct team codes each player has appeared under.
team_count = (
    teams_names_codes
    .groupby('player_id')
    .agg({'team_code': pd.Series.nunique})
    .rename(columns={'team_code': 'team_count'})
)


In [620]:
teams_names_codes_count = teams_names_codes.merge(team_count, on='player_id')
teams_names_codes_count


Unnamed: 0,player_id,team_name,team_code,team_count
0,P-06987,Argentina,ARG,1
1,P-00287,Argentina,ARG,1
2,P-01435,Argentina,ARG,1
3,P-01321,Argentina,ARG,1
4,P-08552,Argentina,ARG,1
...,...,...,...,...
10968,P-08129,Wales,WAL,1
10969,P-05661,Wales,WAL,1
10970,P-03434,Wales,WAL,1
10971,P-08781,Wales,WAL,1


In [621]:
plrs = teams_names_codes_count[teams_names_codes_count['team_count']
                        > 1]['player_id'].nunique()
print(f"we have {plrs} player have played in more than one team")

we have 20 player have played in more than one team


In [622]:
player_teams = players_data.merge(teams_names_codes_count, on='player_id')
player_teams


Unnamed: 0,player_id,given_name,family_name,count_tournaments,list_tournaments,team_name,team_code,team_count
0,P-08891,Alan,A'Court,1,1958,England,ENG,1
1,P-08589,Brenden,Aaronson,1,2022,United States,USA,1
2,P-04897,Stefan,Abadzhiev,1,1966,Bulgaria,BGR,1
3,P-05556,Jean-Paul,Abalo,1,2006,Togo,TGO,1
4,P-08163,Patrice,Abanda,1,1998,Cameroon,CMR,1
...,...,...,...,...,...,...,...,...
10968,P-06718,Maciej,Żurawski,2,"2002, 2006",Poland,POL,1
10969,P-03465,Szymon,Żurkowski,1,2022,Poland,POL,1
10970,P-08318,Graham,Zusi,1,2014,United States,USA,1
10971,P-05921,Andrei,Zygmantovich,1,1990,Soviet Union,SUN,1


In [623]:
def unique_list(items):
    """Join the distinct values of `items` with commas, keeping first-seen order."""
    seen = {}
    for item in items:
        seen.setdefault(item, None)
    return ','.join(seen)

player_teams_agg = player_teams.groupby('player_id').agg({
    'given_name': "first",
    'family_name': "first",
    'count_tournaments': 'first',
    "team_count": "first",
    'list_tournaments': unique_list,
    'team_name': unique_list,
    'team_code': unique_list,
})
player_teams_agg = player_teams_agg.reset_index()
player_teams_agg


Unnamed: 0,player_id,given_name,family_name,count_tournaments,team_count,list_tournaments,team_name,team_code
0,P-00001,Harald,Schumacher,2,1,"1982, 1986",West Germany,DEU
1,P-00002,Ismail,Mohammed Sharif,1,1,1986,Iraq,IRQ
2,P-00003,not applicable,Otávio,1,1,2022,Portugal,PRT
3,P-00004,Julio César,Cortés,3,1,"1962, 1966, 1970",Uruguay,URY
4,P-00005,Guy,Vandersmissen,1,1,1982,Belgium,BEL
...,...,...,...,...,...,...,...,...
8480,P-09994,Landry,N'Guémo,2,1,"2010, 2014",Cameroon,CMR
8481,P-09996,Ramiro,Castillo,1,1,1994,Bolivia,BOL
8482,P-09997,Jonás,Gutiérrez,1,1,2010,Argentina,ARG
8483,P-09998,not applicable,Rivaldo,2,1,"1998, 2002",Brazil,BRA


In [624]:
player_teams.to_csv('data/player_teams.csv',index=False)
player_teams_agg.to_csv('data/player_teams_agg.csv',index=False)

-----------------

# 2

In [625]:
matches = pd.read_csv("data/matches_combined_data.csv")
teams = pd.read_csv("data/fjelstul/teams.csv")
players_teams_agg = pd.read_csv('data/player_teams_agg.csv')
goals = pd.read_csv('data/fjelstul/goals.csv')

### total_goals_in_match

In [626]:
# Total goals per match: both teams' scores plus both teams' values from the
# *_score_penalties columns.
# NOTE(review): this means shoot-out goals are counted in the total —
# confirm that is the intended definition of "total goals".
matches["total_goals_in_match"] = (
    matches["home_team_score"] +
    matches["away_team_score"] +
    matches["home_team_score_penalties"] +
    matches["away_team_score_penalties"]
)

### used_capacity_ratio

In [627]:
matches["used_capacity_ratio"] = matches["Attendance"] / matches["stadium_capacity"]

### attendance_category 

In [628]:
# Discretise raw attendance into 5 ordinal bins (codes 0-4). No `strategy`
# is passed, so the binning strategy is whatever KBinsDiscretizer defaults to.
transformer = KBinsDiscretizer(n_bins=5, encode='ordinal')
# The transformer expects a 2-D input, hence the reshape to a single column.
discrete_attendance = transformer.fit_transform(matches['Attendance'].values.reshape(-1, 1))
matches["attendance_category"] = discrete_attendance

### relative_attendance_category

In [629]:
# Bin edges for the used-capacity ratio: up to 0.99 is "underload", around
# 1.0 (0.99-1.01) is "full", anything above 1.01 is "overload".
# NOTE(review): pd.cut is called with its defaults (right-closed intervals,
# lowest edge excluded), so a ratio of exactly 0 gets no label — confirm
# whether zero-attendance rows can occur.
bins = [0 , 0.99, 1.01, np.inf]
labels = ['underload', 'full', 'overload']
# FunctionTransformer simply forwards the series to pd.cut with these kwargs.
transformer = FunctionTransformer(pd.cut, kw_args={'bins': bins, 
                                                   'labels': labels, 
                                                   'retbins': False})
matches["relative_attendance_category"] = transformer.fit_transform(matches['used_capacity_ratio'])

### host_country_code

In [630]:
matches = pd.merge(
    matches,
    teams[["team_name", "team_code"]],
    left_on=["country_name"],
    right_on=["team_name"],
    how="left"
)

In [631]:
matches = matches.drop("team_name", axis=1)
matches = matches.rename(columns={"team_code": "host_country_code"})

###  match_for_host

In [632]:
condition1 = matches["host_country_code"] == matches["home_team_code"]
condition2 = matches["host_country_code"] == matches["away_team_code"]
matches["match_for_host"] = condition1 | condition2

###  tournament_year

In [633]:
matches["tournament_year"] = matches.tournament_id.str.split("-").str.get(1)

###  full_name

In [634]:
def _compose_full_name(row):
    """Build a display name; the placeholder given name 'not applicable' is left out."""
    if row['given_name'] == 'not applicable':
        return row['family_name']
    return row['given_name'] + ' ' + row['family_name']


players_teams_agg['full_name'] = players_teams_agg.apply(_compose_full_name, axis=1)

###  short_stage_name

In [635]:
matches["short_stage_name"] = np.where(matches['group_stage'] == 1, 'group', 'knockout')

### winner_code 

In [636]:
# np.select takes, row by row, the choice of the first condition that holds:
# the home team's code when home_team_win is truthy, otherwise the away
# team's code when away_team_win is truthy.
conditions = [matches['home_team_win'].astype(bool), matches['away_team_win'].astype(bool)]
choices = [matches['home_team_code'], matches['away_team_code']]
# Rows where neither flag is set (presumably draws — verify) get None.
matches['winner_code'] = np.select(conditions, choices, default=None)

### Late_goal

In [637]:
# Minutes added to a period's nominal end for its "stoppage time" variant.
# NOTE(review): 13 is taken over unchanged from the original code; presumably
# it is the longest stoppage time the author wanted to allow for — confirm.
_STOPPAGE_ALLOWANCE = 13

# A goal counts as "late" when it falls within this many minutes of the
# period's last minute.
_LATE_WINDOW_MINUTES = 5

# Last minute of every known match period label.
_PERIOD_MAX_MINUTE = {
    'first half': 45,
    'second half': 90,
    'extra time, first half': 105,
    'extra time, second half': 120,
    'first half, stoppage time': 45 + _STOPPAGE_ALLOWANCE,
    'second half, stoppage time': 90 + _STOPPAGE_ALLOWANCE,
    'extra time, first half, stoppage time': 105 + _STOPPAGE_ALLOWANCE,
    'extra time, second half, stoppage time': 120 + _STOPPAGE_ALLOWANCE,
}


def get_max_minutes(label):
    """Return the last minute of the match period `label`.

    Args:
        label: Period name such as 'second half' or
            'extra time, first half, stoppage time'.

    Returns:
        The period's last minute as an int, or None when `label` is not one
        of the known period names (including non-string values).
    """
    if not isinstance(label, str):
        return None
    return _PERIOD_MAX_MINUTE.get(label)


def is_end_of(name, minute):
    """Tell whether `minute` lies in the last 5 minutes of period `name`.

    Args:
        name: Period label understood by `get_max_minutes`.
        minute: Minute of the event.

    Returns:
        True when the period's last minute minus `minute` is at most 5.
        False for an unknown period: previously this case raised a
        TypeError because None was used in the subtraction.
    """
    max_minute = get_max_minutes(name)
    if max_minute is None:
        return False
    return max_minute - minute <= _LATE_WINDOW_MINUTES


def answer_provider(row):
    """Classify one goals row as a late goal (True) or not (False).

    Args:
        row: Object exposing `match_period` (str) and `minute_regulation`.

    Returns:
        False for any period whose label contains 'first' (first halves,
        including their stoppage time), True for any remaining stoppage-time
        period, otherwise whether the goal falls in the last 5 minutes of
        its period.
    """
    # The 'first' check must come before the 'stoppage' check so that
    # first-half stoppage time is not reported as late.
    if 'first' in row.match_period:
        return False
    if 'stoppage' in row.match_period:
        return True
    return is_end_of(row.match_period, row.minute_regulation)



In [638]:
goals['late_goal'] =  goals.apply(answer_provider,axis=1)

In [639]:
# Persist the engineered tables. All three are written without the index so
# that re-reading them does not produce a stray "Unnamed: 0" column (the
# players export previously omitted index=False, unlike the other two).
matches.to_csv('data/matches_combined_data.csv',index=False)
goals.to_csv('data/goals_data.csv',index=False)
players_teams_agg.to_csv("data/players_data.csv", index=False)

In [640]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy.stats as ss
from scipy.stats import chi2_contingency

In [641]:
layout_options = {
    'paper_bgcolor':"#383838",
    'plot_bgcolor':'#383838',
    'title_font': dict(color='white'),
    'legend_font': dict(color='white'),
    'yaxis':dict(color="white"),
    'xaxis':dict(color="white")
    }

In [642]:
matches = pd.read_csv("data/matches_combined_data.csv")
tour = pd.read_csv('data/fjelstul/tournaments.csv')


# 3.A

### The mean/median of attendance by tournament

In [643]:
att_data = matches[["tournament_id","Attendance"]].groupby(
    "tournament_id"
    ).agg(
        median=('Attendance', 'median'),
        mean=('Attendance', 'mean')
    ).reset_index()

In [644]:
fig = px.line(
    att_data, 
    x="tournament_id",
    y=["median", "mean"],
    title="Median, Mean of Attendance by Tournament"
)
fig.update_layout(**layout_options)
fig.show()

الملاحظ من المخطط السابق أن لعبة كرة القدم تأخذ شعبية أكثر فاكثر بسبب ارتفاع معدلات الحضور بشكل مستمر في كل موسم
ويمكن أن يعزى ازدياد عدد المشاهدين عام 1994 لى عدة عوامل. منها: الشعبية المتزايدة لكرة القدم في الولايات المتحدة. بالإضافة إلى  انها النسخة الأولى التي تضم 24 فريقًا بدلاً من 16 ، مما زاد من عدد المباريات ومنح المزيد من الدول فرصة المشاركة.

----------------

### The histogram of attendance

In [645]:
fig = px.histogram(matches ,x="Attendance", nbins=100)
fig.update_layout(**layout_options)
fig.show()

بما أن المتوسط اعلى من الوسيط في المخطط الأول  فهذا يعني انه يوجد لدينا <br>
right skewed <br>
فهذا يعني أن البيانات منحرفة بشكل إيجابي أو منحرفة جهة اليمين وهذا واضح من المخطط السابق

---------------------------

### Attendance distribution using boxplot

In [646]:
fig = px.box(matches, x='tournament_id', y='Attendance', title='Attendance by Tournament')
fig.update_layout(**layout_options)
fig.show()

-----------------

In [647]:
goals = pd.read_csv("data/goals_data.csv")
players = pd.read_csv("data/players_data.csv")
player_appear = pd.read_csv("data/fjelstul/player_appearances.csv")


# 3.B

### The median of goals time

In [648]:
goal_period = goals[["tournament_id", "minute_regulation"]].groupby("tournament_id").median()

In [649]:
fig = px.bar(goal_period, y="minute_regulation", title="The median of Goals minute by tournament")
fig.update_layout(**layout_options)
fig.show()

-----------------------------

### A histogram for number of goals by match

In [650]:
matches.columns

Index(['key_id', 'tournament_id', 'tournament_name', 'match_id', 'match_name',
       'stage_name', 'group_name', 'group_stage', 'knockout_stage', 'replayed',
       'replay', 'match_date', 'match_time', 'stadium_id', 'stadium_name',
       'city_name', 'country_name', 'home_team_id', 'home_team_name',
       'home_team_code', 'away_team_id', 'away_team_name', 'away_team_code',
       'score', 'home_team_score', 'away_team_score', 'home_team_score_margin',
       'away_team_score_margin', 'extra_time', 'penalty_shootout',
       'score_penalties', 'home_team_score_penalties',
       'away_team_score_penalties', 'result', 'home_team_win', 'away_team_win',
       'draw', 'stadium_capacity', 'Attendance', 'officials',
       'total_goals_in_match', 'used_capacity_ratio', 'attendance_category',
       'relative_attendance_category', 'host_country_code', 'match_for_host',
       'tournament_year', 'short_stage_name', 'winner_code'],
      dtype='object')

In [651]:
fig = px.histogram(matches, x="total_goals_in_match")
fig.update_layout(**layout_options)
fig.show()

من المخطط السابق يمكن الاستفادة أن التنبؤ بنتيجة المباراة يجب أن يكون مجموع الأهداف فيه 3 أو 2 فهو ذو احتمالية أكبر<br>
<br>كما أنه لدينا <br>
right skewed

__________________

### The most frequent goal time

In [652]:
# Actual minute on the clock: regulation minute plus stoppage-time minutes.
goals["minute_actual"] = goals["minute_regulation"] + goals["minute_stoppage"]
# Most frequent value(s) per tournament.
# NOTE(review): x.mode() is applied to the whole group frame, so a tournament
# whose goal minutes have several equally frequent values yields several rows
# (with the other column padded by NaN) — confirm the bar chart below is
# meant to show all of them rather than a single mode per tournament.
goals_min = goals[["tournament_id", "minute_actual"]].groupby("tournament_id").apply(lambda x: x.mode())

In [653]:
fig = px.bar(goals_min,x="tournament_id", y="minute_actual", range_y=[0, 120] )
fig.update_layout(**layout_options)
fig.show()

نلاحظ أن نسبة كبيرة من الأهداف يتم تسجيلها في أواخر المباراة، إذ عندها يكون أحد الفريقين قد بلغ حده من التعب

وبشكل غريب فإن بطولة  2006 كانت معظم الأهداف فيها في أول المباراة أو أول شوط  

---------------------

### A histogram for late goals

In [654]:
goals_count = goals[["goal_id", "tournament_id"]][goals.late_goal == 1].groupby("tournament_id").count()

In [655]:
fig = px.histogram(goals_count, nbins=20)
fig.update_layout(**layout_options)
fig.show()

### the ratio of late goals to total goals 

In [656]:
late_goals_count = goals[["goal_id", "tournament_id"]][goals.late_goal == 1].groupby("tournament_id").count()
total_goals_count = goals[["goal_id", "tournament_id"]].groupby("tournament_id").count()

In [657]:
late_goals_count

Unnamed: 0_level_0,goal_id
tournament_id,Unnamed: 1_level_1
WC-1930,7
WC-1934,3
WC-1938,9
WC-1950,8
WC-1954,12
WC-1958,10
WC-1962,9
WC-1966,10
WC-1970,8
WC-1974,9


In [658]:
fig = px.bar(late_goals_count/total_goals_count)
fig.update_layout(**layout_options)
fig.show()

منذ عام 1990 وحتى اليوم اكثر من 0.1 من الاهداف هي اهداف متأخرة، هذا يدل على زيادة المنافسة بين الفرق وتقارب كفاءة اللاعبين

----------------------

### The bar chart for the best 12 scorers

In [659]:
player_goals = goals[["player_id", "goal_id"]].groupby("player_id").count().nlargest(12, columns=["goal_id"])

In [660]:
player_ids= player_goals.index.to_list()
player_names = players[["full_name", "player_id"]][players.player_id.isin(player_ids)]
player_goals = pd.merge(player_goals, player_names, on="player_id")

In [661]:
fig = px.bar(player_goals, x="full_name", y="goal_id")
fig.update_layout(**layout_options)
fig.show()

---------------------

### The best scorer per tournament

In [662]:
# Goals scored by each player in each tournament.
goals_per_player = (
    goals[["tournament_id", "player_id", "goal_id"]]
    .groupby(["tournament_id", "player_id"])
    .count()
    .reset_index()
)
# Pick, per tournament, the whole row with the highest goal count.
# The previous groupby(...).max() took the maximum of every column
# independently, pairing the highest goal count with the largest player_id
# instead of with the player who actually scored those goals.
# idxmax returns the first maximal row, so ties go to the first player in
# the (tournament_id, player_id) ordering.
top_scorer_rows = goals_per_player.groupby("tournament_id")["goal_id"].idxmax()
player_max_goals = goals_per_player.loc[top_scorer_rows].reset_index(drop=True)

In [663]:

player_ids= player_max_goals.player_id.to_list()
player_names = players[["full_name", "player_id"]][players.player_id.isin(player_ids)]
player_max_goals = pd.merge(player_max_goals, player_names, on="player_id")
player_max_goals["player_tournament"] = player_max_goals.full_name + " (" +player_max_goals.tournament_id + ")"

In [664]:
fig = px.bar(player_max_goals, x="player_tournament", y="goal_id")
fig.update_layout(**layout_options)
fig.show()

--------------------

### the total goals per tournament

# 3.C

## 1 

merge away and home teams

In [665]:

def _pairing_label(row):
    """Return both team names of a match, alphabetically ordered and comma-joined."""
    ordered_names = sorted([row.home_team_name, row.away_team_name])
    return ','.join(map(str, ordered_names))


matches['teams'] = matches.apply(_pairing_label, axis=1)

In [666]:
ten_most_played = matches.groupby('teams').size().nlargest(10)

----------------

## 2

In [667]:
fig = px.bar(ten_most_played)
fig.update_layout(**layout_options)
fig.show()

We notice that Brazil and Argentina have played against many teams because they have won many tournaments.
 

# 3.D

## 1

In [668]:
players_teams_agg = pd.read_csv('data/player_teams_agg.csv')

In [669]:
players_teams_agg[['player_id','given_name','family_name','team_name','team_count']][players_teams_agg.team_count > 1]

Unnamed: 0,player_id,given_name,family_name,team_name,team_count
455,P-00537,Franz,Wagner,"Austria,Germany",2
1080,P-01259,Ferenc,Puskás,"Hungary,Spain",2
1298,P-01512,José,Altafini,"Brazil,Italy",2
1490,P-01739,Davor,Šuker,"Yugoslavia,Croatia",2
1502,P-01757,Rudolf,Raftl,"Austria,Germany",2
2016,P-02369,Dejan,Stanković,"Yugoslavia,Serbia and Montenegro,Serbia",3
2129,P-02502,Nikola,Žigić,"Serbia and Montenegro,Serbia",2
2163,P-02543,Luis,Monti,"Argentina,Italy",2
2191,P-02576,Robert,Prosinečki,"Yugoslavia,Croatia",2
2416,P-02845,Attilio,Demaría,"Argentina,Italy",2


بعض الفرق قد انتهت لأن دولها لم تعد موجودة مثل الاتحاد السوفيتي وألمانيا وصربيا
كما أن بعض اللاعبين يحمل جنسيتين لذلك يستطيع تمثيل أي منها في بطولة واحدة

----------------

## 2

In [670]:
wh = tour[['winner','host_country']]

In [671]:
winner_is_host = wh[wh.winner == wh.host_country]
winner_is_host

Unnamed: 0,winner,host_country
0,Uruguay,Uruguay
1,Italy,Italy
7,England,England
9,West Germany,West Germany
10,Argentina,Argentina
15,France,France


In [672]:
winner_count = tour.groupby('winner').size()
winner_count

winner
Argentina       3
Brazil          5
England         1
France          2
Germany         1
Italy           4
Spain           1
Uruguay         2
West Germany    3
dtype: int64

In [673]:
# Attach each winner's total number of titles to every (winner, host) row.
# NOTE(review): the merge starts from `wh` (all tournaments), not from
# `winner_is_host`, so the result also contains tournaments the winner did
# not host, despite the variable name — confirm which one was intended.
winner_is_host_with_count = wh.merge(winner_count.to_frame(name='winner_count'),on='winner',how="left")
winner_is_host_with_count.sort_values(by='winner')

Unnamed: 0,winner,host_country,winner_count
10,Argentina,Argentina,3
12,Argentina,Mexico,3
21,Argentina,Qatar,3
5,Brazil,Sweden,5
6,Brazil,Chile,5
8,Brazil,Mexico,5
14,Brazil,United States,5
16,Brazil,"Korea, Japan",5
7,England,England,1
20,France,Russia,2


In [674]:
def cramers_v(cross_tabs):
    """
    Print and return Cramér's V for a contingency table.

    Args:
        cross_tabs: A crosstab dataframe of observed counts.

    Returns:
        Cramér's V as a float. NaN is returned when the table has a single
        row or column, or holds no observations, because V is undefined
        there (the formula would divide by zero).
    """
    # Total number of observations in the table.
    n = cross_tabs.sum().sum()
    # min(rows, columns) - 1: the normalising term of Cramér's V. The printed
    # label calls it "Degrees of Freedom"; the label is kept as it was.
    dof = min(cross_tabs.shape) - 1
    if n == 0 or dof == 0:
        v = float('nan')
    else:
        # Plain chi-square statistic. Yates' continuity correction (the
        # chi2_contingency default for 2x2 tables) is switched off so that a
        # perfectly associated 2x2 table gives V = 1; larger tables are
        # unaffected by this flag.
        chi2 = ss.chi2_contingency(cross_tabs, correction=False)[0]
        v = np.sqrt(chi2 / (n * dof))
    print(f'V = {v}')
    print(f'Cramer\'s V Degrees of Freedom = {dof}')
    return v
    
    

In [675]:
def print_chi2_p_cramers(df,c1,c2):
    """Print the chi-square statistic, its p-value and Cramér's V for columns `c1` and `c2` of `df`."""
    pair = df[[c1, c2]]
    contingency_table = pd.crosstab(pair[c1], pair[c2])
    test_result = chi2_contingency(contingency_table)
    statistic, p_value = test_result[0], test_result[1]
    print('Chi-square statistic:', statistic)
    print('P-value:', p_value)
    cramers_v(contingency_table)


In [676]:
print_chi2_p_cramers(tour,'winner','host_country')

Chi-square statistic: 138.96666666666667
P-value: 0.41334333987682675
V = 0.8885850175044216
Cramer's V Degrees of Freedom = 8


حسب النتائج لا يبدو أنه يوجد ارتباط بين الفريق الفائز والدولة المضيفة إلا أنه _ حسب قيمة كرامر _ إذا وجد ارتباط فسيكون قوياً  

----------------

## 3

In [677]:
print_chi2_p_cramers(matches,'relative_attendance_category','match_for_host')

Chi-square statistic: 3.797195837618871
P-value: 0.14977847365244198
V = 0.06276145179610527
Cramer's V Degrees of Freedom = 1


----------------

## 4

In [678]:
print_chi2_p_cramers(matches,'attendance_category','host_country_code')

Chi-square statistic: 646.0450125820513
P-value: 4.224118972361396e-95
V = 0.4093199161040318
Cramer's V Degrees of Freedom = 4


In [679]:
goals_count = goals[["goal_id", "tournament_id"]].groupby("tournament_id").count()
fig = px.bar(goals_count,y="goal_id")
fig.update_layout(**layout_options)
fig.show()

من الواضح أن الفرق المشاركة تميل الى استراتيجة الهجوم وتسجيل الأهداف بدلا من الدفاع عن المرمى

-----------------------------

In [680]:

study_teams = ["BRA", "DEU", "ITA"]
# .copy() yields an independent frame, so adding the "stage" column below no
# longer triggers pandas' SettingWithCopyWarning.
study_teams_goals = goals[goals.team_code.isin(study_teams)].copy()
# Label every goal by tournament phase: group stage vs. everything else.
study_teams_goals["stage"] = (
    study_teams_goals.stage_name == "group stage"
).map({True: "Group Stage", False: "Loser out"})
fig = px.strip(
    study_teams_goals,
    x="minute_regulation",
    y="team_code",
    facet_col="stage",
    facet_col_wrap=2
    )
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



الواضح من المخطط السابق أن المنافسة بين الفرق في دور خروج المغلوب أكثر منها في دور المجموعات وذلك لاحتياج المباراة إلى أشواط إضافية

# 4

In [681]:
import pandas as pd
import plotly.express as px

In [682]:
layout_options = {
    'paper_bgcolor':"#383838",
    'plot_bgcolor':'#383838',
    'title_font': dict(color='white'),
    'legend_font': dict(color='white'),
    'yaxis':dict(color="white"),
    'xaxis':dict(color="white")
    }

In [683]:
goals = pd.read_csv("data/fjelstul/goals.csv")
players = pd.read_csv("data/fjelstul/players.csv")

معرفة عمر اللاعب عند تسجيل الهدف

In [684]:
(players['birth_date'] == 'not available').sum()

77

In [685]:
# Remove rows with any missing value, then the rows whose birth date is the
# textual placeholder 'not available'.
players = players.dropna()
unknown_birth_rows = players.index[players['birth_date'] == 'not available']
players = players.drop(index=unknown_birth_rows)

In [686]:
# Year of the tournament: the part after the dash in ids like "WC-1930".
goals["tournament_year"] = goals.tournament_id.str.split("-").str.get(1)
# Birth year: the leading component of the "YYYY-MM-DD" birth_date string.
players["birth_year"] = players.birth_date.str.split("-").str.get(0)
# One row per goal, enriched with the scorer's player record; goals whose
# scorer was removed from `players` above are dropped by the inner join.
players_goals = pd.merge(
    goals,
    players,
    on="player_id",
    how="inner"
)
# Age is approximated as a difference of calendar years only (month and day
# of birth are ignored), so it can be off by one year.
players_goals["player_age"] = players_goals["tournament_year"].astype(int) -  players_goals["birth_year"].astype(int)

In [687]:
players_goals["player_age"].describe()

count    2719.000000
mean       26.652814
std         3.465496
min        18.000000
25%        24.000000
50%        26.000000
75%        29.000000
max        42.000000
Name: player_age, dtype: float64

In [688]:
goals_age = players_goals[["player_age", "goal_id"]].groupby("player_age").count()

In [689]:
fig = px.bar(goals_age, y="goal_id")
fig.update_layout(**layout_options)
fig.show()

من الملاحظ أن اللاعب في بداية عمره تكون فرصة تسجيله للاهداف قليلة وكذلك بالنسبة لمن هو فوق عمر 30 سنة، والعمر الأكثر فرصة لتسديد الأهداف هو 25

-----------------------------

حالات التبديل في البطولة

In [690]:
substitutions = pd.read_csv("data/fjelstul/substitutions.csv")
substitutions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7637 entries, 0 to 7636
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   key_id             7637 non-null   int64 
 1   substitution_id    7637 non-null   object
 2   tournament_id      7637 non-null   object
 3   tournament_name    7637 non-null   object
 4   match_id           7637 non-null   object
 5   match_name         7637 non-null   object
 6   match_date         7637 non-null   object
 7   stage_name         7637 non-null   object
 8   group_name         7637 non-null   object
 9   team_id            7637 non-null   object
 10  team_name          7637 non-null   object
 11  team_code          7637 non-null   object
 12  home_team          7637 non-null   int64 
 13  away_team          7637 non-null   int64 
 14  player_id          7637 non-null   object
 15  family_name        7637 non-null   object
 16  given_name         7637 non-null   object


In [691]:
# Number of substitutions made by each team in each tournament.
substitutions_per_team = (
    substitutions[["tournament_id", "team_id", "substitution_id"]]
    .groupby(["tournament_id", "team_id"])
    .count()
    .reset_index()
)
# Keep, per tournament, the full row of the team with the most substitutions.
# The previous groupby(...).max() maximised team_id and the count
# independently, so the reported team_id did not belong to the reported
# count. The plotted maximum count itself is unchanged.
busiest_team_rows = substitutions_per_team.groupby("tournament_id")["substitution_id"].idxmax()
team_max_substitute = substitutions_per_team.loc[busiest_team_rows].reset_index(drop=True)

In [692]:
fig = px.bar(team_max_substitute, x="tournament_id", y="substitution_id")
fig.update_layout(**layout_options)
fig.show()

تزداد حالات التبديل مع تقدم الوقت وهذا يدل على قوة المبارايات مؤخراً 