In [1]:
import pandas as pd
import plotly.express as px
import numpy as np
import scipy.stats as ss
from scipy.stats import chi2_contingency

In [2]:
layout_options = {
    'paper_bgcolor':"#383838",
    'plot_bgcolor':'#383838',
    'title_font': dict(color='white'),
    'legend_font': dict(color='white'),
    'yaxis':dict(color="white"),
    'xaxis':dict(color="white")
    }

In [3]:
matches = pd.read_csv("data/matches_combined_data.csv")
tour = pd.read_csv('data/fjelstul/tournaments.csv')


# A

### The mean/median of attendence by tournament

In [4]:
att_data = matches[["tournament_id","Attendance"]].groupby(
    "tournament_id"
    ).agg(
        median=('Attendance', 'median'),
        mean=('Attendance', 'mean')
    ).reset_index()

In [5]:
fig = px.line(
    att_data, 
    x="tournament_id",
    y=["median", "mean"],
    title="Median, Mean of Attendance by Tournament"
)
fig.update_layout(**layout_options)
fig.show()

الملاحظ من المخطط السابق أن لعبة كرة القدم تأخذ شعبية أكثر فاكثر بسبب ارتفاع معدلات الحضور بشكل مستمر في كل موسم
ويمكن أن يعزى ازدياد عدد المشاهدين عام 1994 لى عدة عوامل. منها: الشعبية المتزايدة لكرة القدم في الولايات المتحدة. بالإضافة إلى  انها النسخة الأولى التي تضم 24 فريقًا بدلاً من 16 ، مما زاد من عدد المباريات ومنح المزيد من الدول فرصة المشاركة.

----------------

### The histogram of attendence

In [6]:
fig = px.histogram(matches ,x="Attendance", nbins=100)
fig.update_layout(**layout_options)
fig.show()

بما أن المتوسط اعلى من الوسيط في المخطط الأول  فهذا يعني انه يوجد لدينا <br>
right skewed <br>
فهذا يعني أن البيانات منحرفة بشكل إيجابي أو منحرفة جهة اليمين وهذا واضح من المخطط السابق

---------------------------

### Attendece distribution using boxplot

In [7]:
fig = px.box(matches, x='tournament_id', y='Attendance', title='Attendance by Tournament')
fig.update_layout(**layout_options)
fig.show()

-----------------

In [8]:
goals = pd.read_csv("data/goals_data.csv")
players = pd.read_csv("data/players_data.csv")
player_appear = pd.read_csv("data/fjelstul/player_appearances.csv")


# B

### The median of goals time

In [9]:
goal_period = goals[["tournament_id", "minute_regulation"]].groupby("tournament_id").median()

In [10]:
fig = px.bar(goal_period, y="minute_regulation", title="The median of Goals minute by tournament")
fig.update_layout(**layout_options)
fig.show()

-----------------------------

### A histogram for number of goals by match

In [11]:
fig = px.histogram(matches, x="total_goals_in_match")
fig.update_layout(**layout_options)
fig.show()

من المخطط السابق يمكن الاستفادة ان التنبؤ بنتيحة المباراة يجب أن يكون مجموع الاهداف 3 او 2 فهو ذو احتمالية اكبر

__________________

### The most frequent goal time

In [12]:
goals["minute_actual"] = goals["minute_regulation"] + goals["minute_stoppage"]
goals_min = goals[["tournament_id", "minute_actual"]].groupby("tournament_id").apply(lambda x: x.mode())

In [13]:
fig = px.bar(goals_min,x="tournament_id", y="minute_actual", range_y=[0, 120] )
fig.update_layout(**layout_options)
fig.show()

نلاحظ أن نسبة كبيرة من الأهداف يتم تسجيلها في أواخر المباراة، إذ عندها يكون أحد الفريقين قد بلغ حده من التعب

وبشكل غريب فإن بطولتي 2004 و 2006 كانت معظم الأهداف فيها في أول المباراة أو أول شوط  

---------------------

### A histogram for late goals

In [14]:
goals_count = goals[["goal_id", "tournament_id"]][goals.late_goal == 1].groupby("tournament_id").count()

In [15]:
fig = px.histogram(goals_count, nbins=20)
fig.update_layout(**layout_options)
fig.show()

### the ratio of late goals to total goals 

In [16]:
late_goals_count = goals[["goal_id", "tournament_id"]][goals.late_goal == 1].groupby("tournament_id").count()
total_goals_count = goals[["goal_id", "tournament_id"]].groupby("tournament_id").count()

In [50]:
late_goals_count

Unnamed: 0_level_0,goal_id
tournament_id,Unnamed: 1_level_1
WC-1930,7
WC-1934,3
WC-1938,9
WC-1950,8
WC-1954,12
WC-1958,10
WC-1962,9
WC-1966,10
WC-1970,8
WC-1974,9


In [17]:
fig = px.bar(late_goals_count/total_goals_count)
fig.update_layout(**layout_options)
fig.show()

منذ عام 1990 وحتى اليوم اكثر من 0.1 من الاهداف هي اهداف متأخرة، هذا يدل على زيادة المنافسة بين الفرق وتقارب كفاءة اللاعبين

----------------------

### The bar chart for the best 12 scorers

In [18]:
player_goals = goals[["player_id", "goal_id"]].groupby("player_id").count().nlargest(12, columns=["goal_id"])

In [19]:
player_ids= player_goals.index.to_list()
player_names = players[["full_name", "player_id"]][players.player_id.isin(player_ids)]
player_goals = pd.merge(player_goals, player_names, on="player_id")

In [20]:
fig = px.bar(player_goals, x="full_name", y="goal_id")
fig.update_layout(**layout_options)
fig.show()

---------------------

### The best scorer per tournament

In [21]:
player_max_goals = goals[["tournament_id", "player_id", "goal_id"]].groupby(
    ["tournament_id","player_id"]
).count().reset_index().groupby("tournament_id").max().reset_index()

In [22]:

player_ids= player_max_goals.player_id.to_list()
player_names = players[["full_name", "player_id"]][players.player_id.isin(player_ids)]
player_max_goals = pd.merge(player_max_goals, player_names, on="player_id")
player_max_goals["player_tournament"] = player_max_goals.full_name + " (" +player_max_goals.tournament_id + ")"

In [23]:
fig = px.bar(player_max_goals, x="player_tournament", y="goal_id")
fig.update_layout(**layout_options)
fig.show()

--------------------

### the total goals per tournament

# C

## 1 

merge away and home teams

In [24]:

matches['teams']= matches.apply(lambda row : ','.join(str(item) for item in sorted([row.home_team_name,row.away_team_name])),axis=1)

In [25]:
ten_most_played = matches.groupby('teams').size().nlargest(10)

----------------

## 2

In [26]:
px.bar(ten_most_played)

we notice that Barazil and Argentina have played with many teams because they have won many tournemants
 

# D

## 1

In [27]:
players_teams_agg = pd.read_csv('data/player_teams_agg.csv')

In [28]:
players_teams_agg[['player_id','given_name','family_name','team_name','team_count']][players_teams_agg.team_count > 1]

Unnamed: 0,player_id,given_name,family_name,team_name,team_count
455,P-00537,Franz,Wagner,"Austria,Germany",2
1080,P-01259,Ferenc,Puskás,"Hungary,Spain",2
1298,P-01512,José,Altafini,"Brazil,Italy",2
1490,P-01739,Davor,Šuker,"Yugoslavia,Croatia",2
1502,P-01757,Rudolf,Raftl,"Austria,Germany",2
2016,P-02369,Dejan,Stanković,"Yugoslavia,Serbia and Montenegro,Serbia",3
2129,P-02502,Nikola,Žigić,"Serbia and Montenegro,Serbia",2
2163,P-02543,Luis,Monti,"Argentina,Italy",2
2191,P-02576,Robert,Prosinečki,"Yugoslavia,Croatia",2
2416,P-02845,Attilio,Demaría,"Argentina,Italy",2


بعض الفرق قد انتهت لأن دولها لم تعد موجودة مثل الاتحاد السوفيتي وألمانيا وصربيا
كما أن بعض اللاعبين يحمل جنسيتين لذلك يستطيع تمثيل أي منها في بطولة واحدة

----------------

## 2

In [29]:
wh = tour[['winner','host_country']]

In [31]:
winner_is_host = wh[wh.winner == wh.host_country]
winner_is_host

Unnamed: 0,winner,host_country
0,Uruguay,Uruguay
1,Italy,Italy
7,England,England
9,West Germany,West Germany
10,Argentina,Argentina
15,France,France


In [32]:
winner_count = tour.groupby('winner').size()
winner_count

winner
Argentina       3
Brazil          5
England         1
France          2
Germany         1
Italy           4
Spain           1
Uruguay         2
West Germany    3
dtype: int64

In [33]:
winner_is_host_with_count = wh.merge(winner_count.to_frame(name='winner_count'),on='winner',how="left")
winner_is_host_with_count.sort_values(by='winner')

Unnamed: 0,winner,host_country,winner_count
10,Argentina,Argentina,3
12,Argentina,Mexico,3
21,Argentina,Qatar,3
5,Brazil,Sweden,5
6,Brazil,Chile,5
8,Brazil,Mexico,5
14,Brazil,United States,5
16,Brazil,"Korea, Japan",5
7,England,England,1
20,France,Russia,2


In [34]:
def cramers_v(cross_tabs):
    """
    Prints the degrees of freedom, effect size thresholds, and Cramer's V value.
    
    Args:
        cross_tabs: A crosstab dataframe.
    """
    
    # getting the chi sq. stat
    chi2 = ss.chi2_contingency(cross_tabs)[0]    # calculating the total number of observations
    n = cross_tabs.sum().sum()    # getting the degrees of freedom
    dof = min(cross_tabs.shape)-1    # calculating cramer's v
    v = np.sqrt(chi2/(n*dof))    # printing results
    print(f'V = {v}')
    print(f'Cramer\'s V Degrees of Freedom = {dof}')
    
    

In [35]:
def print_chi2_p_cramers(df,c1,c2):
    c = df[[c1,c2]]
    contingency_table = pd.crosstab(c[c1], c[c2])
    chi2, p, *_ = chi2_contingency(contingency_table)
    print('Chi-square statistic:', chi2)
    print('P-value:', p)
    cramers_v(contingency_table)


In [36]:
print_chi2_p_cramers(tour,'winner','host_country')

Chi-square statistic: 138.96666666666667
P-value: 0.41334333987682675
V = 0.8885850175044216
Cramer's V Degrees of Freedom = 8


حسب النتائج لا يبدو أنه ييوجد ارتباط بين الفريق الفائز والدولة المضيفة إلا أنه _ حسب قيمة كرامر _ إذا وجد ارتباط فسيكون قوي  

----------------

## 3

In [37]:
print_chi2_p_cramers(matches,'relative_attendance_category','match_for_host')

Chi-square statistic: 3.797195837618871
P-value: 0.14977847365244198
V = 0.06276145179610527
Cramer's V Degrees of Freedom = 1


----------------

## 4

In [38]:
print_chi2_p_cramers(matches,'attendance_category','host_country_code')

Chi-square statistic: 646.0450125820513
P-value: 4.224118972361396e-95
V = 0.4093199161040318
Cramer's V Degrees of Freedom = 4


In [39]:
goals_count = goals[["goal_id", "tournament_id"]].groupby("tournament_id").count()
fig = px.bar(goals_count,y="goal_id")
fig.update_layout(**layout_options)
fig.show()

من الواضح أن الفرق المشاركة تميل الى استراتيجة الهجوم وتسجيل الأهداف بدلا من الدفاع عن المرمى

-----------------------------

In [40]:

study_teams = ["BRA", "DEU", "ITA"]
study_teams_goals = goals[goals.team_code.isin(study_teams)]
study_teams_goals["stage"] = study_teams_goals.stage_name == "group stage"

study_teams_goals["stage"] = study_teams_goals["stage"].map(
    {True: "Group Stage", False: "Loser out"}
    )
fig = px.strip(
    study_teams_goals,
    x="minute_regulation", 
    y="team_code", 
    facet_col="stage", 
    facet_col_wrap=2 
    )
fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



الواضح من المخطط السابق أن المنافسة بين الفرق في دوري خروج المغلوب اكثر منها في دوري المجموعات ذلك لاحتاج المباراة الى اشواط اضافية