# Uppgift 1

In [217]:
import hashlib as hl
import os
import sys

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots

sys.path.append("..")  # data_utils is imported from the parent directory
from data_utils import *


# Enable pandas copy-on-write (original note: FutureWarning handling).
pd.options.mode.copy_on_write = True

# Path to athlete_events.csv. It can be overridden with the ATHLETE_EVENTS_CSV
# environment variable so the notebook is not tied to one machine; the
# fallback is the location that was previously hard-coded here.
DATA_PATH = os.environ.get(
    "ATHLETE_EVENTS_CSV",
    "/home/albot/coding/repos/Projekt-OS-Norge/athlete_events.csv",
)

# read_athlete_events and hash_column come from data_utils (star import).
df = read_athlete_events(DATA_PATH)
# Presumably replaces the athlete names with hashes -- confirm in data_utils.
df = hash_column(df, "Name")
# Keep only the rows whose NOC is "NOR".
nor = df[df["NOC"] == "NOR"]

In [None]:
def medal_counter(df=None, col="Games"):
    """Count medals per value of ``col``, one medal per event, Games and team.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with at least the columns ``Event``, ``Games``, ``Team``,
        ``Medal`` and ``col``. When omitted, the notebook-level ``nor`` frame
        is used; it is looked up at call time so that a re-loaded ``nor`` is
        picked up without re-running this cell.
    col : str, default "Games"
        Column whose values become the index of the result.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``col`` with the columns ``Bronze``, ``Silver``, ``Gold``
        and ``Total`` (the sum of the three). A medal class that never occurs
        in ``df`` is reported as 0 instead of raising ``KeyError``.
    """
    if df is None:
        df = nor

    medal_cols = ["Bronze", "Silver", "Gold"]

    df_medal = df.copy()
    # Rows without a medal must stay in the frame: groupby drops NaN keys, so
    # without this placeholder a value of `col` with no medals at all would
    # vanish from the result instead of showing up with zeros.
    df_medal["Medal"] = df_medal["Medal"].fillna("No Medal")
    # Every team member has a row of their own; keep one row per medal.
    df_medal = df_medal.drop_duplicates(subset=["Event", "Games", "Team", "Medal"])

    medal_count = df_medal.groupby([col, "Medal"]).size().unstack(fill_value=0)
    # Drops the "No Medal" helper column and adds absent medal classes as 0.
    medal_count = medal_count.reindex(columns=medal_cols, fill_value=0)
    medal_count["Total"] = medal_count[medal_cols].sum(axis=1)
    return medal_count

In [219]:
# Medal table built with medal_counter's default arguments
# (the Norwegian subset, grouped by Games); preview the first rows.
nor_medals = medal_counter()
nor_medals.head()

Medal,Bronze,Silver,Gold,Total
Games,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900 Summer,3,2,0,5
1904 Summer,0,0,2,2
1906 Summer,0,1,1,2
1908 Summer,3,3,2,8
1912 Summer,5,1,4,10


In [220]:
# Line chart of the "Total" column against the Games index, one marker per Games.
fig = px.line(
    nor_medals,
    x=nor_medals.index,
    y="Total",
    markers=True,
    title="Medals won by Norway in the Olympics",
)
fig.show()

## Gender distribution  

### Perhaps  
* age  
* medals  
* px subplot pie chart of women w/ medals vs women participating, same for men  
* sns barplot (countplot?) of women participating through the years
* best sports

In [221]:
# `nor` appears to hold one row per athlete and event (the medal code
# de-duplicates on Event), so an athlete entered in several events at one
# Games would be counted several times. Keep one row per athlete and Games.
nor_ages = nor.drop_duplicates(subset=["ID", "Games"])

fig = px.histogram(
    nor_ages,
    x="Age",
    color="Sex",
    barmode="overlay",
    # The chart shows how ages are distributed, not an average.
    title="Age distribution of Norwegian Olympic athletes",
    labels={"Age": "Age", "Sex": "Gender"},
)
fig.update_traces(marker_line_width=1.5)
fig.show()

In [222]:
# Split the Norwegian rows by sex and build one medal table per subset.
nor_wom = nor[nor["Sex"] == "F"]
nor_men = nor[nor["Sex"] == "M"]
nor_medals_wom = medal_counter(nor_wom, "Games").sort_index()
nor_medals_men = medal_counter(nor_men, "Games").sort_index()

fig = px.line(
    nor_medals.reset_index(),
    x="Games",
    y="Total",
    color_discrete_sequence=["crimson"],
    labels={"Total": "Overall"},
    title="Norwegian Olympic medals",
)
# A single-series px.line trace is created without a legend entry; name it
# explicitly so the crimson line is identified next to "Men" and "Women".
fig.update_traces(name="Overall", showlegend=True)
fig.add_scatter(
    x=nor_medals_men.index,
    y=nor_medals_men["Total"],
    mode="lines",
    name="Men",
    line=dict(color="forestgreen"),
)
fig.add_scatter(
    x=nor_medals_wom.index,
    y=nor_medals_wom["Total"],
    mode="lines",
    name="Women",
    line=dict(color="orange"),
)
# The x values are Games labels (year + season), not bare years.
fig.update_layout(xaxis_title="Games", yaxis_title="Number of medals", legend_title_text="Category")
fig.update_xaxes(tickangle=-90)
fig.show()

# NOTE (was FIXME): Men + Women can exceed Overall, e.g. 1920 has overall 32,
# men 32 and women 1; 2014 has overall 26, men 14 and women 13.
# medal_counter de-duplicates on Event/Games/Team/Medal inside the frame it is
# given, so a medal in an event with both male and female team members is
# counted once in each per-sex frame but only once in the overall frame.
# Presumably these are mixed events -- verify against the data.

In [223]:
def _count_athletes(frame, label):
    """Number of distinct athlete IDs per Games, returned in a column named `label`."""
    return frame.groupby("Games")["ID"].nunique().reset_index(name=label)


nor_athletes_wom = _count_athletes(nor_wom, "Women")
nor_athletes_men = _count_athletes(nor_men, "Men")

# Left-join the per-sex counts onto the overall counts; Games without any
# athlete of a given sex get 0 instead of NaN.
nor_athletes = (
    _count_athletes(nor, "All")
    .merge(nor_athletes_wom, on="Games", how="left")
    .fillna(0)
    .merge(nor_athletes_men, on="Games", how="left")
    .fillna(0)
    .astype({"Women": int, "Men": int})
)
nor_athletes.head()

Unnamed: 0,Games,All,Women,Men
0,1900 Summer,7,0,7
1,1904 Summer,3,0,3
2,1906 Summer,32,0,32
3,1908 Summer,69,0,69
4,1912 Summer,190,2,188


In [224]:
# Bar chart of the number of Norwegian athletes per Games, split by gender.
fig = px.bar(
    nor_athletes,
    x="Games",
    y=["Women", "Men"],
    color_discrete_sequence=["orange", "forestgreen"],
    title="Norwegian athletes in the Olympics",
    # The value axis counts athletes; the previous label was a placeholder.
    labels={"value": "Athletes", "variable": "Gender", "Games": ""},
)
fig.update_xaxes(tickangle=-90)
fig.show()

In [225]:
# Add each gender's share of all athletes as a whole-number percentage and
# put every percentage column next to its count column.
nor_athletes = nor_athletes.assign(
    **{
        f"{group}%": ((nor_athletes[group] / nor_athletes["All"]) * 100).round(0).astype(int)
        for group in ("Women", "Men")
    }
).reindex(columns=["Games", "All", "Women", "Women%", "Men", "Men%"])
display(nor_athletes.tail())

fig = px.line(
    nor_athletes,
    x="Games",
    y=["Women%", "Men%"],
    color_discrete_sequence=["orange", "forestgreen"],
    labels={"value": "Percentage", "variable": "Gender", "Games": " "},
    title="Women vs men in the Norwegian Olympics teams",
)
fig.update_xaxes(tickangle=-90)
fig.show()

Unnamed: 0,Games,All,Women,Women%,Men,Men%
44,2008 Summer,84,54,64,30,36
45,2010 Winter,95,25,26,70,74
46,2012 Summer,61,28,46,33,54
47,2014 Winter,110,32,29,78,71
48,2016 Summer,62,33,53,29,47


In [238]:
# Medals per Games and sport: keep one row per event/Games/team/medal, then
# count the non-null Medal values and spread the sports out into columns.
nor_medals_sport = (
    nor.drop_duplicates(subset=["Event", "Games", "Team", "Medal"])
    .groupby(["Games", "Sport"])["Medal"]
    .count()
    .unstack(fill_value=0)
    .reset_index()
)
# Optional check, only there to avoid counting by hand when verifying
# (it would have to be applied before reset_index):
# nor_medals_sport["Total"] = nor_medals_sport.sum(axis=1)
nor_medals_sport.head()

Sport,Games,Alpine Skiing,Archery,Art Competitions,Athletics,Badminton,Beach Volleyball,Biathlon,Bobsleigh,Boxing,...,Skeleton,Ski Jumping,Snowboarding,Speed Skating,Swimming,Taekwondo,Tennis,Triathlon,Weightlifting,Wrestling
0,1900 Summer,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1904 Summer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
2,1906 Summer,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1908 Summer,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1912 Summer,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [288]:
# Every column after the leading "Games" column is one sport.
sport_columns = nor_medals_sport.columns[1:]

fig = px.bar(
    nor_medals_sport,
    x="Games",
    y=sport_columns,
    title="Norwegian Olympic medals by sport",
    labels={"Total": "Medals", "index": "Sport", "Games": "", "value": "Medals"},
    color_discrete_sequence=px.colors.qualitative.Light24,
)
fig.update_xaxes(tickangle=-90)
fig.show()

In [228]:
def sports_medals(df=None):
    """Count medals per sport, one medal per event, Games and team.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with at least the columns ``Event``, ``Games``, ``Team``,
        ``Medal`` and ``Sport``. When omitted, the notebook-level ``nor``
        frame is used; it is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        One row per sport that has at least one medal, with the columns
        ``Sport``, ``Bronze``, ``Silver``, ``Gold`` and ``Total``, sorted by
        ``Total`` in descending order. A medal class that never occurs in
        ``df`` is reported as 0 rather than as a NaN column.
    """
    if df is None:
        df = nor

    medal_cols = ["Bronze", "Silver", "Gold"]

    # Every team member has a row of their own; keep one row per medal.
    deduped = df.drop_duplicates(subset=["Event", "Games", "Team", "Medal"])
    # groupby drops rows whose Medal is NaN, so only actual medals are counted.
    per_sport = (
        deduped.groupby(["Sport", "Medal"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=medal_cols, fill_value=0)
    )
    per_sport["Total"] = per_sport[medal_cols].sum(axis=1)
    return per_sport.sort_values(by="Total", ascending=False).reset_index()


# Medals per sport for the default frame, and for the women-only (nor_wom)
# and men-only (nor_men) subsets.
nor_medals_sport_all = sports_medals()
nor_medals_sport_wom = sports_medals(nor_wom)
nor_medals_sport_men = sports_medals(nor_men)

In [229]:
# Give each sport a fixed colour so that it looks the same in all three plots.
# The overall table is the colour reference; the palette is reused cyclically
# when there are more sports than palette entries.
sport_palette = px.colors.qualitative.Light24
sport_colours = {
    sport: sport_palette[position % len(sport_palette)]
    for position, sport in enumerate(nor_medals_sport_all["Sport"])
}


def _plot_medals_by_sport(medals, title):
    """Bar chart of the "Total" medals per sport, using the shared sport colours."""
    sport_fig = px.bar(
        medals,
        x="Sport",
        y="Total",
        title=title,
        labels={"Total": "Medals", "Sport": ""},
        color="Sport",
        color_discrete_map=sport_colours,
    )
    sport_fig.update_xaxes(tickangle=-90)
    return sport_fig


for sport_frame, plot_title in (
    (nor_medals_sport_all, "Norwegian Olympic medals by sport"),
    (nor_medals_sport_wom, "Norwegian Olympic medals by sport for women"),
    (nor_medals_sport_men, "Norwegian Olympic medals by sport for men"),
):
    fig = _plot_medals_by_sport(sport_frame, plot_title)
    fig.show()

In [287]:
# Medal totals per Games for everyone, men and women, aligned on the Games
# index of the overall table. Games missing from a per-sex table count as 0,
# which also keeps every column an integer.
nor_medals_decade = (
    nor_medals[["Total"]]
    .rename(columns={"Total": "Medals"})
    .assign(
        Men=nor_medals_men["Total"].reindex(nor_medals.index, fill_value=0),
        Women=nor_medals_wom["Total"].reindex(nor_medals.index, fill_value=0),
    )
    .reset_index()
)
# A Games label starts with the four-digit year; round it down to the decade.
nor_medals_decade["Decade"] = nor_medals_decade["Games"].str[:4].astype(int) // 10 * 10

nor_medals_decade = nor_medals_decade.groupby("Decade", as_index=False)[["Medals", "Men", "Women"]].sum()

# One pie per decade, laid out row by row in a grid that is four pies wide
# (3x4 for twelve decades) instead of a single very wide row.
n_cols = 4
n_rows = max(1, -(-len(nor_medals_decade) // n_cols))  # ceiling division
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    specs=[[{"type": "domain"}] * n_cols for _ in range(n_rows)],
    subplot_titles=[f"{decade}s" for decade in nor_medals_decade["Decade"]],
)

for position, decade_row in enumerate(nor_medals_decade.itertuples(index=False)):
    fig.add_trace(
        go.Pie(
            labels=["Men", "Women"],
            values=[decade_row.Men, decade_row.Women],
            name=f"{decade_row.Decade}s",
            marker_colors=["forestgreen", "orange"],
        ),
        row=position // n_cols + 1,
        col=position % n_cols + 1,
    )

# Scale the figure height with the number of rows so the pies stay readable.
fig.update_layout(title_text="Medals won by male and female athletes per decade", height=300 * n_rows)
fig.show()

In [272]:
def medal_coloured_bars(df=None, col="Games"):
    """Build a bar chart of bronze, silver and gold medals per value of ``col``.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with at least the columns ``Event``, ``Games``, ``Team``,
        ``Medal`` and ``col``. When omitted, the notebook-level ``nor`` frame
        is used; it is looked up at call time.
    col : str, default "Games"
        Column shown on the x axis.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure; the caller decides when to show it.
    """
    if df is None:
        df = nor

    medal_cols = ["Bronze", "Silver", "Gold"]

    # One row per event/Games/team/medal, counted per `col` and medal class.
    # groupby drops rows without a medal; a medal class that never occurs is
    # added as a zero column so the plot call below cannot raise KeyError.
    df_medal_count = (
        df.drop_duplicates(subset=["Event", "Games", "Team", "Medal"])
        .groupby([col, "Medal"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=medal_cols, fill_value=0)
        .reset_index()
    )

    fig = px.bar(
        df_medal_count,
        x=col,
        y=medal_cols,
        title="Norwegian Olympic medals",
        labels={"Total": "Medals", "index": "Sport", "Games": "", "value": "Medals", "variable": ""},
        # Bronze, silver and gold tones, in the same order as medal_cols.
        color_discrete_sequence=["#cd7f32", "#c0c0c0", "#ffd700"],
    )
    fig.update_xaxes(tickangle=-90)

    return fig


# Build the medal bar chart with the default arguments and render it.
test = medal_coloured_bars()
test.show()