# Big 4 Analysis

## 1. Import Libraries

In [3]:
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from statistics import median

In [4]:
# define function to load txt files
def load_txt(filename):
    """Load a comma-separated text file into a DataFrame of strings.

    The first line supplies the column names; every following non-blank line
    becomes one row. Values are not converted: each cell stays a string.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One column per header field, one row per non-blank data line.
    """
    with open(filename, 'r') as f:
        # Strip only the line terminator. Slicing with [:-1] would chop the last
        # real character off a final line that has no trailing newline.
        attributes = f.readline().rstrip('\r\n').split(',')
        rows = [
            line.rstrip('\r\n').split(',')
            for line in f
            if line.strip()  # skip blank lines instead of emitting an empty row
        ]
    return pd.DataFrame(rows, columns = attributes)
    
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [5]:
# Read the four players' match files into DataFrames (nd, rf, rn, am).
nd, rf, rn, am = map(load_txt, (
    'm/matches/Novak Djokovic.txt',
    'm/matches/Roger Federer.txt',
    'm/matches/Rafael Nadal.txt',
    'm/matches/Andy Murray.txt',
))

In [4]:
# Preview the first rows as loaded; load_txt leaves every field as a string.
nd.head()

Unnamed: 0,Date,Tournament,Sets,Surface,Rd,Rk,vRk,W,tRk,vtRk,WP,LP,Score,DR,A%,DF%,1stIn,1st%,2nd%,BPSvd,TPW,RPW,vA%,v1st%,v2nd%,BPCnv,TP,Aces,DFs,SP,1SP,2SP,vA,Time
0,29-Oct-2022,Paris,3,Hard,SF,7,5,1,6,5.0,Novak Djokovic,Stefanos Tsitsipas,6-2 3-6 7-6(4),1.7,14.1%,0.0%,62.8%,83.7%,72.4%,0/2,55.7%,34.8%,10.1%,27.9%,50.0%,2/6,167,11,0,78,49,29,9,2:19
1,29-Oct-2022,Paris,3,Hard,QF,7,23,1,6,,Novak Djokovic,Lorenzo Musetti,6-0 6-3,2.39,7.9%,0.0%,55.3%,85.7%,64.7%,0/1,64.8%,56.6%,3.8%,54.5%,60.0%,5/11,91,3,0,38,21,17,2,1:14
2,29-Oct-2022,Paris,3,Hard,R16,7,19,1,6,,Novak Djokovic,Karen Khachanov,6-4 6-1,1.94,4.2%,6.3%,58.3%,85.7%,55.0%,0/1,61.9%,52.6%,3.5%,36.1%,81.0%,4/8,105,2,3,48,28,20,2,1:27
3,29-Oct-2022,Paris,3,Hard,R32,7,34,1,6,,Novak Djokovic,Maxime Cressy,7-6(1) 6-4,3.23,7.4%,3.7%,70.4%,92.1%,81.3%,0/0,57.6%,35.9%,19.2%,23.4%,54.8%,1/5,132,4,2,54,38,16,15,1:43
4,3-Oct-2022,Astana,3,Hard,F,7,6,1,4,3.0,Novak Djokovic,Stefanos Tsitsipas,6-3 6-4,2.38,2.1%,0.0%,79.2%,86.8%,80.0%,0/0,59.8%,34.7%,2.0%,20.0%,57.9%,2/5,97,1,0,48,38,10,1,1:15


In [6]:
# define functions to clean df
def clean_df(df):
    """Convert the raw string columns of a match DataFrame to usable dtypes, in place.

    Steps, all applied to ``df`` itself:
      * empty strings become NaN everywhere;
      * ``Date`` is parsed with ``pd.to_datetime``;
      * each statistic column listed in ``attrs`` is converted to numbers, after
        removing any leading/trailing ``%``; values that still cannot be parsed
        become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Date`` column and every column named in ``attrs``.

    Returns
    -------
    None
        The frame is modified in place.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    df.replace('', np.nan, inplace = True)
    df['Date'] = pd.to_datetime(df['Date'])
    attrs = ['Sets', 'Rk', 'vRk', 'W', 'tRk', 'vtRk', 'DR', 'A%', 'DF%', '1stIn', '1st%', '2nd%', 'TPW', 'RPW', 'vA%', 'v1st%', 'v2nd%', 'TP', 'Aces', 'DFs', 'SP', '1SP', '2SP', 'vA']
    for attr in attrs:
        # Only strings are stripped, so NaN and already-numeric values pass through
        # untouched and the function can safely be re-run on a cleaned frame.
        stripped = df[attr].map(lambda value: value.strip('%') if isinstance(value, str) else value)
        # errors='coerce' turns unparseable values into NaN without a bare `except`.
        df[attr] = pd.to_numeric(stripped, errors = 'coerce')


In [7]:
# clean df
# clean_df converts columns on the frame it is given and returns nothing,
# so this loop updates each player's DataFrame in place.
dfs = [nd, rf, rn, am]
for df in dfs:
    clean_df(df)

In [7]:
# Preview the same rows again, now that clean_df has converted the date and statistic columns.
nd.head()

Unnamed: 0,Date,Tournament,Sets,Surface,Rd,Rk,vRk,W,tRk,vtRk,WP,LP,Score,DR,A%,DF%,1stIn,1st%,2nd%,BPSvd,TPW,RPW,vA%,v1st%,v2nd%,BPCnv,TP,Aces,DFs,SP,1SP,2SP,vA,Time
0,2022-10-29,Paris,3,Hard,SF,7.0,5.0,1,6.0,5.0,Novak Djokovic,Stefanos Tsitsipas,6-2 3-6 7-6(4),1.7,14.1,0.0,62.8,83.7,72.4,0/2,55.7,34.8,10.1,27.9,50.0,2/6,167.0,11.0,0.0,78.0,49.0,29.0,9.0,2:19
1,2022-10-29,Paris,3,Hard,QF,7.0,23.0,1,6.0,,Novak Djokovic,Lorenzo Musetti,6-0 6-3,2.39,7.9,0.0,55.3,85.7,64.7,0/1,64.8,56.6,3.8,54.5,60.0,5/11,91.0,3.0,0.0,38.0,21.0,17.0,2.0,1:14
2,2022-10-29,Paris,3,Hard,R16,7.0,19.0,1,6.0,,Novak Djokovic,Karen Khachanov,6-4 6-1,1.94,4.2,6.3,58.3,85.7,55.0,0/1,61.9,52.6,3.5,36.1,81.0,4/8,105.0,2.0,3.0,48.0,28.0,20.0,2.0,1:27
3,2022-10-29,Paris,3,Hard,R32,7.0,34.0,1,6.0,,Novak Djokovic,Maxime Cressy,7-6(1) 6-4,3.23,7.4,3.7,70.4,92.1,81.3,0/0,57.6,35.9,19.2,23.4,54.8,1/5,132.0,4.0,2.0,54.0,38.0,16.0,15.0,1:43
4,2022-10-03,Astana,3,Hard,F,7.0,6.0,1,4.0,3.0,Novak Djokovic,Stefanos Tsitsipas,6-3 6-4,2.38,2.1,0.0,79.2,86.8,80.0,0/0,59.8,34.7,2.0,20.0,57.9,2/5,97.0,1.0,0.0,48.0,38.0,10.0,1.0,1:15


In [26]:
# DR: overlay each player's distribution, ignoring matches where the value is missing
dr_labels = ['Novak Djokovic', 'Roger Federer', 'Rafael Nadal', 'Andy Murray']
dr_data = [player['DR'].dropna() for player in (nd, rf, rn, am)]
fig = ff.create_distplot(dr_data, dr_labels, bin_size = .05)
fig.update_layout(title_text = 'Dominance Rate Distribution', xaxis_title = "DR", yaxis_title = "Density")
fig.show()

In [27]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/drd.html')

In [28]:
# 1stIn: overlay each player's distribution, ignoring matches where the value is missing
fstIn_labels = ['Novak Djokovic', 'Roger Federer', 'Rafael Nadal', 'Andy Murray']
fstIn_data = [player['1stIn'].dropna() for player in (nd, rf, rn, am)]
fig = ff.create_distplot(fstIn_data, fstIn_labels, bin_size = .5)
fig.update_layout(title_text = '1stServe-In Rate Distribution', xaxis_title = "1stIn (Percent)", yaxis_title = "Density")
fig.show()

In [29]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/1stsid.html')

In [32]:
# A%: overlay each player's distribution, ignoring matches where the value is missing
A_labels = ['Novak Djokovic', 'Roger Federer', 'Rafael Nadal', 'Andy Murray']
A_data = [player['A%'].dropna() for player in (nd, rf, rn, am)]
fig = ff.create_distplot(A_data, A_labels, bin_size = .5)
fig.update_layout(title_text = 'Ace Rate Distribution', xaxis_title = "Ace (Percent)", yaxis_title = "Density")
fig.show()

In [33]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/ard.html')

In [34]:
# TPW: overlay each player's distribution, ignoring matches where the value is missing
TPW_labels = ['Novak Djokovic', 'Roger Federer', 'Rafael Nadal', 'Andy Murray']
TPW_data = [player['TPW'].dropna() for player in (nd, rf, rn, am)]
fig = ff.create_distplot(TPW_data, TPW_labels, bin_size = .5)
fig.update_layout(title_text = 'TPW Distribution', xaxis_title = "TPW (Percent)", yaxis_title = "Density")
fig.show()

In [35]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/tpwd.html')

In [36]:
# RPW: overlay each player's distribution, ignoring matches where the value is missing
RPW_labels = ['Novak Djokovic', 'Roger Federer', 'Rafael Nadal', 'Andy Murray']
RPW_data = [player['RPW'].dropna() for player in (nd, rf, rn, am)]
fig = ff.create_distplot(RPW_data, RPW_labels, bin_size = .5)
fig.update_layout(title_text = 'RPW Distribution', xaxis_title = "RPW (Percent)", yaxis_title = "Density")
fig.show()

In [37]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/rpwd.html')

In [38]:
# 1st%: overlay each player's distribution, ignoring matches where the value is missing
fst_labels = ['Novak Djokovic', 'Roger Federer', 'Rafael Nadal', 'Andy Murray']
fst_data = [player['1st%'].dropna() for player in (nd, rf, rn, am)]
fig = ff.create_distplot(fst_data, fst_labels, bin_size = .5)
fig.update_layout(title_text = '1st% Distribution', xaxis_title = "First Serve Point Win (Percent)", yaxis_title = "Density")
fig.show()

In [39]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/1std.html')

In [40]:
# 2nd%: overlay each player's distribution, ignoring matches where the value is missing
snd_labels = ['Novak Djokovic', 'Roger Federer', 'Rafael Nadal', 'Andy Murray']
snd_data = [player['2nd%'].dropna() for player in (nd, rf, rn, am)]
fig = ff.create_distplot(snd_data, snd_labels, bin_size = 2)
fig.update_layout(title_text = '2nd% Distribution', xaxis_title = "Second Serve Point Win (Percent)", yaxis_title = "Density")
fig.show()

In [41]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/2ndd.html')

In [42]:
# winning ratio against Top X players
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
rks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1500, 2000]

for rk in rks:
    for player_name, player_df in player_frames.items():
        faced = player_df[player_df['vRk'] <= rk]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((faced['W'] == 1).mean())

df = pd.DataFrame(dic, index = rks)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio against Top X Players",
                    xaxis_title = 'Top X',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [43]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/wrtpa.html')

In [68]:
# winning ratio against Top X players on Hard
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
rks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1500, 2000]

for rk in rks:
    for player_name, player_df in player_frames.items():
        faced = player_df[(player_df['Surface'] == 'Hard') & (player_df['vRk'] <= rk)]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((faced['W'] == 1).mean())

df = pd.DataFrame(dic, index = rks)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio against Top X Players on Hard",
                    xaxis_title = 'Top X',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [69]:
# Write the current `fig` to an HTML file at this path.
# NOTE(review): this export goes under 'big4/plotly/' while most other exports in
# this notebook write directly under 'big4/' — confirm which location is intended.
fig.write_html('big4/plotly/wrtph.html')

In [45]:
# winning ratio against Top X players on Grass
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
rks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1500, 2000]

for rk in rks:
    for player_name, player_df in player_frames.items():
        faced = player_df[(player_df['Surface'] == 'Grass') & (player_df['vRk'] <= rk)]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((faced['W'] == 1).mean())

df = pd.DataFrame(dic, index = rks)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio against Top X Players on Grass",
                    xaxis_title = 'Top X',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [46]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/wrtpg.html')

In [47]:
# winning ratio against Top X players on Clay
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
rks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1500, 2000]

for rk in rks:
    for player_name, player_df in player_frames.items():
        faced = player_df[(player_df['Surface'] == 'Clay') & (player_df['vRk'] <= rk)]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((faced['W'] == 1).mean())

df = pd.DataFrame(dic, index = rks)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio against Top X Players on Clay",
                    xaxis_title = 'Top X',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [48]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/wrtpc.html')

In [65]:
# winning rate at different rounds of GS
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
gs = ['Wimbledon', 'US Open', 'Australian Open', 'Roland Garros']
rds = ['R128', 'R64', 'R32', 'R16', 'QF', 'SF', 'F']

for rd in rds:
    for player_name, player_df in player_frames.items():
        played = player_df[player_df['Tournament'].isin(gs) & (player_df['Rd'] == rd)]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((played['W'] == 1).mean())

df = pd.DataFrame(dic, index = rds)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio at Different Round of GS",
                    xaxis_title = 'Round',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [67]:
# Write the current `fig` to an HTML file at this path.
# NOTE(review): this export goes under 'big4/plotly/' while most other exports in
# this notebook write directly under 'big4/' — confirm which location is intended.
fig.write_html('big4/plotly/wrgsr.html')

In [50]:
# winning ratio against Top X players in GS
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
gs = ['Wimbledon', 'US Open', 'Australian Open', 'Roland Garros']
rks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500, 600, 700]

for rk in rks:
    for player_name, player_df in player_frames.items():
        faced = player_df[player_df['Tournament'].isin(gs) & (player_df['vRk'] <= rk)]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((faced['W'] == 1).mean())

df = pd.DataFrame(dic, index = rks)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio against Top X Players in GS",
                    xaxis_title = 'Top X',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [51]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/wrtpgs.html')

In [52]:
# winning ratio against Top X players in ATP1000
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
atp1000 = ['Indian Wells Masters', 'Miami Masters', 'Monte Carlo Masters', 'Madrid Masters', 'Rome Masters', 'Canada Masters', 'Cincinnati Masters', 'Shanghai Masters', 'Paris']
rks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500, 600, 700]

for rk in rks:
    for player_name, player_df in player_frames.items():
        faced = player_df[player_df['Tournament'].isin(atp1000) & (player_df['vRk'] <= rk)]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((faced['W'] == 1).mean())

df = pd.DataFrame(dic, index = rks)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio against Top X Players in ATP1000",
                    xaxis_title = 'Top X',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [53]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/wrtpatp1000.html')

In [55]:
# winning ratio against Top X players in All Finals
# One label -> DataFrame mapping lets a single loop replace four copy-pasted lines.
player_frames = {'Novak Djokovic': nd, 'Roger Federer': rf, 'Rafael Nadal': rn, 'Andy Murray': am}
dic = {player_name: [] for player_name in player_frames}
rks = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1500, 2000]

for rk in rks:
    for player_name, player_df in player_frames.items():
        faced = player_df[(player_df['Rd'] == 'F') & (player_df['vRk'] <= rk)]
        # Mean of the boolean win flag equals wins / matches, and is NaN (a gap in
        # the line) instead of a ZeroDivisionError when no match qualifies.
        dic[player_name].append((faced['W'] == 1).mean())

df = pd.DataFrame(dic, index = rks)
fig = px.line(df, x = df.index, y = list(player_frames), markers = True)
fig.update_layout(title_text = "Winning Ratio against Top X Players in All Finals",
                    xaxis_title = 'Top X',
                    yaxis_title = 'Winning Ratio',
                    )

fig.show()

In [56]:
# Write the current `fig` to an HTML file at this path.
fig.write_html('big4/wrtpf.html')

In [8]:
# moving average winning ratio varies with time
def ma_wR(df, num, name):
    """Plot the moving-average winning ratio over sliding windows of ``num`` matches.

    Each window covers ``num`` consecutive rows of ``df``; its value is the sum of
    ``W`` in the window divided by ``num``, and it is plotted at the ``Date`` of the
    row in the middle of the window. Windows are generated starting from the last
    rows of the frame and moving towards the first.

    Parameters
    ----------
    df : pandas.DataFrame
        Match frame with a numeric ``W`` column and a ``Date`` column.
    num : int
        Window length in matches; must be at least 1. If ``df`` has fewer than
        ``num`` rows there are no windows and the plot is empty.
    name : str
        Label appended to the figure title.

    Returns
    -------
    The plotly figure, after it has been shown.

    Raises
    ------
    ValueError
        If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError('num must be a positive integer')

    wins = df['W'].to_numpy()
    match_dates = df['Date']
    wR = []
    dates = []
    # Purely positional access (.iloc / array slices), so the result no longer
    # depends on the frame carrying a default 0..n-1 index.
    for start in range(len(df) - num, -1, -1):
        wR.append(wins[start:start + num].sum() / num)
        dates.append(match_dates.iloc[start + (num - 1) // 2])

    fig = px.line(x = dates, y = wR)
    fig.update_layout(
        title_text = 'Moving Average Winning Ratio per ' + str(num) + ' Matches: ' + name,
        xaxis_title = 'Date',
        yaxis_title = 'Winning Ratio'
    )
    fig.show()
    return fig

In [13]:
# 100-match moving-average winning ratio for Djokovic; keep the returned figure in `fig`.
fig = ma_wR(nd, 100, 'Novak Djokovic')

In [20]:
# Overlay the moving-average winning ratio of Djokovic and Nadal on one figure.
num = 100
fig = px.line()

# Djokovic: one value per window of `num` consecutive rows, walking from the last
# rows of the frame to the first; each value is dated at the window's middle row.
# .iloc keeps slicing and date lookup positional, independent of the index labels.
wR = []
dates = []
for start in range(len(nd) - num, -1, -1):
    wR.append(sum(nd['W'].iloc[start:start + num]) / num)
    dates.append(nd['Date'].iloc[start + (num - 1) // 2])
fig.add_scatter(x = dates, y = wR, name = 'Novak')

# Nadal: same computation on his frame.
wR1 = []
dates1 = []
for start in range(len(rn) - num, -1, -1):
    wR1.append(sum(rn['W'].iloc[start:start + num]) / num)
    dates1.append(rn['Date'].iloc[start + (num - 1) // 2])
# Name the trace so the legend does not fall back to an auto-generated label.
fig.add_scatter(x = dates1, y = wR1, name = 'Rafael')

fig.update_layout(
    title_text = 'Moving Average Winning Ratio per ' + str(num) + ' Matches',
    xaxis_title = 'Date',
    yaxis_title = 'Winning Ratio'
)

fig.show()