# Tennis Vis Project

In [1]:
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import numpy as np
from difflib import SequenceMatcher

In [4]:
# define functions to get all players' names
def get_names():
    """Return every player name listed in ``mwplayerlist_processed.txt``.

    The file is opened relative to the current working directory and is
    read as one name per line.

    Returns:
        list[str]: The lines of the file, in file order, without their
        line terminator. Blank lines are returned as empty strings.
    """
    with open('mwplayerlist_processed.txt', 'r') as f:
        # Strip only the newline. Slicing off the last character would
        # truncate the final name when the file has no trailing newline.
        return [line.rstrip('\n') for line in f]

# load txt files
def load_txt(name):
    """Load one player's comma-separated match file into a DataFrame.

    Args:
        name: Player identifier. The second character, lower-cased, is used
            as the top-level directory and everything from the fifth
            character on is used as the file stem, i.e. the file read is
            ``<name[1].lower()>/matches/txt/<name[4:]>.txt``.
            (Presumably names look like ``'(M) Roger Federer'`` - confirm
            against the player list.)

    Returns:
        tuple: ``(df, name[1])`` where ``df`` holds every field as a raw
        string, with columns taken from the first line of the file.
    """
    path = name[1].lower() + '/matches/txt/' + name[4:] + '.txt'
    with open(path, 'r') as f:
        # Strip only the newline: slicing off the last character would
        # corrupt the final field of a file without a trailing newline.
        attributes = f.readline().rstrip('\n').split(',')
        rows = [line.rstrip('\n').split(',') for line in f]
    df = pd.DataFrame(rows, columns=attributes)
    return (df, name[1])

# calculate string similarity
def similar(a, b):
    """Return the ``difflib`` similarity ratio between sequences *a* and *b*.

    The value lies in ``[0.0, 1.0]``; identical sequences score ``1.0``.
    No junk filter is applied.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

# define functions to clean df
def _to_number(item):
    """Convert one raw cell to a number, tolerating a ``%`` sign; NaN on failure."""
    try:
        return pd.to_numeric(item)
    except (ValueError, TypeError):
        pass
    try:
        # Percentages are stored like '12.5%'; retry without the sign.
        return pd.to_numeric(item.strip('%'))
    except (ValueError, TypeError, AttributeError):
        # AttributeError covers non-string cells that have no .strip().
        return np.nan


def clean_df(tuple):
    """Return a typed copy of a raw match table.

    Args:
        tuple: ``(df, tour)`` pair as returned by ``load_txt``. ``df`` holds
            raw string cells; ``tour == 'M'`` selects the longer column list
            below, any other value selects the shorter one.

    Returns:
        pandas.DataFrame: A new frame (the input frame is not modified) in
        which empty strings are NaN, ``Date`` is parsed with
        ``pd.to_datetime`` and every statistics column is numeric. Cells
        that cannot be parsed as a number, even after stripping ``%``,
        become NaN.

    Raises:
        KeyError: If ``Date`` or one of the expected statistics columns is
            missing from the frame.
    """
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    cleaned = tuple[0].replace('', np.nan)
    cleaned['Date'] = pd.to_datetime(cleaned['Date'])

    attrs = ['Sets', 'Rk', 'vRk', 'W', 'tRk', 'vtRk', 'DR', 'A%', 'DF%',
             '1stIn', '1st%', '2nd%', 'TPW', 'RPW', 'vA%', 'v1st%', 'v2nd%']
    if tuple[1] == 'M':
        # Extra columns that are only converted for the 'M' tables.
        attrs = attrs + ['TP', 'Aces', 'DFs', 'SP', '1SP', '2SP', 'vA']

    for attr in attrs:
        cleaned[attr] = [_to_number(item) for item in cleaned[attr]]
    return cleaned

# DR
def dr(names):
    """Show a distribution plot of the ``DR`` column for each player in *names*."""
    _plot_distribution(names, 'DR', .05, 'Dominance Rate Distribution', "DR")

# 1stIn
def fsi(names):
    """Show a distribution plot of the ``1stIn`` column for each player in *names*."""
    _plot_distribution(names, '1stIn', .5, '1stServe-In Rate Distribution', "1stIn (Percent)")

# ace rate
def ace(names):
    """Show a distribution plot of the ``A%`` column for each player in *names*."""
    _plot_distribution(names, 'A%', .5, 'Ace Rate Distribution', "Ace (Percent)")

# TPW
def tpw(names):
    """Show a distribution plot of the ``TPW`` column for each player in *names*."""
    _plot_distribution(names, 'TPW', .5, 'TPW Distribution', "TPW (Percent)")

# RPW
def rpw(names):
    """Show a distribution plot of the ``RPW`` column for each player in *names*."""
    _plot_distribution(names, 'RPW', .5, 'RPW Distribution', "RPW (Percent)")

# 1st%
def fsw(names):
    """Show a distribution plot of the ``1st%`` column for each player in *names*."""
    _plot_distribution(names, '1st%', .5, '1st% Distribution', "First Serve Point Win (Percent)")

# 2nd%
def ssw(names):
    """Show a distribution plot of the ``2nd%`` column for each player in *names*."""
    _plot_distribution(names, '2nd%', 2, '2nd% Distribution', "Second Serve Point Win (Percent)")

# shared implementation of the distribution plots above
def _plot_distribution(names, column, bin_size, title, xaxis_title):
    """Load each player's matches and show one distplot of *column*.

    Args:
        names: Player identifiers accepted by ``load_txt``; also used as the
            legend labels.
        column: Name of the numeric column to plot.
        bin_size: Histogram bin width passed to ``ff.create_distplot``.
        title: Figure title.
        xaxis_title: Label of the x axis (the y axis is always "Density").
    """
    dfs = [clean_df(load_txt(name)) for name in names]
    # Matches without a value for this column are left out of the sample.
    samples = [df[column].dropna() for df in dfs]
    fig = ff.create_distplot(samples, names, bin_size=bin_size)
    fig.update_layout(title_text=title,
                      xaxis_title=xaxis_title,
                      yaxis_title="Density")
    fig.show()

# winning ratio against Top X players
def wrtp(names):
    """Plot each player's winning ratio against opponents ranked in the top X."""
    _plot_top_x(names, "Winning Ratio against Top X Players")

# winning ratio against Top X players on Hard
def wrtph(names):
    """Like ``wrtp`` but restricted to matches whose ``Surface`` is ``'Hard'``."""
    _plot_top_x(names, "Winning Ratio against Top X Players on Hard",
                lambda df: df['Surface'] == 'Hard')

# winning ratio against Top X players on Grass
def wrtpg(names):
    """Like ``wrtp`` but restricted to matches whose ``Surface`` is ``'Grass'``."""
    _plot_top_x(names, "Winning Ratio against Top X Players on Grass",
                lambda df: df['Surface'] == 'Grass')

# winning ratio against Top X players on Clay
def wrtpc(names):
    """Like ``wrtp`` but restricted to matches whose ``Surface`` is ``'Clay'``."""
    _plot_top_x(names, "Winning Ratio against Top X Players on Clay",
                lambda df: df['Surface'] == 'Clay')

# winning ratio against Top X players in GS
def wrtpgs(names):
    """Like ``wrtp`` but restricted to the tournaments in ``_GRAND_SLAMS``."""
    _plot_top_x(names, "Winning Ratio against Top X Players in GS",
                lambda df: df['Tournament'].isin(_GRAND_SLAMS),
                _RANK_CUTOFFS_TO_700)

# winning ratio against Top X players in ATP1000
def wrtpatp1000(names):
    """Like ``wrtp`` but restricted to the tournaments in ``_ATP1000``."""
    _plot_top_x(names, "Winning Ratio against Top X Players in ATP1000",
                lambda df: df['Tournament'].isin(_ATP1000),
                _RANK_CUTOFFS_TO_700)

# winning ratio against Top X players in All Finals
def wrtpf(names):
    """Like ``wrtp`` but restricted to matches whose ``Rd`` is ``'F'``."""
    _plot_top_x(names, "Winning Ratio against Top X Players in All Finals",
                lambda df: df['Rd'] == 'F')

# winning rate at different rounds of GS
def wrgsr(names):
    """Plot each player's winning ratio per round in the ``_GRAND_SLAMS`` tournaments."""
    _plot_win_ratio(
        names, _GS_ROUNDS, 'Round', "Winning Ratio at Different Round of GS",
        lambda df, rd: df['Tournament'].isin(_GRAND_SLAMS) & (df['Rd'] == rd))

# shared data and implementation of the winning-ratio plots above
_RANK_CUTOFFS = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 300, 400,
                 500, 600, 700, 800, 900, 1000, 1500, 2000]
_RANK_CUTOFFS_TO_700 = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200,
                        300, 400, 500, 600, 700]
_GRAND_SLAMS = ['Wimbledon', 'US Open', 'Australian Open', 'Roland Garros']
# NOTE(review): 'Paris' is the only entry without a ' Masters' suffix -
# confirm it matches the tournament name used in the data files.
_ATP1000 = ['Indian Wells Masters', 'Miami Masters', 'Monte Carlo Masters',
            'Madrid Masters', 'Rome Masters', 'Canada Masters',
            'Cincinnati Masters', 'Shanghai Masters', 'Paris']
_GS_ROUNDS = ['R128', 'R64', 'R32', 'R16', 'QF', 'SF', 'F']


def _win_ratio(df, mask):
    """Return the share of rows selected by boolean *mask* where ``W == 1``.

    Returns NaN when *mask* selects no rows, so that point is left out of
    the plot instead of dividing by zero.
    """
    played = int(mask.sum())
    if played == 0:
        return np.nan
    won = int((mask & (df['W'] == 1)).sum())
    return won / played


def _plot_win_ratio(names, x_values, x_title, title, make_mask):
    """Show one winning-ratio line per player over *x_values*.

    Args:
        names: Player identifiers accepted by ``load_txt``; also the series
            names in the figure.
        x_values: Values along the x axis (rank cut-offs or round names).
        x_title: Label of the x axis.
        title: Figure title.
        make_mask: Callable ``(df, x) -> boolean Series`` selecting the
            matches that count for x value *x*.

    Only an empty selection yields NaN; any other error (for example a
    column missing from a player's file) propagates to the caller.
    """
    dfs = [clean_df(load_txt(name)) for name in names]
    ratios = {
        name: [_win_ratio(df, make_mask(df, x)) for x in x_values]
        for name, df in zip(names, dfs)
    }
    table = pd.DataFrame(ratios, index=x_values)
    fig = px.line(table, x=table.index, y=names, markers=True)
    fig.update_layout(title_text=title,
                      xaxis_title=x_title,
                      yaxis_title='Winning Ratio')
    fig.show()


def _plot_top_x(names, title, extra_filter=None, cutoffs=None):
    """Show winning ratio against opponents with ``vRk <= X`` for each cut-off X.

    Args:
        names: Player identifiers accepted by ``load_txt``.
        title: Figure title.
        extra_filter: Optional callable ``df -> boolean Series`` that further
            restricts which matches are counted.
        cutoffs: Rank cut-offs for the x axis; defaults to ``_RANK_CUTOFFS``.
    """
    if cutoffs is None:
        cutoffs = _RANK_CUTOFFS

    def make_mask(df, rk):
        in_top = df['vRk'] <= rk
        if extra_filter is None:
            return in_top
        return in_top & extra_filter(df)

    _plot_win_ratio(names, cutoffs, 'Top X', title, make_mask)

# moving average winning ratio
def mawr(names, num = 100):
    """Plot the moving average of the ``W`` column over windows of *num* matches.

    For every player one line is drawn. Each point is the mean of ``W`` over
    *num* consecutive rows of the player's match table and is placed at the
    ``Date`` of the window's middle row (lower middle for even *num*).
    A window containing a missing ``W`` value yields NaN. Players with fewer
    than *num* rows get an empty line.

    Args:
        names: Player identifiers accepted by ``load_txt``.
        num: Window size in matches; must be a positive integer.

    Raises:
        ValueError: If *num* is smaller than 1.
    """
    if num < 1:
        raise ValueError('num must be a positive integer, got ' + repr(num))

    dfs = [clean_df(load_txt(name)) for name in names]
    fig = px.line()
    for name, df in zip(names, dfs):
        # rolled.iloc[k] is the mean of rows k-num+1 .. k (NaN until a full
        # window is available, or when the window contains a NaN).
        rolled = df['W'].astype(float).rolling(num).mean()
        # Last row of every full window, walking from the end of the table
        # towards the start.
        last_rows = range(len(df) - 1, num - 2, -1)
        wr = [rolled.iloc[k] for k in last_rows]
        # Middle row of the window ending at k: (k-num+1 + k) // 2 == k - num // 2.
        dates = [df['Date'].iloc[k - num // 2] for k in last_rows]
        fig.add_scatter(x = dates, y = wr, name = name)
    fig.update_layout(
        title_text = 'Moving Average Winning Ratio per ' + str(num) + ' Matches',
        xaxis_title = 'Date',
        yaxis_title = 'Winning Ratio',
        showlegend = True,
    )
    fig.show()
    
# define a comprehensive function for visualization
def TennisVis():
    """Interactively pick a chart type and players, then show the chart.

    The user is prompted for a chart type and a comma-separated list of
    player names. Both are matched fuzzily (``similar``): the chart type
    against the keys of the dispatch table below, each player against the
    list returned by ``get_names``. Surrounding whitespace is ignored, the
    chart type is compared case-insensitively, and blank player entries
    (e.g. from a trailing comma) are skipped.

    For the moving-average chart the user is additionally asked for a window
    size; anything that is not a positive integer falls back to the default
    window of ``mawr``.

    Raises:
        ValueError: If no player name was entered.
    """
    # Insertion order matters: ties in the fuzzy match go to the first entry.
    charts = {
        'dominance': dr,
        'first serve in': fsi,
        'ace': ace,
        'total points win': tpw,
        'return points win': rpw,
        'first serve win': fsw,
        'second serve win': ssw,
        'winning ratio': wrtp,
        'winning ratio on hard': wrtph,
        'winning ratio on grass': wrtpg,
        'winning ratio on clay': wrtpc,
        'winning ratio in grand slam': wrtpgs,
        'winning ratio on atp1000': wrtpatp1000,
        'winning ratio in finals': wrtpf,
        'winning ratio in grand slam rounds': wrgsr,
        'moving average winning': mawr,
    }
    labels = list(charts)

    # get type
    raw_type = input('Enter TennisVis Type:').strip().lower()
    all_names = get_names()
    chart = labels[np.argmax([similar(raw_type, label) for label in labels])]

    # get players
    raw_names = input('Enter Player Names (separated by comma):').split(',')
    queries = [raw.strip() for raw in raw_names if raw.strip()]
    if not queries:
        raise ValueError('No player names given')
    names = [all_names[np.argmax([similar(query, n) for n in all_names])]
             for query in queries]

    if chart != 'moving average winning':
        charts[chart](names)
        return

    # Only the parsing of the window size is guarded, so that errors raised
    # while plotting are not hidden behind a second, default-sized plot.
    try:
        num = int(input('Please Specify Moving Size:'))
    except ValueError:
        num = None
    if num is None or num < 1:
        mawr(names)
    else:
        mawr(names, num)

In [7]:
# Start the interactive prompt only when this code is executed directly
# (a notebook cell or a script both run as "__main__"), not when imported.
if __name__ == "__main__":
    TennisVis()