In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import pickle

The goal is to compile the data from the USTA 2017 top 50 players and encode it into a dataset for estimating the probability of winning a point based on Elo rating and court surface.
This will be a simple logistic regression model where the Elo term is the difference between the server's Elo and the returner's Elo.

In [3]:
# Elo table, one row per player; later cells look players up by the `name`
# column and use the `points` column as the rating.
ELO_RANKINGS_CSV = 'Elo_Rankings2017.csv'
rankings = pd.read_csv(ELO_RANKINGS_CSV)

# Preview the first five rows.
rankings.head(5)

Unnamed: 0,rank,name,country_name,country_id,points,bestRank,bestRankDate,rankDiff,pointsDiff,bestPoints
0,1,Roger Federer,Switzerland,SUI,2444,1,2003-08-11,0,0,2550
1,2,Novak Djokovic,Serbia,SRB,2418,1,2011-03-21,0,0,2629
2,3,Andy Murray,United Kingdom,GBR,2353,2,2009-04-19,0,0,2500
3,4,Rafael Nadal,Spain,ESP,2350,1,2008-06-16,0,0,2552
4,5,Juan Martin Del Potro,Argentina,ARG,2243,3,2010-06-07,0,0,2331


In [75]:
# Numeric surface code per tournament; any tournament not listed is hard court.
COURT_CODES = {'wimbledon': 1, 'frenchopen': 2}  # 1 = grass, 2 = clay
HARD_COURT = 3

tours = ['ausopen', 'frenchopen', 'usopen', 'wimbledon']


def build_point_rows(matches, points, elo_by_name, court):
    """Build the (elo_diff, court, winner) rows for one tournament.

    Parameters
    ----------
    matches : DataFrame with `match_id`, `player1` and `player2` columns.
    points : DataFrame with `match_id` and `PointWinner` columns.
        Rows with PointWinner == 0 are discarded.
    elo_by_name : Series mapping player name -> Elo rating (unique index).
    court : numeric surface code written into every row.

    Returns
    -------
    float ndarray of shape (n_points, 3) with columns
    [player1 Elo - player2 Elo, court, PointWinner - 1].
    Rows where either player's Elo is unknown are entirely NaN so that a
    later `dropna()` removes them. A point whose match_id is absent from
    `matches` is treated the same way (the per-row lookup used previously
    raised IndexError in that case).

    NOTE(review): the difference is player1 minus player2, not server minus
    returner; presumably PointWinner is 1/2 for player1/player2 -- confirm
    against the data dictionary.
    """
    played = points[points['PointWinner'] != 0]

    # One row per match_id (first occurrence wins, matching a `[0]` lookup).
    by_match = matches.drop_duplicates('match_id').set_index('match_id')

    # match_id -> player name -> Elo; unknown names / matches become NaN.
    p1_elo = played['match_id'].map(by_match['player1']).map(elo_by_name)
    p2_elo = played['match_id'].map(by_match['player2']).map(elo_by_name)

    rows = np.column_stack([
        (p1_elo - p2_elo).values.astype(float),
        np.full(len(played), float(court)),
        (played['PointWinner'] - 1).values.astype(float),
    ])
    # Blank the whole row when the Elo difference is unknown.
    rows[np.isnan(rows[:, 0])] = np.nan
    return rows


# Name -> Elo lookup built once; the first entry wins if a name is repeated.
elo_by_name = (
    rankings.dropna(subset=['name'])
    .drop_duplicates('name')
    .set_index('name')['points']
)

# The leading [0, 1, 2] row is a placeholder kept for compatibility: the cell
# that builds the DataFrame strips it with `points_matrix[1:]`.
point_blocks = [np.array([[0, 1, 2]], dtype=float)]

for year in range(2014, 2018):
    for tour in tours:
        prefix = 'tennis_data/' + str(year) + '-' + tour
        matches = pd.read_csv(prefix + '-matches.csv')
        points = pd.read_csv(prefix + '-points.csv')

        court = COURT_CODES.get(tour, HARD_COURT)
        point_blocks.append(build_point_rows(matches, points, elo_by_name, court))

# Single concatenation instead of growing the array inside the loop.
points_matrix = np.concatenate(point_blocks, axis=0)




In [81]:
colnames = ['elo_diff', 'court', 'winner']

# Skip the placeholder first row of points_matrix, drop points with a missing
# value (e.g. unknown Elo), and mark the two coded columns as categorical.
# To persist the table, call points_df.to_csv('point_probs.csv') afterwards.
points_df = (
    pd.DataFrame(points_matrix[1:], columns=colnames)
    .dropna()
    .astype({'court': 'category', 'winner': 'category'})
)

points_df.head()

Unnamed: 0,elo_diff,court,winner
0,474.0,3.0,1.0
1,474.0,3.0,0.0
2,474.0,3.0,1.0
3,474.0,3.0,1.0
4,474.0,3.0,0.0


In [90]:
# Features: Elo difference and the numeric court code; target: point winner.
# NOTE(review): `court` enters the model as a single numeric column (1/2/3),
# so the fit treats the surface code as ordinal -- confirm this is intended
# before changing it, since the saved model's input shape depends on it.
X_points = points_df[['elo_diff', 'court']]
y_winner = points_df['winner']
logreg = LogisticRegression(random_state=143).fit(X_points, y_winner)

In [94]:
# Save model to be used in streaks script.
# The context manager closes (and therefore flushes) the file even if
# pickling fails; a bare open() left the handle open.
with open('point_prob_model.sav', 'wb') as model_file:
    pickle.dump(logreg, model_file)

In [95]:
# Reload the model written by the previous cell to check the round trip.
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code from an untrusted file.
with open('point_prob_model.sav', 'rb') as model_file:
    logreg_loaded = pickle.load(model_file)

# Fitted coefficients, in feature order [elo_diff, court].
logreg_loaded.coef_

array([[-0.00055141, -0.00276414]])