In [None]:
# Notebook-wide debug switch. It is not referenced in the cells visible here;
# presumably it gates verbose output further down (TODO confirm).
DEBUG = False

# Packages imports

In [None]:
import csv
import re
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import interactive
import matplotlib as mpl
%matplotlib inline
import random
import time

# Raw data import and manipulation

We have 3 data sources from which we want to create our data:
1. atp matches (1991-2018).csv, which holds the ATP match data for each year from 1991 to 2018
2. player overviews unindexed csv.csv, which holds data about ATP players, including birth date, height, weight, name and nationality
3. ranking data from the 1990s, from which we will fill in the rankings that are missing from the other files

In [None]:
# Import the basic dataset: one CSV of ATP matches per year, all in one folder.
DataSets_path = r'C:\tennis\datasets\raw_datasets\1991_2018' # use your path

# Read every CSV in the raw-data folder. The working directory is then left at the
# project root, because the later cells use paths relative to it.
os.chdir(DataSets_path)
dfs = [pd.read_csv(f)
        for f in os.listdir(os.getcwd()) if f.endswith('.csv')]
os.chdir('C:\\tennis')

matches_data = pd.concat(dfs)
# sort_values returns a new frame, so the result has to be assigned; a stable sort keeps
# the file order of matches that share a date.
matches_data = matches_data.sort_values(by=['tourney_date'], kind='mergesort')
matches_data = matches_data.rename(index=str, columns={'winner_ht': 'winner_height', 'loser_ht': 'loser_height'})
# Assign the result instead of replacing in place on a column selection, which is not
# guaranteed to write back into the frame.
matches_data['tourney_name'] = matches_data['tourney_name'].replace('Us Open', 'US Open')
matches_data.to_csv('C:\\tennis\\datasets\\manipulated_datasets\\atp_matches_combined_1991-2018.csv')


In [None]:
# Reload the combined matches file and report how complete it is.
matches_data = pd.read_csv('datasets\\manipulated_datasets\\atp_matches_combined_1991-2018.csv')

rank_columns = ['loser_rank', 'winner_rank']
height_columns = ['loser_height', 'winner_height']

# Total number of rows
total_rows = matches_data.shape[0]
print("number of lines in the original dataset: ", total_rows)

# Rows where both players have a ranking
rows_with_ranks = matches_data.dropna(subset=rank_columns)
print("number of lines with rankings in the original dataset ", rows_with_ranks.shape[0])

# Rows where both players have a height
rows_with_heights = matches_data.dropna(subset=height_columns)
print("number of lines with heights in the original dataset: ", rows_with_heights.shape[0])

# Rows where both players have a ranking and a height
rows_with_both = matches_data.dropna(subset=rank_columns + height_columns)
print("number of lines with heights and rankings in the original dataset: ", rows_with_both.shape[0])

number of lines in the original dataset:  90879
number of lines with rankings in the original dataset  86426
number of lines with heights in the original dataset:  80006
number of lines with heights and rankings in the original dataset:  77874


In [None]:
# Load the players overview table.
players_overview = pd.read_csv('datasets\\raw_datasets\\player_overviews_unindexed_csv.csv')

# This file stores first and last name in separate columns, while the matches files use a
# single full-name column. Build the full name here so the two tables can be joined on it.
full_names = players_overview['first_name'] + ' ' + players_overview['last_name']
players_overview['player_name'] = full_names

# Height is assumed to matter for predicting a match result, so only players with a
# valid (positive) height are kept.
height_indexes = players_overview['height_cm'] > 0
players_overview = players_overview.loc[height_indexes]

# Persist the filtered table.
players_overview.to_csv('players_overview.csv')

In [None]:
start = time.time()

# Rows whose height is not an integer value in [100, 300) are treated as missing.
winner_no_height_indexes = ~matches_data['winner_height'].isin(range (100,300))
loser_no_height_indexes = ~matches_data['loser_height'].isin(range (100,300))

# Name -> height lookup. Only the first overview row per name is used, so a name that
# appears several times yields one scalar height instead of a multi-row Series.
height_by_name = (players_overview
                  .dropna(subset=['player_name'])
                  .drop_duplicates(subset='player_name')
                  .set_index('player_name')['height_cm'])

# Fill the missing heights from the overview table; rows whose player is not listed
# there keep their current value.
for role, missing in (('winner', winner_no_height_indexes), ('loser', loser_no_height_indexes)):
    found_heights = matches_data.loc[missing, role + '_name'].map(height_by_name).dropna()
    matches_data.loc[found_heights.index, role + '_height'] = found_heights

matches_data.sort_values(by = ['tourney_date','match_num']).to_csv('datasets\\manipulated_datasets\\combined_1991-2018_heights.csv')

#How many lines with height data do we have now?
print("number of lines with heights in the dataset: ", matches_data.dropna(subset=['loser_height', 'winner_height']).shape[0])

end = time.time()
print(end-start)

number of lines with heights in the dataset:  84742
16.951380968093872


In [None]:
# Load the 90s rankings file (it has no header row), name its columns, order it by
# ranking date and keep a sorted copy on disk.
ranking_column_names = ["ranking_date", "ranking", "player_id", "ranking_points"]

rankings_data = pd.read_csv('datasets\\raw_datasets\\atp_rankings_90s.csv', header=None)
rankings_data.columns = ranking_column_names

rankings_data = rankings_data.sort_values(by='ranking_date')
rankings_data.to_csv('datasets\\manipulated_datasets\\rankings_data_sorted.csv')

In [None]:
def fill_missing_rankings(matches, rankings, role, missing_rows):
    """Fill missing `<role>_rank` and `<role>_rank_points` values in place.

    For every row of `matches` flagged in `missing_rows`, the latest entry of `rankings`
    for that player dated on or before the match's `tourney_date` is copied in.
    Rows whose player has no such entry are left untouched.

    Parameters:
        matches: matches frame with a unique index; modified in place.
        rankings: frame with ranking_date, ranking, player_id and ranking_points columns.
        role: 'winner' or 'loser'.
        missing_rows: boolean Series over `matches` marking the rows to fill.
    """
    id_col = role + '_id'
    rank_col = role + '_rank'
    points_col = role + '_rank_points'
    # A stable sort keeps the existing order of entries that share a date.
    rankings_by_player = rankings.sort_values(by='ranking_date', kind='mergesort').groupby('player_id')

    for player_id, player_matches in matches[missing_rows].groupby(id_col):
        if player_id not in rankings_by_player.groups:
            continue
        player_rankings = rankings_by_player.get_group(player_id)
        # Position of the last ranking dated <= each match date (-1 when there is none).
        positions = np.searchsorted(player_rankings['ranking_date'].values,
                                    player_matches['tourney_date'].values,
                                    side='right') - 1
        has_ranking = positions >= 0
        if not has_ranking.any():
            continue
        rows = player_matches.index[has_ranking]
        matches.loc[rows, rank_col] = player_rankings['ranking'].values[positions[has_ranking]]
        matches.loc[rows, points_col] = player_rankings['ranking_points'].values[positions[has_ranking]]


matches_data = pd.read_csv('datasets\\manipulated_datasets\\combined_1991-2018_heights.csv').sort_values(by = ['tourney_date','match_num'])

# A rank that is NaN (or not positive) counts as missing.
winner_row_indexes_without_rankings_in_the_original_data = ~(matches_data['winner_rank'] > 0)
loser_row_indexes_without_rankings_in_the_original_data = ~(matches_data['loser_rank'] > 0)

max_winner_rank = matches_data['winner_rank'].max()
max_loser_rank = matches_data['loser_rank'].max()

# Worst (largest) rank number present in the original data.
max_rank = int(max_winner_rank if max_winner_rank>max_loser_rank else max_loser_rank)

# Fill both sides from the 90s rankings table.
fill_missing_rankings(matches_data, rankings_data, 'winner',
                      winner_row_indexes_without_rankings_in_the_original_data)
fill_missing_rankings(matches_data, rankings_data, 'loser',
                      loser_row_indexes_without_rankings_in_the_original_data)

In [None]:
# Sanity check: list the matches in which both players carry the same rank.
# This deliberately uses the in-memory `matches_data`: reloading the heights CSV here
# would discard the rankings filled in by the previous cell before they are saved.
same_rank_matches = matches_data[matches_data['winner_rank'] == matches_data['loser_rank']]

print(same_rank_matches['winner_rank'])
print(same_rank_matches['loser_rank'])

FileNotFoundError: [Errno 2] File datasets\manipulated_datasets\combined_1991-2018_heights.csv does not exist: 'datasets\\manipulated_datasets\\combined_1991-2018_heights.csv'

In [None]:
# Persist the matches table, now with heights and rankings filled in, ordered by
# tournament date and match number.
print(matches_data.shape[0])

matches_data.sort_values(by = ['tourney_date','match_num']).to_csv('datasets\\manipulated_datasets\\combined_1991-2018_rankings.csv')

90879


In [None]:
matches_data = pd.read_csv('datasets\\manipulated_datasets\\combined_1991-2018_rankings.csv')
# Now let's check how good our new dataset is:
# How many rows are there in the dataset?
print("number of lines in the new dataset: ", matches_data.shape[0])
# How many lines with ranking data?
print("number of lines with rankings in the new dataset ", matches_data.dropna(subset=['loser_rank', 'winner_rank']).shape[0])
# How many lines with height data?
print("number of lines with heights in the new dataset: ", matches_data.dropna(subset=['loser_height', 'winner_height']).shape[0])
# How many lines with rank and height data?
print("number of lines with heights and rankings in the new dataset: ", matches_data.dropna(subset=['loser_rank', 'winner_rank', 'loser_height', 'winner_height']).shape[0])


number of lines in the new dataset:  90879
number of lines with rankings in the new dataset  88862
number of lines with heights in the new dataset:  84742
number of lines with heights and rankings in the new dataset:  84567


In [None]:
# Build a one-row-per-player table (id, name, height) from both sides of every match.
winners_data = matches_data[['winner_id', 'winner_name', 'winner_height']].rename(
    columns={'winner_id': 'player_id', 'winner_name': 'player_name', 'winner_height': 'player_height'})

losers_data = matches_data[['loser_id', 'loser_name', 'loser_height']].rename(
    columns={'loser_id': 'player_id', 'loser_name': 'player_name', 'loser_height': 'player_height'})

# Winners come first, so a player who ever won keeps the row from his first win.
players_data = pd.concat([winners_data, losers_data]).drop_duplicates(subset='player_id')

players_data.to_csv('player_overview_oop_csv.csv')

## Data exploration

In [None]:
# data_base = pd.read_csv('datasets\\manipulated_datasets\\all_matches_with_our_features_clean.csv')
# Load the matches table with heights and rankings filled in.
data_base = pd.read_csv('datasets\\manipulated_datasets\\combined_1991-2018_rankings.csv')

In [None]:
# Number of matches played on each surface.
data_base['surface'].value_counts()

Hard      44062
Clay      30807
Grass      8597
Carpet     7215
None         80
Name: surface, dtype: int64

In [None]:
# Number of matches at each tournament level.
data_base['tourney_level'].value_counts()

A    52470
M    15383
G    13843
D     8699
F      364
C      120
Name: tourney_level, dtype: int64

In [None]:
# Tournament level codes:
#   G - Grand Slam tournament, the highest level
#   M - Masters, the second highest level
# The other codes that occur in the data (A, D, F, C) are not described here.

# Names of the tournaments played at Grand Slam level.
is_grand_slam = data_base['tourney_level'] == 'G'
data_base.loc[is_grand_slam, 'tourney_name'].unique()

array(['Australian Open', 'Roland Garros', 'Wimbledon', 'US Open'],
      dtype=object)

In [None]:
# data_base['match_num_normalized']=data_base['match_num']-19

## More learning preprocessing

Currently we have two rank columns: winner rank and loser rank. If we pass these two columns to the model as they are, there is a 1-to-1 correspondence between the columns and the tags, and we would expect the algorithm to reach around 100% accuracy. This, of course, is not realistic: in reality we do not know the result in advance. Therefore, we want to create new columns for "player 1" and "player 2" instead of "winner" and "loser".

In [None]:
# Reload the per-player table and keep the column of player ids.
players_data = pd.read_csv('player_overview_oop_csv.csv')
player_list = players_data.loc[:, 'player_id']

In [None]:
data_base = pd.read_csv('datasets\\manipulated_datasets\\combined_1991-2018_rankings.csv')#.sort_values(by = ['tourney_date','match_num'])

# Reorder the data so that, instead of a winner half and a loser half (which cannot be
# built for a future match, because the winner is not known yet), one half holds the
# higher ranked player's data and the other half the lower ranked player's data.

# True where the winner has the better (numerically smaller) rank. When the ranks are
# equal or one of them is missing the comparison is False, so the loser fills the
# "higher ranked" slot.
winner_is_higher_ranked = data_base['winner_rank'] < data_base['loser_rank']

# Every attribute, the rank itself included, is routed by the same mask. Taking the
# row-wise min/max for the rank skips missing values and would then report a rank that
# belongs to the other player.
for attribute in ['id', 'name', 'hand', 'height', 'age', 'rank', 'rank_points']:
    winner_values = data_base['winner_' + attribute]
    loser_values = data_base['loser_' + attribute]
    data_base['higher_ranked_player_' + attribute] = np.where(winner_is_higher_ranked, winner_values, loser_values)
    data_base['lower_ranked_player_' + attribute] = np.where(winner_is_higher_ranked, loser_values, winner_values)

# Columns written to the clean file, grouped by attribute.
columns = ['winner_id', 'loser_id', 'higher_ranked_player_id', 'lower_ranked_player_id',
           'winner_name', 'loser_name','higher_ranked_player_name', 'lower_ranked_player_name',
           'winner_hand', 'loser_hand', 'higher_ranked_player_hand', 'lower_ranked_player_hand',
           'winner_height', 'loser_height', 'higher_ranked_player_height', 'lower_ranked_player_height',
           'winner_age', 'loser_age', 'higher_ranked_player_age', 'lower_ranked_player_age',
           'winner_rank', 'loser_rank', 'higher_ranked_player_rank', 'lower_ranked_player_rank',
           'winner_rank_points', 'loser_rank_points', 'higher_ranked_player_rank_points', 'lower_ranked_player_rank_points',
           'tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level', 'tourney_date', 'match_num',
           'score', 'best_of', 'round', 'minutes']

data_base[columns].to_csv('datasets\\manipulated_datasets\\combined_1991-2018_rankings_clean.csv')

In [None]:
# Reload the clean file written by the previous cell.
data_base = pd.read_csv('datasets\\manipulated_datasets\\combined_1991-2018_rankings_clean.csv')#.sort_values(by = ['tourney_date','match_num'])

In [None]:
def score_games_difference(score):
    """Return the winner's games minus the loser's games, summed over the sets of `score`.

    `score` is a space-separated string of set scores such as '6-4 7-6(5)'; a
    parenthesised tie-break suffix is ignored. Parsing is best-effort: it stops at the
    first token that is not a 'games-games' pair and keeps what was summed up to there.
    A score that is not a string yields 0.
    """
    winner_games = 0
    loser_games = 0
    try:
        for set_score in score.split(' '):
            games = re.sub(r"\(.*\)", "", set_score).split('-')
            winner_games += int(games[0])
            loser_games += int(games[1])
    except (AttributeError, TypeError, ValueError, IndexError):
        # Non-string score or a non-numeric token: keep the partial sums.
        pass
    return winner_games - loser_games


# One-hot encode the playing hand of each player.
data_base = pd.get_dummies(data_base, columns=['higher_ranked_player_hand'], prefix = ['higher_ranked_player_hand'])
data_base = pd.get_dummies(data_base, columns=['lower_ranked_player_hand'], prefix = ['lower_ranked_player_hand'])

data_base['height_differences'] = (data_base['higher_ranked_player_height']-data_base['lower_ranked_player_height'])

data_base['rank_differences'] = (data_base['lower_ranked_player_rank']-data_base['higher_ranked_player_rank'])
data_base['rank_differences_normalized'] = (data_base['higher_ranked_player_rank']-data_base['lower_ranked_player_rank'])/data_base['lower_ranked_player_rank']
data_base['rank_differences_multiplied'] = (data_base['higher_ranked_player_rank']-data_base['lower_ranked_player_rank'])*data_base['lower_ranked_player_rank']

# Encode the round as an ordinal number; round-robin (RR) and BR rounds map to 0.
mapping = {'R128': 1, 'R64':2, 'R32':3, 'R16':4, 'QF':5, 'SF':6, 'F':7, 'RR':0, 'BR':0}
data_base['round'] = data_base['round'].replace(mapping)

# Tag: 1 when the winner had the better (smaller) rank, -1 otherwise.
data_base['tag'] = np.where(data_base['winner_rank'] < data_base['loser_rank'], 1, -1)

points_difference = [score_games_difference(score) for score in data_base['score']]

# Each set score contains one '-', so counting them gives the number of sets played.
data_base['last_match_sets'] = data_base['score'].str.count('-')
data_base['last_match_sets_difference'] = data_base['best_of'] - data_base['last_match_sets'] + 1

data_base['last_match_points_difference'] = np.array(points_difference)
data_base['last_match_points_difference_normalized'] = data_base['last_match_points_difference'] / data_base['last_match_sets']

# data_base = pd.get_dummies(data_base, columns=['surface'], prefix = ['surface'])

data_base.sort_values(by = ['tourney_date','match_num']).to_csv('datasets\\manipulated_datasets\\matches_data_1991-2018.csv')



# Building history

In [None]:
# Year of each match, taken from the yyyymmdd integer in tourney_date.
dates = (data_base['tourney_date']/10000).astype(int)
years = dates.unique()

# Number of distinct players (winners and losers together) seen in each year.
num_of_players_per_year = {}
for year in years:
    in_year = dates == year
    players = pd.concat([data_base.loc[in_year, 'winner_id'], data_base.loc[in_year, 'loser_id']])
    num_of_players_per_year[year] = players.drop_duplicates().shape[0]

# NOTE: despite its name, 'match_year' ends up holding the number of players active in
# the match's year. It is mapped from `dates` in one step so that a player count that
# happens to equal a calendar year cannot be mistaken for that year.
data_base['match_year'] = dates.map(num_of_players_per_year)

data_base['winner_rank_normalized_by_num_of_players'] = data_base['winner_rank'] / data_base['match_year']
data_base['loser_rank_normalized_by_num_of_players'] = data_base['loser_rank'] / data_base['match_year']

winner_is_higher_ranked = data_base['winner_rank'] < data_base['loser_rank']
data_base['higher_ranked_player_rank_normalized_by_num_of_players'] = np.where(winner_is_higher_ranked, data_base['winner_rank_normalized_by_num_of_players'], data_base['loser_rank_normalized_by_num_of_players'])
data_base['lower_ranked_player_rank_normalized_by_num_of_players'] = np.where(winner_is_higher_ranked, data_base['loser_rank_normalized_by_num_of_players'], data_base['winner_rank_normalized_by_num_of_players'])

data_base.sort_values(by = ['tourney_date','match_num']).to_csv('datasets\\manipulated_datasets\\matches_data_1991-2018.csv')

In [None]:
history_levels = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
                  "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen", "twenty"]


def build_history_column_lists(levels):
    """Return a {variable name: list of column names} mapping of cumulative history lists.

    For every level the mapping holds:
      * winner_<level>_level_history / loser_<level>_level_history - the outcome and
        sets_difference columns of this level and all previous ones;
      * higher_ranked_player_<level>_level_history / lower_ranked_player_<level>_level_history -
        the outcome, upset_rank_difference and no_upset_rank_difference columns of this
        level and all previous ones;
      * <level>_level_history (winner + loser) and <level>_level_history_reversed;
      * <level>_level_history_higher_first and <level>_level_history_lower_first.
    """
    suffixes_by_prefix = [
        ('winner', ['outcome', 'sets_difference']),
        ('loser', ['outcome', 'sets_difference']),
        ('higher_ranked_player', ['outcome', 'upset_rank_difference', 'no_upset_rank_difference']),
        ('lower_ranked_player', ['outcome', 'upset_rank_difference', 'no_upset_rank_difference']),
    ]
    cumulative = {prefix: [] for prefix, _ in suffixes_by_prefix}
    lists = {}
    for level in levels:
        for prefix, suffixes in suffixes_by_prefix:
            # Build a new list each time so that every level keeps its own list object.
            cumulative[prefix] = cumulative[prefix] + [
                '{}_{}_level_history_{}'.format(prefix, level, suffix) for suffix in suffixes]
            lists['{}_{}_level_history'.format(prefix, level)] = cumulative[prefix]

        lists['{}_level_history'.format(level)] = cumulative['winner'] + cumulative['loser']
        lists['{}_level_history_reversed'.format(level)] = cumulative['loser'] + cumulative['winner']
        lists['{}_level_history_higher_first'.format(level)] = (
            cumulative['higher_ranked_player'] + cumulative['lower_ranked_player'])
        lists['{}_level_history_lower_first'.format(level)] = (
            cumulative['lower_ranked_player'] + cumulative['higher_ranked_player'])
    return lists


# Publish the lists as notebook-level variables (winner_one_level_history,
# five_level_history_higher_first, ...) so that other cells can refer to them by name.
globals().update(build_history_column_lists(history_levels))


ranks = ['higher_ranked_player_rank', 'lower_ranked_player_rank']

rank_difference = ['rank_differences']

ages = ['higher_ranked_player_age', 'lower_ranked_player_age']

heights = ['higher_ranked_player_height', 'lower_ranked_player_height']

hands = ['higher_ranked_player_hand_L', 'higher_ranked_player_hand_R', 'higher_ranked_player_hand_U', 'lower_ranked_player_hand_L', 'lower_ranked_player_hand_R', 'lower_ranked_player_hand_U']

# print(data_base.columns)

# print(data_base.columns)

In [None]:
def create_history(data_frame, file_name):
    data_frame['winner_last_match_minutes'] = None
    data_frame['loser_last_match_minutes'] = None
    data_frame['higher_ranked_player_last_match_minutes'] = None
    data_frame['lower_ranked_player_last_match_minutes'] = None
    
    data_frame['higher_ranked_player_last_level_reached_minus_current_level'] = None
    data_frame['lower_ranked_player_last_level_reached_minus_current_level'] = None
    
    data_frame['winner_one_level_history_points_difference'] = None
    data_frame['loser_one_level_history_points_difference'] = None
    

    data_frame['player_last_match_points_difference'] = None
    data_frame['player_last_match_points_difference_normalized'] = None
    
    data_frame['last_level_history_rank_difference'] = None
    data_frame['last_level_upset_rank_difference'] = 0
    data_frame['last_level_no_upset_rank_difference'] = 0

    data_frame['player_upsets_for'] = 0
    data_frame['player_upsets_against'] = 0
    
    data_frame['winner_upsets_for'] = 0
    data_frame['winner_upsets_against'] = 0
    data_frame['loser_upsets_for'] = 0
    data_frame['loser_upsets_against'] = 0
    
    data_frame['winner_upsets_for_normalized'] = 0
    data_frame['winner_upsets_against_normalized'] = 0
    data_frame['loser_upsets_for_normalized'] = 0
    data_frame['loser_upsets_against_normalized'] = 0

    for level in history_levels:
        data_frame['winner_{}_level_history_outcome'.format(level)] = None
        data_frame['loser_{}_level_history_outcome'.format(level)] = None
        data_frame['higher_ranked_player_{}_level_history_outcome'.format(level)] = None
        data_frame['lower_ranked_player_{}_level_history_outcome'.format(level)] = None
        
#         data_frame['winner_{}_level_history_rank_difference'.format(level)] = None
#         data_frame['loser_{}_level_history_rank_difference'.format(level)] = None
#         data_frame['higher_ranked_player_{}_level_history_rank_difference'.format(level)] = None
#         data_frame['lower_ranked_player_{}_level_history_rank_difference'.format(level)] = None
        
        data_frame['winner_{}_level_history_sets_difference'.format(level)] = None
        data_frame['loser_{}_level_history_sets_difference'.format(level)] = None
        data_frame['higher_ranked_player_{}_level_history_sets_difference'.format(level)] = None
        data_frame['lower_ranked_player_{}_level_history_sets_difference'.format(level)] = None

        data_frame['winner_{}_level_history_upset_rank_difference'.format(level)] = None
        data_frame['loser_{}_level_history_upset_rank_difference'.format(level)] = None
        data_frame['higher_ranked_player_{}_level_history_sets_difference'.format(level)] = None
        data_frame['lower_ranked_player_{}_level_history_sets_difference'.format(level)] = None
        
        data_frame['winner_{}_level_history_no_upset_rank_difference'.format(level)] = None
        data_frame['loser_{}_level_history_no_upset_rank_difference'.format(level)] = None
        data_frame['higher_ranked_player_{}_level_history_no_upset_rank_difference'.format(level)] = None
        data_frame['lower_ranked_player_{}_level_history_no_upset_rank_difference'.format(level)] = None

        
        data_frame['winner_{}_level_history_points_difference'.format(level)] = None
        data_frame['loser_{}_level_history_points_difference'.format(level)] = None
        data_frame['higher_ranked_player_{}_level_history_points_difference'.format(level)] = None
        data_frame['lower_ranked_player_{}_level_history_points_difference'.format(level)] = None
        
        data_frame['winner_{}_level_history_points_difference_normalized'.format(level)] = None
        data_frame['loser_{}_level_history_points_difference_normalized'.format(level)] = None
        data_frame['higher_ranked_player_{}_level_history_points_difference_normalized'.format(level)] = None
        data_frame['lower_ranked_player_{}_level_history_points_difference_normalized'.format(level)] = None
        
        data_frame['higher_ranked_player_{}_level_history_wins'.format(level)] = None
        data_frame['higher_ranked_player_{}_level_history_wins_percentage'.format(level)] = None

        
    
    pd.options.mode.chained_assignment = None  # default='warn'

    winners_data = data_frame[['winner_id']]
    winners_data.columns = ['player_id']
    losers_data = data_frame[['loser_id']]
    losers_data.columns = ['player_id']

    players_data = pd.concat([winners_data, losers_data])
    players_data = players_data.drop_duplicates(subset='player_id')
    player_list = players_data['player_id']

    
    
    data_frame['winning_percentage'] = None
    data_frame['winner_winning_percentage'] = None
    data_frame['loser_winning_percentage'] = None
    data_frame['higher_ranked_player_winning_percentage'] = None
    data_frame['lower_ranked_player_winning_percentage'] = None
    
    data_frame['winner_winning_streak']=None
    data_frame['loser_winning_streak']=None
    data_frame['winner_longest_winning_streak']=None
    data_frame['loser_longest_winning_streak']=None
    
    data_frame['winner_losing_streak']=None
    data_frame['loser_losing_streak']=None
    data_frame['winner_longest_losing_streak']=None
    data_frame['loser_longest_losing_streak']=None

    
    data_frame['last_level_reached'] = None
    data_frame['winner_last_level_reached'] = None
    data_frame['loser_last_level_reached'] = None

    data_frame['player_last_match_sets_difference'] = None
    
    
    data_frame['winner_games_played'] = None
    data_frame['loser_games_played'] = None
    
    
    for current_player_id in player_list:
        
        data_frame['games_played'] = None
        
        data_frame['player_upsets_for'] = 0
        data_frame['player_upsets_against'] = 0
        

        data_frame['last_match_outcome'] = None
#         data_frame['winning_percentage'] = None
#         data_frame['games_won'] = None
        data_frame['games_won'] = np.where(data_frame['winner_id'] == current_player_id, 1, 0)
    
        
        
        
        
        indexes = (data_frame['winner_id']==current_player_id) | (data_frame['loser_id']==current_player_id)
        winner_indexes = data_frame['winner_id']==current_player_id
        loser_indexes = data_frame['loser_id']==current_player_id

        

        
        data_frame['player_upsets_for'][winner_indexes] = np.where(data_frame['tag'][winner_indexes]==-1, 1, 0)
        data_frame['player_upsets_for'][loser_indexes] = 0
        data_frame['player_upsets_for'][indexes] = data_frame['player_upsets_for'][indexes].shift(1)
        data_frame['player_upsets_for'][indexes] = np.cumsum(data_frame['player_upsets_for'][indexes])
        
        
        data_frame['player_upsets_against'][loser_indexes] = np.where(data_frame['tag'][loser_indexes]==-1, 1, 0)
        data_frame['player_upsets_against'][winner_indexes] = 0
        data_frame['player_upsets_against'][indexes] = data_frame['player_upsets_against'][indexes].shift(1)
        data_frame['player_upsets_against'][indexes] = np.cumsum(data_frame['player_upsets_against'][indexes])
#         data_frame['player_upsets_for'][winner_indexes] = data_frame['player_upsets_for'][winner_indexes]
        
        data_frame['winner_upsets_for'][winner_indexes] = data_frame['player_upsets_for'][winner_indexes]
        data_frame['loser_upsets_for'][loser_indexes] = data_frame['player_upsets_for'][loser_indexes]
        data_frame['winner_upsets_for_normalized'][winner_indexes] = data_frame['player_upsets_for'][winner_indexes]/data_frame['games_played']
        data_frame['loser_upsets_for_normalized'][loser_indexes] = data_frame['player_upsets_for'][loser_indexes]/data_frame['games_played']
        
        data_frame['winner_upsets_against'][winner_indexes] = data_frame['player_upsets_against'][winner_indexes]
        data_frame['loser_upsets_against'][loser_indexes] = data_frame['player_upsets_against'][loser_indexes]
        data_frame['winner_upsets_against_normalized'][winner_indexes] = data_frame['player_upsets_against'][winner_indexes]/data_frame['games_played']
        data_frame['loser_upsets_against_normalized'][loser_indexes] = data_frame['player_upsets_against'][loser_indexes]/data_frame['games_played']
    
        data_frame['player_last_match_points_difference'][winner_indexes] = data_frame['last_match_points_difference'][winner_indexes]
        data_frame['player_last_match_points_difference'][loser_indexes] = -data_frame['last_match_points_difference'][loser_indexes]
        data_frame['player_last_match_points_difference_normalized'][winner_indexes] = data_frame['last_match_points_difference_normalized'][winner_indexes]
        data_frame['player_last_match_points_difference_normalized'][loser_indexes] = -data_frame['last_match_points_difference_normalized'][loser_indexes]
        
        data_frame['player_last_match_sets_difference'] = data_frame['last_match_sets_difference']
        data_frame['player_last_match_sets_difference'][winner_indexes] = data_frame['last_match_sets_difference']
        data_frame['player_last_match_sets_difference'][loser_indexes] = -data_frame['last_match_sets_difference']
        
        
        
        data_frame['last_match_outcome'][indexes] = np.where(data_frame['winner_id']==current_player_id, 1, -1)

        minutes_df = data_frame['minutes'][indexes].shift(1)
        data_frame['winner_last_match_minutes'][winner_indexes] = minutes_df[winner_indexes]
        data_frame['loser_last_match_minutes'][loser_indexes] = minutes_df[loser_indexes]
        

        data_frame['last_level_history_rank_difference'][indexes] = data_frame['winner_rank'][indexes]-data_frame['loser_rank'][indexes]

        data_frame['rank_difference'] = abs(data_frame['lower_ranked_player_rank'] - data_frame['higher_ranked_player_rank'])
        data_frame['last_level_upset_rank_difference'][winner_indexes] = np.where(data_frame['tag'][winner_indexes]==-1,data_frame['rank_difference'][winner_indexes], 0)
        data_frame['last_level_upset_rank_difference'][loser_indexes] = np.where(data_frame['tag'][loser_indexes]==-1,-data_frame['rank_difference'][loser_indexes], 0)

        
        data_frame['last_level_no_upset_rank_difference'][winner_indexes] = np.where(data_frame['tag'][winner_indexes]==1, data_frame['rank_difference'][winner_indexes],0)
        data_frame['last_level_no_upset_rank_difference'][loser_indexes] = np.where(data_frame['tag'][loser_indexes]==1, -data_frame['rank_difference'][loser_indexes],0)
 
        
        
        data_frame['last_level_reached'] = None
        data_frame['last_level_reached'][loser_indexes] = data_frame['round']
        data_frame['last_level_reached'][indexes] = data_frame['last_level_reached'][indexes].fillna(method='ffill').shift(1)
        data_frame['winner_last_level_reached'][winner_indexes]= data_frame['last_level_reached'][winner_indexes]
        data_frame['loser_last_level_reached'][loser_indexes] = data_frame['last_level_reached'][loser_indexes]
    
        
        
        
        
    
# Creating winning winning_streak columns
        current_winning_streak = 0
        longest_winning_streak = 0
        
        current_losing_streak = 0
        longest_losing_streak = 0
        # Iterate over each match of the current player
        for index, row in data_frame[indexes].iterrows():

            if (row['games_won']==1):
                
                data_frame.at[index,'winner_winning_streak'] = current_winning_streak
                data_frame.at[index,'winner_longest_winning_streak'] = longest_winning_streak
                current_winning_streak += 1
                
                data_frame.at[index,'winner_losing_streak'] = current_losing_streak
                data_frame.at[index,'winner_longest_losing_streak'] = longest_losing_streak
                current_losing_streak = 0
                
            else:
            
                
                data_frame.at[index,'loser_winning_streak'] = current_winning_streak
                data_frame.at[index,'loser_longest_winning_streak'] = longest_winning_streak
                current_winning_streak = 0
                
                data_frame.at[index,'loser_losing_streak'] = current_losing_streak
                data_frame.at[index,'loser_longest_losing_streak'] = longest_losing_streak
                current_losing_streak += 1 
                
            longest_winning_streak = current_winning_streak if current_winning_streak>longest_winning_streak else longest_winning_streak
            longest_losing_streak = current_losing_streak if current_losing_streak>longest_losing_streak else longest_losing_streak

# Creating career winning percentage
        
        
        data_frame['games_played'][indexes] = range(1, len(data_frame[indexes]) +1)
        data_frame['winner_games_played'][winner_indexes] = data_frame['games_played'][winner_indexes]
        data_frame['loser_games_played'][loser_indexes] = data_frame['games_played'][loser_indexes]
        data_frame['winning_percentage'][indexes] = (data_frame['games_won'][indexes].cumsum() / data_frame['games_played'][indexes]).shift(1)
        
        data_frame['winner_winning_percentage'][winner_indexes] = data_frame['winning_percentage'][winner_indexes]
        data_frame['loser_winning_percentage'][loser_indexes] = data_frame['winning_percentage'][loser_indexes]

# Creating matches history        
        for level in history_levels:
        
            data_frame['player_last_match_sets_difference'][indexes] = data_frame['player_last_match_sets_difference'][indexes].shift(1)
            data_frame['winner_{}_level_history_sets_difference'.format(level)][winner_indexes] = data_frame['player_last_match_sets_difference'][winner_indexes]
            data_frame['loser_{}_level_history_sets_difference'.format(level)][loser_indexes] = data_frame['player_last_match_sets_difference'][loser_indexes]

        
            data_frame['last_match_outcome'][indexes] = data_frame['last_match_outcome'][indexes].shift(1)
            data_frame['winner_{}_level_history_outcome'.format(level)][winner_indexes] = data_frame['last_match_outcome'][winner_indexes]
            data_frame['loser_{}_level_history_outcome'.format(level)][loser_indexes] = data_frame['last_match_outcome'][loser_indexes]
            
            
            
        
            data_frame['last_level_upset_rank_difference'][indexes] = data_frame['last_level_upset_rank_difference'][indexes].shift(1)
            data_frame['winner_{0}_level_history_upset_rank_difference'.format(level)][winner_indexes] = data_frame['last_level_upset_rank_difference'][winner_indexes]
            data_frame['loser_{0}_level_history_upset_rank_difference'.format(level)][loser_indexes] = data_frame['last_level_upset_rank_difference'][loser_indexes]

            
            
            data_frame['last_level_no_upset_rank_difference'][indexes] = data_frame['last_level_no_upset_rank_difference'][indexes].shift(1)
            data_frame['winner_{0}_level_history_no_upset_rank_difference'.format(level)][winner_indexes] = data_frame['last_level_no_upset_rank_difference'][winner_indexes]
            data_frame['loser_{0}_level_history_no_upset_rank_difference'.format(level)][loser_indexes] = data_frame['last_level_no_upset_rank_difference'][loser_indexes]

            
        
        
            data_frame['player_last_match_points_difference'][indexes] = data_frame['player_last_match_points_difference'][indexes].shift(1)
            data_frame['winner_{0}_level_history_points_difference'.format(level)][winner_indexes] = data_frame['player_last_match_points_difference'][winner_indexes]
            data_frame['loser_{0}_level_history_points_difference'.format(level)][loser_indexes] = data_frame['player_last_match_points_difference'][loser_indexes]
            
            data_frame['player_last_match_points_difference_normalized'][indexes] = data_frame['player_last_match_points_difference_normalized'][indexes].shift(1)
            data_frame['winner_{0}_level_history_points_difference_normalized'.format(level)][winner_indexes] = data_frame['player_last_match_points_difference_normalized'][winner_indexes]
            data_frame['loser_{0}_level_history_points_difference_normalized'.format(level)][loser_indexes] = data_frame['player_last_match_points_difference_normalized'][loser_indexes]
    
    # End of for each player
    
    for level in history_levels:
        
        data_frame['higher_ranked_player_{0}_level_history_upset_rank_difference'.format(level)] = np.where(data_frame['tag']==1, data_frame['winner_{0}_level_history_upset_rank_difference'.format(level)], data_frame['loser_{0}_level_history_upset_rank_difference'.format(level)])
        data_frame['lower_ranked_player_{0}_level_history_upset_rank_difference'.format(level)] = np.where(data_frame['tag']==-1, data_frame['winner_{0}_level_history_upset_rank_difference'.format(level)], data_frame['loser_{0}_level_history_upset_rank_difference'.format(level)])

            
        data_frame['higher_ranked_player_{}_level_history_outcome'.format(level)] = np.where(data_frame['tag']==1, data_frame['winner_{}_level_history_outcome'.format(level)], data_frame['loser_{}_level_history_outcome'.format(level)])
        data_frame['lower_ranked_player_{}_level_history_outcome'.format(level)] = np.where(data_frame['tag']==-1, data_frame['winner_{}_level_history_outcome'.format(level)], data_frame['loser_{}_level_history_outcome'.format(level)])

        data_frame['higher_ranked_player_{}_level_history_points_difference'.format(level)] = np.where(data_frame['tag']==1, data_frame['winner_{}_level_history_points_difference'.format(level)], data_frame['loser_{}_level_history_points_difference'.format(level)])
        data_frame['lower_ranked_player_{}_level_history_points_difference'.format(level)] = np.where(data_frame['tag']==-1, data_frame['winner_{}_level_history_points_difference'.format(level)], data_frame['loser_{}_level_history_points_difference'.format(level)])

        data_frame['higher_ranked_player_{}_level_history_points_difference_normalized'.format(level)] = np.where(data_frame['tag']==1, data_frame['winner_{}_level_history_points_difference_normalized'.format(level)], data_frame['loser_{}_level_history_points_difference_normalized'.format(level)])
        data_frame['lower_ranked_player_{}_level_history_points_difference_normalized'.format(level)] = np.where(data_frame['tag']==-1, data_frame['winner_{}_level_history_points_difference_normalized'.format(level)], data_frame['loser_{}_level_history_points_difference_normalized'.format(level)])


#         data_frame['higher_ranked_player_{0}_level_history_rank_difference'.format(level)] = np.where(data_frame['winner_rank'] < data_frame['loser_rank'], data_frame['winner_{0}_level_history_rank_difference'.format(level)], data_frame['loser_{0}_level_history_rank_difference'.format(level)])
#         data_frame['lower_ranked_player_{0}_level_history_rank_difference'.format(level)] = np.where(data_frame['winner_rank'] > data_frame['loser_rank'], data_frame['winner_{0}_level_history_rank_difference'.format(level)], data_frame['loser_{0}_level_history_rank_difference'.format(level)])


        data_frame['higher_ranked_player_{0}_level_history_no_upset_rank_difference'.format(level)] = np.where(data_frame['tag']==1, data_frame['winner_{0}_level_history_no_upset_rank_difference'.format(level)], data_frame['loser_{0}_level_history_no_upset_rank_difference'.format(level)])
        data_frame['lower_ranked_player_{0}_level_history_no_upset_rank_difference'.format(level)] = np.where(data_frame['tag']==-1, data_frame['winner_{0}_level_history_no_upset_rank_difference'.format(level)], data_frame['loser_{0}_level_history_no_upset_rank_difference'.format(level)])

        
        data_frame['higher_ranked_player_{0}_level_history_upset_rank_difference'.format(level)] = np.where(data_frame['tag']==1, data_frame['winner_{0}_level_history_upset_rank_difference'.format(level)], data_frame['loser_{0}_level_history_upset_rank_difference'.format(level)])
        data_frame['lower_ranked_player_{0}_level_history_upset_rank_difference'.format(level)] = np.where(data_frame['tag']==-1, data_frame['winner_{0}_level_history_upset_rank_difference'.format(level)], data_frame['loser_{0}_level_history_upset_rank_difference'.format(level)])

        
        data_frame['higher_ranked_player_{}_level_history_sets_difference'.format(level)] = np.where(data_frame['tag']==1, data_frame['winner_{}_level_history_sets_difference'.format(level)], data_frame['loser_{}_level_history_sets_difference'.format(level)])
        data_frame['lower_ranked_player_{}_level_history_sets_difference'.format(level)] = np.where(data_frame['tag']==-1, data_frame['winner_{}_level_history_sets_difference'.format(level)], data_frame['loser_{}_level_history_sets_difference'.format(level)])

                
        data_frame['higher_ranked_player_winning_percentage'] = np.where(data_frame['tag']==1, data_frame['winner_winning_percentage'], data_frame['loser_winning_percentage'])
        data_frame['lower_ranked_player_winning_percentage'] = np.where(data_frame['tag']==-1, data_frame['winner_winning_percentage'], data_frame['loser_winning_percentage'])


    data_frame['higher_ranked_player_one_level_history_wins'] = (1+data_frame['higher_ranked_player_one_level_history_outcome'])/2
    data_frame['lower_ranked_player_one_level_history_wins'] = (1+data_frame['lower_ranked_player_one_level_history_outcome'])/2
    
    for i in range(1, len(history_levels)):
        data_frame['higher_ranked_player_{}_level_history_wins'.format(history_levels[i])] = data_frame['higher_ranked_player_{}_level_history_wins'.format(history_levels[i-1])]+ (1+data_frame['higher_ranked_player_{}_level_history_outcome'.format(history_levels[i])])/2
        data_frame['lower_ranked_player_{}_level_history_wins'.format(history_levels[i])] = data_frame['lower_ranked_player_{}_level_history_wins'.format(history_levels[i-1])]+ (1+data_frame['lower_ranked_player_{}_level_history_outcome'.format(history_levels[i])])/2
    
    #  data_frame['last_match_sets_difference'][indexes] = data_frame['last_match_sets_difference'][indexes].shift(1)
        
#         data_frame['higher_ranked_player_{}_level_history_sets_difference'.format(history_levels[i])] = np.where(data_frame['winner_rank'] < data_frame['loser_rank'], data_frame['winner_{}_level_history_sets_difference'.format(history_levels[i])], data_frame['loser_{}_level_history_sets_difference'.format(history_levels[i])])
#         data_frame['lower_ranked_player_{}_level_history_sets_difference'.format(history_levels[i])] = np.where(data_frame['winner_rank'] > data_frame['loser_rank'], data_frame['winner_{}_level_history_sets_difference'.format(history_levels[i])], data_frame['loser_{}_level_history_sets_difference'.format(history_levels[i])])


    data_frame['higher_ranked_player_games_played'] = np.where(data_frame['tag']==1, data_frame['winner_games_played'], data_frame['loser_games_played'])
    data_frame['lower_ranked_player_games_played'] = np.where(data_frame['tag']==-1, data_frame['winner_games_played'], data_frame['loser_games_played'])


    data_frame['higher_ranked_player_upsets_for'] = np.where(data_frame['tag']==1, data_frame['winner_upsets_for'], data_frame['loser_upsets_for'])
    data_frame['higher_ranked_player_upsets_for_normalized'] = data_frame['higher_ranked_player_upsets_for']/data_frame['higher_ranked_player_games_played']
    data_frame['lower_ranked_player_upsets_for'] = np.where(data_frame['tag']==-1, data_frame['winner_upsets_for'], data_frame['loser_upsets_for'])
    data_frame['lower_ranked_player_upsets_for_normalized'] = data_frame['lower_ranked_player_upsets_for']/data_frame['lower_ranked_player_games_played']
    
    data_frame['higher_ranked_player_upsets_against'] = np.where(data_frame['tag']==1, data_frame['winner_upsets_against'], data_frame['loser_upsets_against'])
    data_frame['higher_ranked_player_upsets_against_normalized'] = data_frame['higher_ranked_player_upsets_against']/data_frame['higher_ranked_player_games_played']
    data_frame['lower_ranked_player_upsets_against'] = np.where(data_frame['tag']==-1, data_frame['winner_upsets_against'], data_frame['loser_upsets_against'])
    data_frame['lower_ranked_player_upsets_against_normalized'] = data_frame['lower_ranked_player_upsets_against']/data_frame['lower_ranked_player_games_played']

    
    data_frame['higher_ranked_player_winning_streak'] = np.where(data_frame['tag']==1, data_frame['winner_winning_streak'], data_frame['loser_winning_streak'])
    data_frame['lower_ranked_player_winning_streak'] = np.where(data_frame['tag']==-1, data_frame['winner_winning_streak'], data_frame['loser_winning_streak'])
    data_frame['higher_ranked_player_longest_winning_streak'] = np.where(data_frame['tag']==1, data_frame['winner_longest_winning_streak'], data_frame['loser_longest_winning_streak'])
    data_frame['lower_ranked_player_longest_winning_streak'] = np.where(data_frame['tag']==-1, data_frame['winner_longest_winning_streak'], data_frame['loser_longest_winning_streak'])
       
    data_frame['higher_ranked_player_losing_streak'] = np.where(data_frame['tag']==1, data_frame['winner_losing_streak'], data_frame['loser_losing_streak'])
    data_frame['lower_ranked_player_losing_streak'] = np.where(data_frame['tag']==-1, data_frame['winner_losing_streak'], data_frame['loser_losing_streak'])
    data_frame['higher_ranked_player_longest_losing_streak'] = np.where(data_frame['tag']==1, data_frame['winner_longest_losing_streak'], data_frame['loser_longest_losing_streak'])
    data_frame['lower_ranked_player_longest_losing_streak'] = np.where(data_frame['tag']==-1, data_frame['winner_longest_losing_streak'], data_frame['loser_longest_losing_streak'])
    
    data_frame['higher_ranked_player_longest_winning_streak_minus_current_winning_streak'] = data_frame['higher_ranked_player_longest_winning_streak'] - data_frame['higher_ranked_player_winning_streak']
    data_frame['lower_ranked_player_longest_winning_streak_minus_current_winning_streak'] = data_frame['lower_ranked_player_longest_winning_streak'] - data_frame['lower_ranked_player_winning_streak']
    data_frame['higher_ranked_player_longest_losing_streak_minus_current_losing_streak'] = data_frame['higher_ranked_player_longest_losing_streak'] - data_frame['higher_ranked_player_losing_streak']
    data_frame['lower_ranked_player_longest_losing_streak_minus_current_losing_streak'] =data_frame['lower_ranked_player_longest_losing_streak'] - data_frame['lower_ranked_player_losing_streak']
       
        
    data_frame['higher_ranked_player_last_level_reached'] = np.where(data_frame['tag']==1, data_frame['winner_last_level_reached'], data_frame['loser_last_level_reached'])
    data_frame['lower_ranked_player_last_level_reached'] = np.where(data_frame['tag']==-1, data_frame['winner_last_level_reached'], data_frame['loser_last_level_reached'])

    data_frame['higher_ranked_player_last_level_reached_minus_current_level'] = data_frame['higher_ranked_player_last_level_reached'] - data_frame['round']
    data_frame['lower_ranked_player_last_level_reached_minus_current_level'] = data_frame['lower_ranked_player_last_level_reached'] - data_frame['round']

#     data_frame['higher_ranked_player_last_match_points_difference'] = np.where(data_frame['winner_rank'] < data_frame['loser_rank'], data_frame['winner_last_match_points_difference'], data_frame['loser_last_match_points_difference'])
#     data_frame['lower_ranked_player_last_match_points_difference'] = np.where(data_frame['winner_rank'] > data_frame['loser_rank'], data_frame['winner_last_match_points_difference'], data_frame['loser_last_match_points_difference'])

      
    data_frame['higher_ranked_player_last_match_minutes'] = np.where(data_frame['tag']==1, data_frame['winner_last_match_minutes'], data_frame['loser_last_match_minutes'])
    data_frame['lower_ranked_player_last_match_minutes'] = np.where(data_frame['tag']==-1, data_frame['winner_last_match_minutes'], data_frame['loser_last_match_minutes'])

    
    data_frame['winner_upsets_for'][winner_indexes] = np.cumsum(data_frame['player_upsets_for'][winner_indexes].shift(1))
    data_frame['loser_upsets_for'][loser_indexes] = np.cumsum(data_frame['player_upsets_against'][loser_indexes].shift(1))
      
        
    data_frame['higher_ranked_player_one_level_history_wins_percentage'] = data_frame['higher_ranked_player_one_level_history_wins'] 
    data_frame['lower_ranked_player_one_level_history_wins_percentage'] = data_frame['lower_ranked_player_one_level_history_wins']
    for i in range(0, len(history_levels)):
        data_frame['higher_ranked_player_{}_level_history_wins_percentage'.format(history_levels[i])] = (data_frame['higher_ranked_player_{}_level_history_wins'.format(history_levels[i])])/(i+1)
        data_frame['lower_ranked_player_{}_level_history_wins_percentage'.format(history_levels[i])] = (data_frame['lower_ranked_player_{}_level_history_wins'.format(history_levels[i])])/(i+1)

    
    auxiliary_columns = ['minutes',
                         'last_match_outcome', 'last_match_sets', 'last_match_sets_difference', 'last_level_history_rank_difference',
                         'player_last_match_points_difference','player_last_match_points_difference_normalized',
                         'last_level_upset_rank_difference', 'last_level_no_upset_rank_difference', 'last_level_reached',
                         'games_played', 'last_match_points_difference']
    
    winner_features = [col for col in data_frame.columns if col.startswith('winner')]
    loser_features = [col for col in data_frame.columns if col.startswith('loser')]
    winner_loser_features = winner_features+ loser_features
        
#     data_frame.sort_values(by = ['tourney_date','match_num']).to_csv(file_name)

    data_frame.drop(auxiliary_columns, axis=1, inplace=True)
    data_frame.drop(winner_loser_features, axis=1).sort_values(by = ['tourney_date','match_num']).to_csv(file_name)

In [None]:
# Build the history features for the complete 1991-2018 match table.
# create_history writes its result to the CSV path it is given. The run recorded
# below took about 14,000 seconds (~4 hours), so re-run this cell only when the
# features really need to be regenerated.
# Backslashes are doubled: '\m' is not a valid escape sequence and Python only
# tolerates it with a warning.
data_base = pd.read_csv('datasets\\manipulated_datasets\\matches_data_1991-2018.csv')

start = time.time()

data_name = "datasets\\manipulated_datasets\\all_matches_with_our_features.csv"
create_history(data_base, data_name)

end = time.time()
# Elapsed wall-clock time in seconds.
print(end-start)

14147.187840223312


In [None]:
# Build one history-feature file per court surface and print how long each run took.
# Backslashes are doubled: '\m' is not a valid escape sequence and Python only
# tolerates it with a warning.
data_base = pd.read_csv('datasets\\manipulated_datasets\\matches_data_1991-2018.csv')
# Alternatives: data_base.surface.unique() for every surface, or a single
# surface such as ['Carpet'], ['None'] or ['Clay'].
surfaces = ['Grass', 'Hard']

for surface in surfaces:
    start = time.time()
    # create_history adds and drops columns on the frame it receives, so give it
    # an independent copy instead of a filtered slice still tied to data_base.
    surface_data_frame = data_base[data_base['surface']==surface].copy()
    surface_data_name = "datasets\\manipulated_datasets\\{}_surface_matches.csv".format(surface)
    create_history(surface_data_frame, surface_data_name)

    end = time.time()
    # Elapsed wall-clock time for this surface, in seconds.
    print(end-start)

1480.6041195392609
5991.434074878693


In [None]:
# Build one history-feature file per tournament name and print how long each run took.
# Backslashes are doubled: '\m' is not a valid escape sequence and Python only
# tolerates it with a warning.
data_base = pd.read_csv('datasets\\manipulated_datasets\\matches_data_1991-2018.csv')
# data_base['tourney_name'].value_counts()
# tournaments = ['Wimbledon', 'Australian Open', 'Roland Garros', 'US Open']
tournaments = ['US Open']


for tournament in tournaments:
    start = time.time()
    # create_history adds and drops columns on the frame it receives, so give it
    # an independent copy instead of a filtered slice still tied to data_base.
    tournament_data_frame = data_base[data_base['tourney_name']==tournament].copy()
    tournament_data_name = "datasets\\manipulated_datasets\\{}_tournament_matches.csv".format(tournament)
    create_history(tournament_data_frame, tournament_data_name)

    end = time.time()
    # Elapsed wall-clock time for this tournament, in seconds.
    print(end-start)

875.9371383190155


In [None]:
# Build one history-feature file per tournament level and print how long each run took.
# Backslashes are doubled: '\m' and '\{' are not valid escape sequences and
# Python only tolerates them with a warning.
data_base = pd.read_csv('datasets\\manipulated_datasets\\matches_data_1991-2018.csv')
# Each entry is either one tourney_level code (e.g. 'G' or 'M') or a list of
# codes that should be pooled into a single run (e.g. ['D', 'F', 'C']).
tournament_levels=['G']

for tournament_level in tournament_levels:
    start = time.time()
    # Normalise to a list so a single code and a group of codes share one path;
    # `== [..]` would raise on a group, whereas isin tests membership.
    level_codes = [tournament_level] if isinstance(tournament_level, str) else list(tournament_level)
    # create_history adds and drops columns on the frame it receives, so give it
    # an independent copy instead of a filtered slice still tied to data_base.
    tournament_data_frame = data_base[data_base['tourney_level'].isin(level_codes)].copy()
    # A single code keeps the original file name (e.g. G_level_matches.csv);
    # a group is joined with underscores (e.g. D_F_C_level_matches.csv).
    level_name = "datasets\\manipulated_datasets\\{}_level_matches.csv".format('_'.join(level_codes))
    create_history(tournament_data_frame, level_name)

    # Timing is reported inside the loop so every level gets its own figure.
    end = time.time()
    print(end-start)

1512.3749792575836


In [None]:
# Show the distinct tournament names found among the level-'G' matches,
# selecting rows and column in a single .loc call.
data_base.loc[data_base['tourney_level'].eq('G'), 'tourney_name'].unique()
# grand_slam_tournaments = ['Australian Open', 'Roland Garros', 'Wimbledon', 'US Open', 'Us Open']

array(['Australian Open', 'Roland Garros', 'Wimbledon', 'US Open'],
      dtype=object)