In [1]:
## Importing required libraries
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import itertools
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
from time import time
from sklearn.decomposition import PCA, FastICA
from sklearn.pipeline import Pipeline
import warnings

warnings.simplefilter("ignore")

In [2]:
pd.options.display.max_columns = 200  #displaying 200 columns
pd.options.display.max_rows = 1000  #displaying 1000 rows

In [3]:
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_float_dtype, is_integer_dtype, is_categorical_dtype, \
    is_datetime64_ns_dtype


def type_conversion(df, dt_col=None, conv_to_int=None):
    '''
    input:
    df: dataframe
    dt_col(list of column name): column which needs to be converted to datetime
    conv_to_int(list of column name): columns which needs to be converted to int

    output:
    df_n: new dataframe, original dataframe is still intact
    '''

    print(df.info(memory_usage='deep'))
    print("=============")

    #     int8 = {'min': -128, 'max': 127}
    #     uint8 = {'min': 0, 'max': 255}
    #     int16 = {'min': -32768, 'max': 32767}
    #     uint16 = {'min': 0, 'max': 65535}
    #     int32 = {'min': -2147483648, 'max': 2147483647}
    #     uint32 = {'min': 0, 'max': 4294967295}
    #     int64 = {'min': -9223372036854775808, 'max': 9223372036854775807}
    #     uint64 = {'min': 0, 'max': 18446744073709551615}

    cols = df.columns
    df_n = df.copy(deep=True)
    cols_non_string = []
    cols_null = []
    if dt_col:
        for c in dt_col:
            if df_n[c].isnull().sum() == 0:
                df_n[c] = pd.to_datetime(df_n[c])
            else:
                cols_null.append(c)
    for c in cols:
        if is_categorical_dtype(df_n[c]):
            continue
        elif is_string_dtype(df_n[c]) and c not in cols_null:
            if df_n[c].isnull().sum() == 0:
                df_n[c] = df_n[c].astype('category')
            else:
                cols_null.append(c)
        else:
            cols_non_string.append(c)
    for c in cols_non_string:
        if df_n[c].isnull().sum():
            cols_null.append(c)
        else:
            if is_integer_dtype(df_n[c]) or c in conv_to_int:
                cmin = df_n[c].min()
                cmax = df_n[c].max()
                if cmin >= 0:
                    if cmax < 256:
                        df_n[c] = df_n[c].astype(np.uint8)
                    elif cmax < 65536:
                        df_n[c] = df_n[c].astype(np.uint16)
                    elif cmax < 4294967296:
                        df_n[c] = df_n[c].astype(np.uint32)
                    else:
                        df_n[c] = df_n[c].astype(np.uint64)
                else:
                    if cmin > -129 and cmax < 128:
                        df_n[c] = df_n[c].astype(np.int8)
                    elif cmin > -32769 and cmax < 32768:
                        df_n[c] = df_n[c].astype(np.int16)
                    elif cmin > -2147483649 and cmax < 2147483648:
                        df_n[c] = df_n[c].astype(np.int32)
                    else:
                        df_n[c] = df_n[c].astype(np.int64)

    print(df_n.info(memory_usage='deep'))
    print("=============")
    print(f'Columns with nulls {cols_null}')
    return df_n

In [4]:
def get_last_matches(matches, date, team, last_against_x=10):
    ''' 
    Get the last x matches of a given team
    matches: complete match data
    date: date of the specific match
    team: team from that match for which you want to pull the data
    last_against_x: how many matches from the past for creating stats
    '''

    #Filter team matches from matches
    team_matches = matches[(matches['home_team_api_id'] == team)
                           | (matches['away_team_api_id'] == team)]

    #Filter x last matches from team matches
    last_matches = team_matches[team_matches.date < date].sort_values(
        by='date', ascending=False).iloc[0:last_against_x, :]

    #Return last x matches
    return last_matches

In [5]:
def get_last_matches_against_eachother(matches,
                                       date,
                                       home_team,
                                       away_team,
                                       last_against_x=3):
    ''' 
    Get the last x matches of two given teams.
    matches: complete match data
    date: date of the specific match
    home_team: home team
    away_team: away team
    last_against_x: how many matches from the past against each other for creating stats
    '''

    #Find matches of both teams
    home_matches = matches[(matches['home_team_api_id'] == home_team)
                           & (matches['away_team_api_id'] == away_team)]
    away_matches = matches[(matches['home_team_api_id'] == away_team)
                           & (matches['away_team_api_id'] == home_team)]
    total_matches = pd.concat([home_matches, away_matches])

    #Get last x matches
    try:
        last_matches = total_matches[total_matches.date < date].sort_values(
            by='date', ascending=False).iloc[0:last_against_x, :]
    except:
        last_matches = total_matches[total_matches.date < date].sort_values(
            by='date', ascending=False).iloc[0:total_matches.shape[0], :]

        #Check for error in data
        if (last_matches.shape[0] > x):
            print("Error in obtaining matches. Not suffcient data points")

    #Return data
    return last_matches

In [6]:
def get_goals(matches, team):
    ''' 
    Get the goals of a specfic team from a set of matches. 
    '''

    #Find home and away goals
    home_goals = int(
        matches.home_team_goal[matches.home_team_api_id == team].sum())
    away_goals = int(
        matches.away_team_goal[matches.away_team_api_id == team].sum())

    total_goals = home_goals + away_goals

    #Return total goals
    return total_goals

In [7]:
def get_goals_conceided(matches, team):
    ''' 
    Get the goals conceided of a specfic team from a set of matches.
    '''

    #Find home and away goals
    home_goals = int(
        matches.home_team_goal[matches.away_team_api_id == team].sum())
    away_goals = int(
        matches.away_team_goal[matches.home_team_api_id == team].sum())

    total_goals = home_goals + away_goals

    #Return total goals
    return total_goals

In [8]:
def get_wins(matches, team):
    ''' Get the number of wins of a specfic team from a set of matches. '''

    #Find home and away wins
    home_wins = int(
        matches.home_team_goal[(matches.home_team_api_id == team) & (
            matches.home_team_goal > matches.away_team_goal)].count())
    away_wins = int(
        matches.away_team_goal[(matches.away_team_api_id == team) & (
            matches.away_team_goal > matches.home_team_goal)].count())

    total_wins = home_wins + away_wins

    #Return total wins
    return total_wins

In [9]:
def get_match_features(match, matches, last_ind_x=10, last_against_x=3):
    ''' 
    Create match specific features for a given match. 
    match: single match
    matches: match data complete
    last_ind_x: how many matches from the past to be considered for individual team
    last_against_x: how many matches from the past to be considered for both teams clashes
    '''

    #Define variables
    date = match.date  #storing match date
    home_team = match.home_team_api_id  #home id
    away_team = match.away_team_api_id  #away id

    #Get last x matches of home and away team separately
    matches_home_team = get_last_matches(matches, date, home_team, last_ind_x)
    matches_away_team = get_last_matches(matches, date, away_team, last_ind_x)

    #Get last x matches of both teams against each other
    last_matches_against = get_last_matches_against_eachother(
        matches, date, home_team, away_team, last_against_x)

    #Create goal variables
    #getting stats from the matches filtered above
    home_goals = get_goals(matches_home_team, home_team)
    away_goals = get_goals(matches_away_team, away_team)
    home_goals_conceided = get_goals_conceided(matches_home_team, home_team)
    away_goals_conceided = get_goals_conceided(matches_away_team, away_team)

    #Define result data frame
    result = pd.DataFrame()

    #Define ID features
    result.loc[0, 'match_api_id'] = match.match_api_id
    result.loc[0, 'league_id'] = match.league_id

    #Create match features
    #from the individual matches, extracting goal difference and games won
    result.loc[
        0, 'home_team_goals_difference'] = home_goals - home_goals_conceided
    result.loc[
        0, 'away_team_goals_difference'] = away_goals - away_goals_conceided
    result.loc[0, 'games_won_home_team'] = get_wins(matches_home_team,
                                                    home_team)
    result.loc[0, 'games_won_away_team'] = get_wins(matches_away_team,
                                                    away_team)
    #from the past tie-ups how many times home team has won and lost
    result.loc[0, 'games_against_won_home'] = get_wins(last_matches_against,
                                                       home_team)
    result.loc[0, 'games_against_lost_home'] = get_wins(
        last_matches_against, away_team)

    #Return match features
    return result.loc[0]

In [10]:
def convert_odds_to_prob(match_odds):
    ''' 
    Converts bookkeeper odds to probabilities. 
    match_odds: odds for the specific bookkeeper for win, loss and draw
    output: return the probability derived from this odd so that it sums to 1
    '''

    #Define variables
    match_id = match_odds.loc[:, 'match_api_id']
    bookkeeper = match_odds.loc[:, 'bookkeeper']
    win_odd = match_odds.loc[:, 'Win']
    draw_odd = match_odds.loc[:, 'Draw']
    loss_odd = match_odds.loc[:, 'Defeat']

    #Converts odds to prob
    win_prob = 1 / win_odd
    draw_prob = 1 / draw_odd
    loss_prob = 1 / loss_odd

    total_prob = win_prob + draw_prob + loss_prob

    probs = pd.DataFrame()

    #Define output format and scale probs by sum over all probs
    probs.loc[:, 'match_api_id'] = match_id
    probs.loc[:, 'bookkeeper'] = bookkeeper
    probs.loc[:, 'Win'] = win_prob / total_prob
    probs.loc[:, 'Draw'] = draw_prob / total_prob
    probs.loc[:, 'Defeat'] = loss_prob / total_prob

    #Return probs and meta data
    return probs

In [11]:
def get_bookkeeper_data(matches, bookkeepers, horizontal=True):
    ''' Aggregates bookkeeper data for all matches and bookkeepers. '''

    bk_data = pd.DataFrame()

    #Loop through bookkeepers
    for bookkeeper in bookkeepers:

        #Find columns containing data of bookkeeper
        #pull columns for the bookkeeper
        temp_data = matches.loc[:, (matches.columns.str.contains(bookkeeper))]
        #adding a column containing abbreviation of bookkeeper
        temp_data.loc[:, 'bookkeeper'] = str(bookkeeper)
        #adding column for the match ID
        temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id']

        #Rename odds columns and convert to numeric
        #renaming columns
        cols = temp_data.columns.values
        cols[:3] = ['Win', 'Draw', 'Defeat']
        temp_data.columns = cols
        temp_data.loc[:, 'Win'] = pd.to_numeric(temp_data['Win'])
        temp_data.loc[:, 'Draw'] = pd.to_numeric(temp_data['Draw'])
        temp_data.loc[:, 'Defeat'] = pd.to_numeric(temp_data['Defeat'])

        #Check if data should be aggregated horizontally
        if (horizontal == True):

            #Convert data to probs
            temp_data = convert_odds_to_prob(temp_data)
            temp_data.drop('match_api_id', axis=1, inplace=True)
            temp_data.drop('bookkeeper', axis=1, inplace=True)

            #Rename columns with bookkeeper names
            win_name = bookkeeper + "_" + "Win"
            draw_name = bookkeeper + "_" + "Draw"
            defeat_name = bookkeeper + "_" + "Defeat"
            temp_data.columns.values[:3] = [win_name, draw_name, defeat_name]

            #Aggregate data
            bk_data = pd.concat([bk_data, temp_data], axis=1)
        else:
            #Aggregate vertically
            bk_data = bk_data.append(temp_data, ignore_index=True)

    #If horizontal add match api id to data
    if (horizontal == True):
        temp_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id']

    #Return bookkeeper data
    return bk_data

In [12]:
def create_features(matches,
                    bk_list,
                    last_ind_x=10,
                    last_against_x=3,
                    verbose=True,
                    dropna=False):
    ''' 
    Create and aggregate features and labels for all matches. 
    matches: match data
    bk_list: list of bookkeepers to be considered
    last_ind_x: how many matches from the past to be considered for individual team
    last_against_x: how many matches from the past to be considered for both teams clashes
    dropna: True if rows with NA have to be dropped
    '''
    if verbose == True:
        print("Generating match features...")
    start = time()

    #Get match features for all matches
    match_stats = matches.apply(
        lambda x: get_match_features(x, matches, last_ind_x, last_against_x),
        axis=1)
    end = time()
    if verbose == True:
        print("Match features generated in {:.1f} minutes".format(
            (end - start) / 60))

    if verbose == True:
        print("Generating match labels...")
    start = time()

    #Create match labels
    matches['outcome'] = np.sign(
        matches.home_team_goal - matches.away_team_goal)
    end = time()
    if verbose == True:
        print("Match labels generated in {:.1f} minutes".format(
            (end - start) / 60))

    if verbose == True:
        print("Generating bookkeeper data...")
    start = time()

    #Get bookkeeper quotas for all matches
    #not required for now
    #     bk_data = get_bookkeeper_data(matches, bookkeepers, horizontal=True)
    #     bk_data.loc[:, 'match_api_id'] = matches.loc[:, 'match_api_id']
    #     end = time()
    #     if verbose == True:
    #         print("Bookkeeper data generated in {:.1f} minutes".format(
    #             (end - start) / 60))

    #Merges features and labels into one frame
    features = pd.merge(matches, match_stats, on='match_api_id', how='left')
    #     features = pd.merge(features, bk_data, on='match_api_id', how='left')
    #     feables = pd.merge(features, labels, on='match_api_id', how='left')

    #Drop NA values
    if dropna: features.dropna(inplace=True)

    #Return preprocessed data
    return features

### Loading data

In [19]:
start = time()
## Fetching data
#Connecting to database
path = "data/deep_odds/"  #path
database = path + 'database.sqlite'
conn = sqlite3.connect(database)

#Fetching required data tables
player_data = pd.read_sql("SELECT * FROM Player;", conn)
team_data = pd.read_sql("SELECT * FROM Team;", conn)
country = pd.read_sql("SELECT * FROM Country;", conn)
league = pd.read_sql("SELECT * FROM League;", conn)
team_stats_data = pd.read_sql("SELECT * FROM Team_Attributes;", conn)
match_data = pd.read_sql("SELECT * FROM Match;", conn)
player_stats_data = pd.read_sql("SELECT * FROM Player_Attributes;", conn)

Looking at the tables

In [17]:
match_data.head(2)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.73,3.4,5.0,1.75,3.35,4.2,1.85,3.2,3.5,1.8,3.3,3.75,,,,1.7,3.3,4.33,1.9,3.3,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.95,3.2,3.6,1.8,3.3,3.95,1.9,3.2,3.5,1.9,3.2,3.5,,,,1.83,3.3,3.6,1.95,3.3,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6


In [9]:
match_data.shape

(25979, 115)

In [10]:
player_data.head(2)

Unnamed: 0,id,player_api_id,player_name,player_fifa_api_id,birthday,height,weight
0,1,505942,Aaron Appindangoye,218353,1992-02-29 00:00:00,182.88,187
1,2,155782,Aaron Cresswell,189615,1989-12-15 00:00:00,170.18,146


In [12]:
player_data.shape

(11060, 7)

We have data for 11060 players

In [18]:
player_stats_data.head(2)

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,preferred_foot,attacking_work_rate,defensive_work_rate,crossing,finishing,heading_accuracy,short_passing,volleys,dribbling,curve,free_kick_accuracy,long_passing,ball_control,acceleration,sprint_speed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,aggression,interceptions,positioning,vision,penalties,marking,standing_tackle,sliding_tackle,gk_diving,gk_handling,gk_kicking,gk_positioning,gk_reflexes
0,1,218353,505942,2016-02-18 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0
1,2,218353,505942,2015-11-19 00:00:00,67.0,71.0,right,medium,medium,49.0,44.0,71.0,61.0,44.0,51.0,45.0,39.0,64.0,49.0,60.0,64.0,59.0,47.0,65.0,55.0,58.0,54.0,76.0,35.0,71.0,70.0,45.0,54.0,48.0,65.0,69.0,69.0,6.0,11.0,10.0,8.0,8.0


In [24]:
player_stats_data.shape

(183978, 42)

In [24]:
league.head()

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League
2,4769,4769,France Ligue 1
3,7809,7809,Germany 1. Bundesliga
4,10257,10257,Italy Serie A


In [26]:
team_data.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


In [27]:
team_stats_data.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,chanceCreationCrossing,chanceCreationCrossingClass,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,Organised,60,Normal,65,Normal,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,Organised,54,Normal,63,Normal,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
2,3,434,9930,2015-09-10 00:00:00,47,Balanced,41.0,Normal,54,Mixed,Organised,54,Normal,63,Normal,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover
3,4,77,8485,2010-02-22 00:00:00,70,Fast,,Little,70,Long,Organised,70,Risky,70,Lots,70,Lots,Organised,60,Medium,70,Double,70,Wide,Cover
4,5,77,8485,2011-02-22 00:00:00,47,Balanced,,Little,52,Mixed,Organised,53,Normal,48,Normal,52,Normal,Organised,47,Medium,47,Press,52,Normal,Cover


In [28]:
match_data.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.73,3.4,5.0,1.75,3.35,4.2,1.85,3.2,3.5,1.8,3.3,3.75,,,,1.7,3.3,4.33,1.9,3.3,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.95,3.2,3.6,1.8,3.3,3.95,1.9,3.2,3.5,1.9,3.2,3.5,,,,1.83,3.3,3.6,1.95,3.3,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.38,3.3,2.75,2.4,3.3,2.55,2.6,3.1,2.3,2.5,3.2,2.5,,,,2.5,3.25,2.4,2.63,3.3,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.44,3.75,7.5,1.4,4.0,6.8,1.4,3.9,6.0,1.44,3.6,6.5,,,,1.44,3.75,6.0,1.44,4.0,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,3.5,1.65,5.0,3.5,1.6,4.0,3.3,1.7,4.0,3.4,1.72,,,,4.2,3.4,1.7,4.5,3.5,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [23]:
player_stats_data.groupby(by = 'player_api_id').first().shape

(11060, 41)

players stats are pulled from FIFA game which is updated roughly every 6 months

In [13]:
team_data.head(2)

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC


In [14]:
team_data.shape

(299, 5)

We have data for 299 teams in our db

In [19]:
team_stats_data.head(2)

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,chanceCreationCrossing,chanceCreationCrossingClass,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22 00:00:00,60,Balanced,,Little,50,Mixed,Organised,60,Normal,65,Normal,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover
1,2,434,9930,2014-09-19 00:00:00,52,Balanced,48.0,Normal,56,Mixed,Organised,54,Normal,63,Normal,64,Normal,Organised,47,Medium,44,Press,54,Normal,Cover


In [20]:
team_stats_data.shape

(1458, 25)

In [196]:
team_stats_data.isnull().sum()

id                                  0
team_fifa_api_id                    0
team_api_id                         0
date                                0
buildUpPlaySpeed                    0
buildUpPlaySpeedClass               0
buildUpPlayDribbling              969
buildUpPlayDribblingClass           0
buildUpPlayPassing                  0
buildUpPlayPassingClass             0
buildUpPlayPositioningClass         0
chanceCreationPassing               0
chanceCreationPassingClass          0
chanceCreationCrossing              0
chanceCreationCrossingClass         0
chanceCreationShooting              0
chanceCreationShootingClass         0
chanceCreationPositioningClass      0
defencePressure                     0
defencePressureClass                0
defenceAggression                   0
defenceAggressionClass              0
defenceTeamWidth                    0
defenceTeamWidthClass               0
defenceDefenderLineClass            0
dtype: int64

In [197]:
team_stats_data.shape

(1458, 25)

In [20]:
country.head()

Unnamed: 0,id,name
0,1,Belgium
1,1729,England
2,4769,France
3,7809,Germany
4,10257,Italy


In [21]:
country.shape

(11, 2)

In [22]:
league.head(2)

Unnamed: 0,id,country_id,name
0,1,1,Belgium Jupiler League
1,1729,1729,England Premier League


In [23]:
league.shape

(11, 3)

We have data for 11 leagues

In [107]:
#converting date column to correct date format
match_data['date'] = pd.to_datetime(match_data['date'])

player_stats_data['date'] = pd.to_datetime(player_stats_data.date)

In [108]:
# removing rows with null for the selected columns

#Reduce match data to fulfill run time requirements
rows = [
    "country_id", "league_id", "season", "stage", "date", "match_api_id",
    "home_team_api_id", "away_team_api_id", "home_team_goal", "away_team_goal",
    "home_player_1", "home_player_2", "home_player_3", "home_player_4",
    "home_player_5", "home_player_6", "home_player_7", "home_player_8",
    "home_player_9", "home_player_10", "home_player_11", "away_player_1",
    "away_player_2", "away_player_3", "away_player_4", "away_player_5",
    "away_player_6", "away_player_7", "away_player_8", "away_player_9",
    "away_player_10", "away_player_11"
]
match_data.dropna(
    subset=rows, inplace=True
)  #removing rows with nulls in the given columns: roughly 4000 rows

In [109]:
match_data.shape

(21374, 115)

In [110]:
#filtering data for spanish, german and english leagues

main_leagues = [1729, 7809, 21518]  #league ids

match_data = match_data.loc[match_data.league_id.isin(main_leagues)]

In [111]:
match_data.shape

(8045, 115)

### Creating features

In [None]:
# Pulling statistics for players involved in the match
#this cell takes a lot of time to run
#sorting player stats data based on date
player_stats_data = player_stats_data.sort_values('date', ascending=False)

columns = players + ['date', 'match_api_id']

#columns in match data containing the ID of players
players = [
    'home_player_1', 'home_player_2', 'home_player_3', "home_player_4",
    "home_player_5", "home_player_6", "home_player_7", "home_player_8",
    "home_player_9", "home_player_10", "home_player_11", "away_player_1",
    "away_player_2", "away_player_3", "away_player_4", "away_player_5",
    "away_player_6", "away_player_7", "away_player_8", "away_player_9",
    "away_player_10", "away_player_11"
]

feature_list = [
    'player_api_id', 'overall_rating', 'potential', 'preferred_foot',
    'attacking_work_rate', 'defensive_work_rate', 'crossing', 'finishing',
    'heading_accuracy', 'short_passing', 'volleys', 'dribbling', 'curve',
    'free_kick_accuracy', 'long_passing', 'ball_control', 'acceleration',
    'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping',
    'stamina', 'strength', 'long_shots', 'aggression', 'interceptions',
    'positioning', 'vision', 'penalties', 'marking', 'standing_tackle',
    'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking',
    'gk_positioning', 'gk_reflexes'
]

cols = []
for i in itertools.product(players, feature_list[1:]):
    cols.append('_'.join(i))

fifa_stats = pd.DataFrame()
for _, match in match_data.iterrows():
    first_match = match[
        columns]  #filter the required columns from each row from match_data
    player_idxs = first_match[players].values
    date = first_match.date
    match_id = first_match.match_api_id
    sub_data = player_stats_data[player_stats_data.date < date]
    player_stats = sub_data[sub_data['player_api_id'].isin(player_idxs)].\
        groupby('player_api_id').first().reset_index()
    player_idxs = pd.DataFrame({'player_api_id': player_idxs})
    df = pd.merge(
        player_idxs,
        player_stats[feature_list],
        how='left',
        on='player_api_id')
    df = df.drop('player_api_id', axis=1)
    df = pd.Series(df.values.flatten())
    df.index = cols
    match_stats_df = pd.DataFrame(df).T
    match_stats_df['match_api_id'] = match_id
    fifa_stats = pd.concat([fifa_stats, match_stats_df])

match_data_new = pd.merge(
    match_data, fifa_stats, how='left', on='match_api_id', copy=False)

In [None]:
match_data_new.to_feather('data/deep_odds/match_data_new')

match_data_new.to_csv('data/deep_odds/match_data_new.csv', index=False)

In [112]:
match_data_new = pd.read_feather('data/deep_odds/match_data_new')

In [114]:
match_data_new.shape

(8045, 951)

In [147]:
match_data_new.head(1)

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,...,away_player_9_ball_control,away_player_9_acceleration,away_player_9_sprint_speed,away_player_9_agility,away_player_9_reactions,away_player_9_balance,away_player_9_shot_power,away_player_9_jumping,away_player_9_stamina,away_player_9_strength,away_player_9_long_shots,away_player_9_aggression,away_player_9_interceptions,away_player_9_positioning,away_player_9_vision,away_player_9_penalties,away_player_9_marking,away_player_9_standing_tackle,away_player_9_sliding_tackle,away_player_9_gk_diving,away_player_9_gk_handling,away_player_9_gk_kicking,away_player_9_gk_positioning,away_player_9_gk_reflexes,away_player_10_overall_rating,away_player_10_potential,away_player_10_preferred_foot,away_player_10_attacking_work_rate,away_player_10_defensive_work_rate,away_player_10_crossing,away_player_10_finishing,away_player_10_heading_accuracy,away_player_10_short_passing,away_player_10_volleys,away_player_10_dribbling,away_player_10_curve,away_player_10_free_kick_accuracy,away_player_10_long_passing,away_player_10_ball_control,away_player_10_acceleration,away_player_10_sprint_speed,away_player_10_agility,away_player_10_reactions,away_player_10_balance,away_player_10_shot_power,away_player_10_jumping,away_player_10_stamina,away_player_10_strength,away_player_10_long_shots,away_player_10_aggression,away_player_10_interceptions,away_player_10_positioning,away_player_10_vision,away_player_10_penalties,away_player_10_marking,away_player_10_standing_tackle,away_player_10_sliding_tackle,away_player_10_gk_diving,away_player_10_gk_handling,away_player_10_gk_kicking,away_player_10_gk_positioning,away_player_10_gk_reflexes,away_player_11_overall_rating,away_player_11_potential,away_player_11_preferred_foot,away_player_11_attacking_work_rate,away_player_11_defensive_work_rate,away_player_11_crossing,away_player_11_finishing,away_player_11_heading_accuracy,away_player_11_short_passing,away_player_11_volleys,away_player_11_dribbling,away_player_11_curve,away_player_11_free_kick_accuracy,away_player_11_long_passing,away_player_11_ball_control,away_player_11_acceleration,away_player_11_sprint_speed,away_player_11_agility,away_player_11_reactions,away_player_11_balance,away_player_11_shot_power,away_player_11_jumping,away_player_11_stamina,away_player_11_strength,away_player_11_long_shots,away_player_11_aggression,away_player_11_interceptions,away_player_11_positioning,away_player_11_vision,away_player_11_penalties,away_player_11_marking,away_player_11_standing_tackle,away_player_11_sliding_tackle,away_player_11_gk_diving,away_player_11_gk_handling,away_player_11_gk_kicking,away_player_11_gk_positioning,away_player_11_gk_reflexes
0,1729,1729,1729,2008/2009,1,2008-08-17,489042,10260,10261,1,1,1.0,2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0,4.0,6.0,1.0,2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0,5.0,5.0,1.0,3.0,3.0,3.0,3.0,7.0,7.0,7.0,7.0,10.0,10.0,1.0,3.0,3.0,3.0,3.0,7.0,7.0,7.0,7.0,9.0,11.0,30726.0,30362.0,30620.0,30865.0,32569.0,24148.0,34944.0,30373.0,24154.0,24157.0,30829.0,24224.0,25518.0,24228.0,30929.0,29581.0,38807.0,40565.0,30360.0,33852.0,34574.0,37799.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>56</comment><event...,1.29,5.5,11.0,1.3,4.75,8.25,1.3,4.4,8.5,1.25,4.5,10.0,,,,...,75.0,68.0,74.0,77.0,71.0,68.0,58.0,61.0,78.0,74.0,51.0,57.0,73.0,51.0,78.0,60.0,21.0,54.0,60.0,15.0,23.0,77.0,23.0,23.0,76.0,82.0,left,medium,medium,70.0,67.0,52.0,72.0,80.0,80.0,68.0,59.0,61.0,77.0,84.0,85.0,83.0,70.0,83.0,59.0,51.0,74.0,65.0,64.0,35.0,69.0,70.0,75.0,74.0,38.0,45.0,42.0,13.0,20.0,61.0,20.0,20.0,83.0,85.0,left,medium,low,52.0,77.0,77.0,73.0,83.0,84.0,44.0,44.0,48.0,79.0,96.0,95.0,84.0,88.0,78.0,86.0,79.0,82.0,84.0,74.0,77.0,76.0,79.0,70.0,80.0,26.0,40.0,20.0,12.0,22.0,48.0,22.0,22.0


In [159]:
#bookkeepers odd
bk_abb = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']
bk_abb_select = ['B365', 'BW']
features = create_features(
    match_data_new,
    bk_abb_select,
    last_ind_x=10,
    last_against_x=3,
    verbose=True,
    dropna=False)

Generating match features...
Match features generated in 8.9 minutes
Generating match labels...
Match labels generated in 0.0 minutes
Generating bookkeeper data...


In [160]:
features.shape

(8045, 959)

Removing betting odds data

In [167]:
bk_cols

['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']

In [177]:
#storing bookkeepers data separately
bk_data = pd.DataFrame()
for bk in bk_cols:
    cols_to_drop = features.columns.str.contains(bk)
    tmp = features.loc[:, (cols_to_drop)]
    bk_data = pd.concat((bk_data, tmp), axis=1)
    features = features[features.columns.drop(list(features.filter(regex=bk)))]

In [311]:
bk_data['match_api_id'] = data_final.match_api_id

In [312]:
bk_data.to_feather('data/deep_odds/bookkeeper-data')

In [178]:
bk_data.head()

Unnamed: 0,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH,SJD,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1.29,5.5,11.0,1.3,4.75,8.25,1.3,4.4,8.5,1.25,4.5,10.0,,,,1.25,4.5,10.0,1.25,5.0,10.0,1.28,5.5,12.0,1.3,4.75,10.0,1.29,4.5,11.0
1,1.2,6.5,15.0,1.22,5.5,10.0,1.2,5.2,11.0,1.2,5.0,11.0,,,,1.17,5.5,12.0,1.2,5.5,12.0,1.25,6.0,13.0,1.22,5.5,13.0,1.22,5.0,13.0
2,5.5,3.6,1.67,5.0,3.35,1.67,4.5,3.5,1.65,4.5,3.3,1.67,,,,5.5,3.3,1.57,4.33,3.4,1.73,5.5,3.8,1.65,5.0,3.4,1.7,4.5,3.4,1.73
3,1.91,3.4,4.2,1.9,3.2,3.8,1.8,3.3,3.8,1.8,3.2,4.0,,,,1.83,3.2,3.75,1.91,3.25,3.75,1.9,3.5,4.35,1.91,3.25,4.0,1.91,3.25,3.8
4,2.0,3.3,4.0,1.85,3.25,4.0,2.0,3.2,3.3,1.8,3.2,4.0,,,,1.95,3.1,3.5,2.0,3.25,3.4,2.05,3.3,4.0,2.0,3.25,3.75,2.0,3.25,3.5


In [180]:
features.head(2)

Unnamed: 0,id,country_id,league_id_x,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,home_player_1_overall_rating,home_player_1_potential,home_player_1_preferred_foot,home_player_1_attacking_work_rate,home_player_1_defensive_work_rate,home_player_1_crossing,home_player_1_finishing,home_player_1_heading_accuracy,home_player_1_short_passing,home_player_1_volleys,home_player_1_dribbling,home_player_1_curve,home_player_1_free_kick_accuracy,home_player_1_long_passing,home_player_1_ball_control,...,away_player_9_stamina,away_player_9_strength,away_player_9_long_shots,away_player_9_aggression,away_player_9_interceptions,away_player_9_positioning,away_player_9_vision,away_player_9_penalties,away_player_9_marking,away_player_9_standing_tackle,away_player_9_sliding_tackle,away_player_9_gk_diving,away_player_9_gk_handling,away_player_9_gk_kicking,away_player_9_gk_positioning,away_player_9_gk_reflexes,away_player_10_overall_rating,away_player_10_potential,away_player_10_preferred_foot,away_player_10_attacking_work_rate,away_player_10_defensive_work_rate,away_player_10_crossing,away_player_10_finishing,away_player_10_heading_accuracy,away_player_10_short_passing,away_player_10_volleys,away_player_10_dribbling,away_player_10_curve,away_player_10_free_kick_accuracy,away_player_10_long_passing,away_player_10_ball_control,away_player_10_acceleration,away_player_10_sprint_speed,away_player_10_agility,away_player_10_reactions,away_player_10_balance,away_player_10_shot_power,away_player_10_jumping,away_player_10_stamina,away_player_10_strength,away_player_10_long_shots,away_player_10_aggression,away_player_10_interceptions,away_player_10_positioning,away_player_10_vision,away_player_10_penalties,away_player_10_marking,away_player_10_standing_tackle,away_player_10_sliding_tackle,away_player_10_gk_diving,away_player_10_gk_handling,away_player_10_gk_kicking,away_player_10_gk_positioning,away_player_10_gk_reflexes,away_player_11_overall_rating,away_player_11_potential,away_player_11_preferred_foot,away_player_11_attacking_work_rate,away_player_11_defensive_work_rate,away_player_11_crossing,away_player_11_finishing,away_player_11_heading_accuracy,away_player_11_short_passing,away_player_11_volleys,away_player_11_dribbling,away_player_11_curve,away_player_11_free_kick_accuracy,away_player_11_long_passing,away_player_11_ball_control,away_player_11_acceleration,away_player_11_sprint_speed,away_player_11_agility,away_player_11_reactions,away_player_11_balance,away_player_11_shot_power,away_player_11_jumping,away_player_11_stamina,away_player_11_strength,away_player_11_long_shots,away_player_11_aggression,away_player_11_interceptions,away_player_11_positioning,away_player_11_vision,away_player_11_penalties,away_player_11_marking,away_player_11_standing_tackle,away_player_11_sliding_tackle,away_player_11_gk_diving,away_player_11_gk_handling,away_player_11_gk_kicking,away_player_11_gk_positioning,away_player_11_gk_reflexes,outcome,league_id_y,home_team_goals_difference,away_team_goals_difference,games_won_home_team,games_won_away_team,games_against_won_home,games_against_lost_home
0,1729,1729,1729,2008/2009,1,2008-08-17,489042,10260,10261,1,1,1.0,2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0,4.0,6.0,1.0,2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0,5.0,5.0,1.0,3.0,3.0,3.0,3.0,7.0,7.0,7.0,7.0,10.0,10.0,1.0,3.0,3.0,3.0,3.0,7.0,7.0,7.0,7.0,9.0,11.0,30726.0,30362.0,30620.0,30865.0,32569.0,24148.0,34944.0,30373.0,24154.0,24157.0,30829.0,24224.0,25518.0,24228.0,30929.0,29581.0,38807.0,40565.0,30360.0,33852.0,34574.0,37799.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>56</comment><event...,85.0,88.0,right,y,es,20.0,20.0,20.0,27.0,12.0,20.0,12.0,11.0,85.0,22.0,...,78.0,74.0,51.0,57.0,73.0,51.0,78.0,60.0,21.0,54.0,60.0,15.0,23.0,77.0,23.0,23.0,76.0,82.0,left,medium,medium,70.0,67.0,52.0,72.0,80.0,80.0,68.0,59.0,61.0,77.0,84.0,85.0,83.0,70.0,83.0,59.0,51.0,74.0,65.0,64.0,35.0,69.0,70.0,75.0,74.0,38.0,45.0,42.0,13.0,20.0,61.0,20.0,20.0,83.0,85.0,left,medium,low,52.0,77.0,77.0,73.0,83.0,84.0,44.0,44.0,48.0,79.0,96.0,95.0,84.0,88.0,78.0,86.0,79.0,82.0,84.0,74.0,77.0,76.0,79.0,70.0,80.0,26.0,40.0,20.0,12.0,22.0,48.0,22.0,22.0,0,1729.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1730,1729,1729,2008/2009,1,2008-08-16,489043,9825,8659,1,0,1.0,2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0,4.0,6.0,1.0,2.0,4.0,6.0,8.0,5.0,7.0,9.0,1.0,3.0,5.0,1.0,3.0,3.0,3.0,3.0,7.0,7.0,7.0,7.0,10.0,10.0,1.0,3.0,3.0,3.0,3.0,7.0,7.0,7.0,7.0,7.0,11.0,23686.0,26111.0,38835.0,30986.0,31291.0,31013.0,30935.0,39297.0,26181.0,30960.0,36410.0,36373.0,36832.0,23115.0,37280.0,24728.0,24664.0,31088.0,23257.0,24171.0,25922.0,27267.0,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card />,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>65</comment><event...,75.0,78.0,right,medium,medium,22.0,22.0,22.0,22.0,10.0,22.0,11.0,19.0,77.0,22.0,...,75.0,56.0,46.0,70.0,70.0,71.0,67.0,65.0,56.0,60.0,52.0,5.0,21.0,62.0,21.0,21.0,74.0,75.0,left,high,medium,66.0,58.0,64.0,75.0,57.0,68.0,58.0,63.0,64.0,75.0,81.0,79.0,66.0,71.0,70.0,64.0,70.0,79.0,67.0,70.0,64.0,76.0,72.0,70.0,73.0,72.0,73.0,66.0,7.0,21.0,64.0,21.0,21.0,68.0,78.0,right,medium,medium,50.0,71.0,68.0,55.0,64.0,67.0,41.0,38.0,35.0,66.0,81.0,84.0,68.0,59.0,71.0,70.0,76.0,64.0,80.0,45.0,66.0,59.0,63.0,54.0,57.0,22.0,31.0,20.0,10.0,22.0,35.0,22.0,22.0,1,1729.0,0.0,0.0,0.0,0.0,0.0,0.0


In [182]:
cols_float = [c for c in features.columns if is_float_dtype(features[c])]

Filling null values with -1. Alternate option could be to remove those rows

In [189]:
features.to_feather('data/deep_odds/features_with_na')

In [190]:
features.fillna(value=-1, inplace=True)

In [193]:
match_data_final = type_conversion(
    features, dt_col=['date'], conv_to_int=cols_float)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8045 entries, 0 to 8044
Columns: 929 entries, id to games_against_lost_home
dtypes: datetime64[ns](1), float64(843), int64(10), object(75)
memory usage: 250.7 MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8045 entries, 0 to 8044
Columns: 929 entries, id to games_against_lost_home
dtypes: category(75), datetime64[ns](1), int8(159), uint16(6), uint32(23), uint8(665)
memory usage: 174.7 MB
None
Columns with nulls []


In [194]:
match_data_final.head()

Unnamed: 0,id,country_id,league_id_x,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,home_player_X4,home_player_X5,home_player_X6,home_player_X7,home_player_X8,home_player_X9,home_player_X10,home_player_X11,away_player_X1,away_player_X2,away_player_X3,away_player_X4,away_player_X5,away_player_X6,away_player_X7,away_player_X8,away_player_X9,away_player_X10,away_player_X11,home_player_Y1,home_player_Y2,home_player_Y3,home_player_Y4,home_player_Y5,home_player_Y6,home_player_Y7,home_player_Y8,home_player_Y9,home_player_Y10,home_player_Y11,away_player_Y1,away_player_Y2,away_player_Y3,away_player_Y4,away_player_Y5,away_player_Y6,away_player_Y7,away_player_Y8,away_player_Y9,away_player_Y10,away_player_Y11,home_player_1,home_player_2,home_player_3,home_player_4,home_player_5,home_player_6,home_player_7,home_player_8,home_player_9,home_player_10,home_player_11,away_player_1,away_player_2,away_player_3,away_player_4,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,goal,shoton,shotoff,foulcommit,card,cross,corner,possession,home_player_1_overall_rating,home_player_1_potential,home_player_1_preferred_foot,home_player_1_attacking_work_rate,home_player_1_defensive_work_rate,home_player_1_crossing,home_player_1_finishing,home_player_1_heading_accuracy,home_player_1_short_passing,home_player_1_volleys,home_player_1_dribbling,home_player_1_curve,home_player_1_free_kick_accuracy,home_player_1_long_passing,home_player_1_ball_control,...,away_player_9_stamina,away_player_9_strength,away_player_9_long_shots,away_player_9_aggression,away_player_9_interceptions,away_player_9_positioning,away_player_9_vision,away_player_9_penalties,away_player_9_marking,away_player_9_standing_tackle,away_player_9_sliding_tackle,away_player_9_gk_diving,away_player_9_gk_handling,away_player_9_gk_kicking,away_player_9_gk_positioning,away_player_9_gk_reflexes,away_player_10_overall_rating,away_player_10_potential,away_player_10_preferred_foot,away_player_10_attacking_work_rate,away_player_10_defensive_work_rate,away_player_10_crossing,away_player_10_finishing,away_player_10_heading_accuracy,away_player_10_short_passing,away_player_10_volleys,away_player_10_dribbling,away_player_10_curve,away_player_10_free_kick_accuracy,away_player_10_long_passing,away_player_10_ball_control,away_player_10_acceleration,away_player_10_sprint_speed,away_player_10_agility,away_player_10_reactions,away_player_10_balance,away_player_10_shot_power,away_player_10_jumping,away_player_10_stamina,away_player_10_strength,away_player_10_long_shots,away_player_10_aggression,away_player_10_interceptions,away_player_10_positioning,away_player_10_vision,away_player_10_penalties,away_player_10_marking,away_player_10_standing_tackle,away_player_10_sliding_tackle,away_player_10_gk_diving,away_player_10_gk_handling,away_player_10_gk_kicking,away_player_10_gk_positioning,away_player_10_gk_reflexes,away_player_11_overall_rating,away_player_11_potential,away_player_11_preferred_foot,away_player_11_attacking_work_rate,away_player_11_defensive_work_rate,away_player_11_crossing,away_player_11_finishing,away_player_11_heading_accuracy,away_player_11_short_passing,away_player_11_volleys,away_player_11_dribbling,away_player_11_curve,away_player_11_free_kick_accuracy,away_player_11_long_passing,away_player_11_ball_control,away_player_11_acceleration,away_player_11_sprint_speed,away_player_11_agility,away_player_11_reactions,away_player_11_balance,away_player_11_shot_power,away_player_11_jumping,away_player_11_stamina,away_player_11_strength,away_player_11_long_shots,away_player_11_aggression,away_player_11_interceptions,away_player_11_positioning,away_player_11_vision,away_player_11_penalties,away_player_11_marking,away_player_11_standing_tackle,away_player_11_sliding_tackle,away_player_11_gk_diving,away_player_11_gk_handling,away_player_11_gk_kicking,away_player_11_gk_positioning,away_player_11_gk_reflexes,outcome,league_id_y,home_team_goals_difference,away_team_goals_difference,games_won_home_team,games_won_away_team,games_against_won_home,games_against_lost_home
0,1729,1729,1729,2008/2009,1,2008-08-17,489042,10260,10261,1,1,1,2,4,6,8,2,4,6,8,4,6,1,2,4,6,8,2,4,6,8,5,5,1,3,3,3,3,7,7,7,7,10,10,1,3,3,3,3,7,7,7,7,9,11,30726,30362,30620,30865,32569,24148,34944,30373,24154,24157,30829,24224,25518,24228,30929,29581,38807,40565,30360,33852,34574,37799,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>56</comment><event...,85,88,right,y,es,20,20,20,27,12,20,12,11,85,22,...,78,74,51,57,73,51,78,60,21,54,60,15,23,77,23,23,76,82,left,medium,medium,70,67,52,72,80,80,68,59,61,77,84,85,83,70,83,59,51,74,65,64,35,69,70,75,74,38,45,42,13,20,61,20,20,83,85,left,medium,low,52,77,77,73,83,84,44,44,48,79,96,95,84,88,78,86,79,82,84,74,77,76,79,70,80,26,40,20,12,22,48,22,22,0,1729,0,0,0,0,0,0
1,1730,1729,1729,2008/2009,1,2008-08-16,489043,9825,8659,1,0,1,2,4,6,8,2,4,6,8,4,6,1,2,4,6,8,5,7,9,1,3,5,1,3,3,3,3,7,7,7,7,10,10,1,3,3,3,3,7,7,7,7,7,11,23686,26111,38835,30986,31291,31013,30935,39297,26181,30960,36410,36373,36832,23115,37280,24728,24664,31088,23257,24171,25922,27267,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card />,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>65</comment><event...,75,78,right,medium,medium,22,22,22,22,10,22,11,19,77,22,...,75,56,46,70,70,71,67,65,56,60,52,5,21,62,21,21,74,75,left,high,medium,66,58,64,75,57,68,58,63,64,75,81,79,66,71,70,64,70,79,67,70,64,76,72,70,73,72,73,66,7,21,64,21,21,68,78,right,medium,medium,50,71,68,55,64,67,41,38,35,66,81,84,68,59,71,70,76,64,80,45,66,59,63,54,57,22,31,20,10,22,35,22,22,1,1729,0,0,0,0,0,0
2,1731,1729,1729,2008/2009,1,2008-08-16,489044,8472,8650,0,1,1,2,4,6,8,2,4,6,8,4,6,1,2,4,6,8,2,4,6,8,4,6,1,3,3,3,3,7,7,7,7,10,10,1,3,3,3,3,7,7,7,7,10,10,32562,38836,24446,24408,36786,38802,24655,17866,30352,23927,24410,30660,37442,30617,24134,414792,37139,30618,40701,24800,24635,30853,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><blocked>1</blocked></st...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>45</comment><event...,78,85,left,medium,medium,22,21,21,21,9,21,9,17,74,23,...,80,44,69,42,80,82,84,70,38,49,41,9,22,58,22,22,84,84,right,high,high,69,88,62,75,84,85,73,52,48,85,83,81,69,88,82,81,65,76,67,76,59,80,84,83,85,27,25,16,14,21,48,21,21,87,90,right,high,low,69,88,78,78,90,87,79,68,62,87,91,90,85,89,78,86,82,78,76,75,63,79,89,81,86,21,24,29,10,20,62,20,20,-1,1729,0,0,0,0,0,0
3,1732,1729,1729,2008/2009,1,2008-08-16,489045,8654,8528,2,1,1,2,4,6,8,2,4,6,8,4,6,1,2,6,8,4,2,4,6,8,4,6,1,3,3,3,3,7,7,7,7,10,10,1,3,3,3,3,7,7,7,7,10,10,36374,30966,23818,37277,30687,36394,37169,24223,24773,34543,23139,34421,34987,35472,111865,25005,35327,25150,97988,41877,127857,34466,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>50</comment><event...,77,80,right,medium,medium,20,20,20,25,11,20,15,18,78,20,...,76,78,79,68,67,70,68,79,8,12,7,12,14,60,5,11,71,78,right,medium,medium,56,75,72,59,66,64,59,56,26,75,77,75,66,60,65,72,76,61,77,59,59,62,69,61,64,25,21,25,7,8,15,11,11,77,78,right,high,high,74,69,83,68,75,72,68,38,47,73,74,79,50,65,71,86,91,78,94,56,79,71,69,81,64,58,45,52,9,21,47,21,21,1,1729,0,0,0,0,0,0
4,1734,1729,1729,2008/2009,1,2008-08-16,489047,8668,8655,2,3,1,2,4,6,8,1,3,5,7,9,5,1,2,4,6,8,4,6,8,2,6,4,1,3,3,3,3,7,7,7,7,7,11,1,3,3,3,3,7,7,7,7,10,10,31465,30371,24004,33086,30857,24011,109058,23268,24846,24006,24160,30622,37764,19020,23921,24136,30342,23889,23916,23922,34176,30646,<goal><value><comment>n</comment><stats><goals...,<shoton><value><stats><shoton>1</shoton></stat...,<shotoff><value><stats><shotoff>1</shotoff></s...,<foulcommit><value><stats><foulscommitted>1</f...,<card><value><comment>y</comment><stats><ycard...,<cross><value><stats><crosses>1</crosses></sta...,<corner><value><stats><corners>1</corners></st...,<possession><value><comment>46</comment><event...,79,81,right,medium,medium,20,24,20,23,21,21,9,17,69,20,...,85,76,82,77,71,74,76,68,44,69,72,14,22,78,22,22,76,78,right,high,low,28,79,74,48,65,70,68,46,27,69,81,82,67,80,80,75,74,77,92,74,81,68,68,70,67,33,33,32,8,21,27,21,21,79,82,right,medium,medium,62,77,86,75,81,78,79,51,41,78,74,80,72,69,79,83,89,74,78,72,72,68,71,77,80,27,34,30,6,21,41,21,21,-1,1729,0,0,0,0,0,0


adding team statistics data

In [198]:
match_data_final.columns

Index(['id', 'country_id', 'league_id_x', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id',
       'home_team_goal',
       ...
       'away_player_11_gk_positioning', 'away_player_11_gk_reflexes',
       'outcome', 'league_id_y', 'home_team_goals_difference',
       'away_team_goals_difference', 'games_won_home_team',
       'games_won_away_team', 'games_against_won_home',
       'games_against_lost_home'],
      dtype='object', length=929)

In [199]:
#pulling match id, date, home and away team ID
tmp = match_data_final.loc[:, [
    'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id'
]]

In [208]:
team_stats_data['date'] = pd.to_datetime(team_stats_data['date'])

In [209]:
team_stats_data = team_stats_data.sort_values('date', ascending=False)

In [210]:
team_stats_data.head()

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,chanceCreationCrossing,chanceCreationCrossingClass,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
1457,1458,15005,10000,2015-09-10,54,Balanced,42.0,Normal,51,Mixed,Organised,47,Normal,52,Normal,32,Little,Organised,44,Medium,58,Press,37,Normal,Cover
241,242,190,9858,2015-09-10,42,Balanced,56.0,Normal,43,Mixed,Organised,41,Normal,48,Normal,69,Lots,Organised,41,Medium,52,Press,57,Normal,Cover
253,254,1796,8191,2015-09-10,54,Balanced,32.0,Little,67,Long,Organised,72,Risky,57,Normal,47,Normal,Organised,63,Medium,62,Press,58,Normal,Cover
828,829,68,8550,2015-09-10,55,Balanced,48.0,Normal,49,Mixed,Organised,46,Normal,54,Normal,53,Normal,Organised,39,Medium,46,Press,42,Normal,Cover
834,835,12,8549,2015-09-10,60,Balanced,32.0,Little,46,Mixed,Organised,43,Normal,55,Normal,37,Normal,Organised,38,Medium,42,Press,42,Normal,Cover


In [227]:
date = '2010-08-17'
match_id = j.match_api_id
home = j.home_team_api_id
away = j.away_team_api_id
sub_data = team_stats_data[team_stats_data.date < date]
home_stats = sub_data[sub_data['team_api_id'] == home].groupby(
    'team_api_id').first().reset_index()
away_stats = sub_data[sub_data['team_api_id'] == away].groupby(
    'team_api_id').first().reset_index()

In [236]:
team_stats_data.columns

Index(['id', 'team_fifa_api_id', 'team_api_id', 'date', 'buildUpPlaySpeed',
       'buildUpPlaySpeedClass', 'buildUpPlayDribbling',
       'buildUpPlayDribblingClass', 'buildUpPlayPassing',
       'buildUpPlayPassingClass', 'buildUpPlayPositioningClass',
       'chanceCreationPassing', 'chanceCreationPassingClass',
       'chanceCreationCrossing', 'chanceCreationCrossingClass',
       'chanceCreationShooting', 'chanceCreationShootingClass',
       'chanceCreationPositioningClass', 'defencePressure',
       'defencePressureClass', 'defenceAggression', 'defenceAggressionClass',
       'defenceTeamWidth', 'defenceTeamWidthClass',
       'defenceDefenderLineClass'],
      dtype='object')

In [231]:
cols_new = [
    'buildUpPlaySpeed', 'buildUpPlaySpeedClass', 'buildUpPlayDribblingClass',
    'buildUpPlayPassing', 'buildUpPlayPassingClass',
    'buildUpPlayPositioningClass', 'chanceCreationPassing',
    'chanceCreationPassingClass', 'chanceCreationCrossing',
    'chanceCreationCrossingClass', 'chanceCreationShooting',
    'chanceCreationShootingClass', 'chanceCreationPositioningClass',
    'defencePressure', 'defencePressureClass', 'defenceAggression',
    'defenceAggressionClass', 'defenceTeamWidth', 'defenceTeamWidthClass',
    'defenceDefenderLineClass'
]

In [234]:
cols_final = [f'{k}_{c}' for c in cols_new for k in ['home', 'away']]

In [246]:
team_stats_df = pd.DataFrame()
for _, match in tmp.iterrows():
    date = match.date
    match_id = match.match_api_id
    home = match.home_team_api_id
    away = match.away_team_api_id
    sub_data = team_stats_data[team_stats_data.date <= date]
    home_stats = sub_data[sub_data['team_api_id'] == home].groupby(
        'team_api_id').first().reset_index()
    away_stats = sub_data[sub_data['team_api_id'] == away].groupby(
        'team_api_id').first().reset_index()

    team_stats_df = pd.concat(
        (team_stats_df, pd.concat((home_stats, away_stats), axis=1).drop(
            [
                'id', 'team_fifa_api_id', 'team_api_id', 'date',
                'buildUpPlayDribbling'
            ],
            axis=1)),
        axis=0)

In [248]:
team_stats_df.shape

(6560, 40)

In [249]:
team_stats_df.columns = cols_final

In [251]:
team_stats_df.shape

(6560, 40)

In [267]:
team_stats_data.sort_values('date').head(1)

Unnamed: 0,id,team_fifa_api_id,team_api_id,date,buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribbling,buildUpPlayDribblingClass,buildUpPlayPassing,buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,chanceCreationCrossing,chanceCreationCrossingClass,chanceCreationShooting,chanceCreationShootingClass,chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
0,1,434,9930,2010-02-22,60,Balanced,,Little,50,Mixed,Organised,60,Normal,65,Normal,55,Normal,Organised,50,Medium,55,Press,45,Normal,Cover


In [268]:
dt = pd.to_datetime('2010-02-22')

In [269]:
tmp1 = match_data_final.loc[match_data_final.date > dt]

In [270]:
tmp1.shape

(6560, 929)

In [272]:
match_data_final.to_csv(
    'data/deep_odds/match_data_final_without_team_stats.csv', index=False)

In [275]:
tmp1 = tmp1.reset_index(drop=True)

In [276]:
team_stats_df = team_stats_df.reset_index(drop=True)

In [278]:
team_stats_df.head()

Unnamed: 0,home_buildUpPlaySpeed,away_buildUpPlaySpeed,home_buildUpPlaySpeedClass,away_buildUpPlaySpeedClass,home_buildUpPlayDribblingClass,away_buildUpPlayDribblingClass,home_buildUpPlayPassing,away_buildUpPlayPassing,home_buildUpPlayPassingClass,away_buildUpPlayPassingClass,home_buildUpPlayPositioningClass,away_buildUpPlayPositioningClass,home_chanceCreationPassing,away_chanceCreationPassing,home_chanceCreationPassingClass,away_chanceCreationPassingClass,home_chanceCreationCrossing,away_chanceCreationCrossing,home_chanceCreationCrossingClass,away_chanceCreationCrossingClass,home_chanceCreationShooting,away_chanceCreationShooting,home_chanceCreationShootingClass,away_chanceCreationShootingClass,home_chanceCreationPositioningClass,away_chanceCreationPositioningClass,home_defencePressure,away_defencePressure,home_defencePressureClass,away_defencePressureClass,home_defenceAggression,away_defenceAggression,home_defenceAggressionClass,away_defenceAggressionClass,home_defenceTeamWidth,away_defenceTeamWidth,home_defenceTeamWidthClass,away_defenceTeamWidthClass,home_defenceDefenderLineClass,away_defenceDefenderLineClass
0,70.0,Fast,Little,60.0,Mixed,Organised,55.0,Normal,70.0,Lots,70.0,Lots,Organised,45.0,Medium,55.0,Press,45.0,Normal,Cover,60.0,Balanced,Little,65.0,Mixed,Organised,60.0,Normal,70.0,Lots,45.0,Normal,Organised,40.0,Medium,70.0,Double,40.0,Normal,Cover
1,70.0,Fast,Little,70.0,Long,Organised,70.0,Risky,70.0,Lots,50.0,Normal,Organised,35.0,Medium,70.0,Double,35.0,Normal,Cover,70.0,Fast,Little,59.0,Mixed,Organised,65.0,Normal,70.0,Lots,50.0,Normal,Free Form,30.0,Deep,70.0,Double,30.0,Narrow,Cover
2,58.0,Balanced,Little,30.0,Short,Organised,31.0,Safe,70.0,Lots,50.0,Normal,Organised,30.0,Deep,70.0,Double,30.0,Narrow,Cover,70.0,Fast,Little,70.0,Long,Organised,70.0,Risky,70.0,Lots,70.0,Lots,Organised,70.0,High,70.0,Double,70.0,Wide,Cover
3,70.0,Fast,Little,65.0,Mixed,Organised,70.0,Risky,70.0,Lots,70.0,Lots,Organised,65.0,Medium,70.0,Double,70.0,Wide,Cover,65.0,Balanced,Little,70.0,Long,Organised,70.0,Risky,70.0,Lots,55.0,Normal,Organised,35.0,Medium,70.0,Double,35.0,Normal,Cover
4,60.0,Balanced,Little,70.0,Long,Organised,70.0,Risky,70.0,Lots,55.0,Normal,Organised,35.0,Medium,70.0,Double,35.0,Normal,Cover,55.0,Balanced,Little,70.0,Long,Organised,70.0,Risky,70.0,Lots,45.0,Normal,Organised,35.0,Medium,70.0,Double,35.0,Normal,Cover


In [281]:
team_stats_df['match_api_id'] = tmp1.match_api_id

In [283]:
cols_float = [
    c for c in team_stats_df.columns if is_float_dtype(team_stats_df[c])
]

In [294]:
team_stats_df = team_stats_df.fillna(-1)  #null values replaced by -1

In [296]:
data_final = pd.merge(
    match_data_final, team_stats_df, how='left', on='match_api_id', copy=False)

In [301]:
data_final.info(memory_usage=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8045 entries, 0 to 8044
Columns: 969 entries, id to away_defenceDefenderLineClass
dtypes: category(75), datetime64[ns](1), float64(16), int8(159), object(24), uint16(6), uint32(23), uint8(665)
memory usage: 12.3+ MB


In [308]:
data_final.to_pickle('data/deep_odds/data-final')

In [313]:
data_final.shape

(8045, 969)

In [314]:
data_final.to_csv('data/deep_odds/data-final.csv', index=False)

In [318]:
tmp2 = pd.read_pickle('data/deep_odds/data-final')

In [317]:
!ls data/deep_odds/

bookkeeper-data                         features_with_na
data-final                              match_data
data-final.csv                          match_data_final_without_team_stats.csv
data-final.zip                          match_data_new
[31mdatabase.sqlite[m[m                         player_data
