In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import seaborn

In [2]:
data = pd.read_csv('E0.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Columns: 106 entries, Div to AvgCAHA
dtypes: float64(82), int64(16), object(8)
memory usage: 82.9+ KB


In [4]:
data.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,09/08/2019,20:00,Liverpool,Norwich,4,1,H,4,0,...,3.43,-2.25,1.91,1.99,1.94,1.98,1.99,2.07,1.9,1.99
1,E0,10/08/2019,12:30,West Ham,Man City,0,5,A,0,1,...,2.91,1.75,1.95,1.95,1.96,1.97,2.07,1.98,1.97,1.92
2,E0,10/08/2019,15:00,Bournemouth,Sheffield United,1,1,D,0,0,...,1.92,-0.5,1.95,1.95,1.98,1.95,2.0,1.96,1.96,1.92
3,E0,10/08/2019,15:00,Burnley,Southampton,3,0,H,0,0,...,1.71,0.0,1.87,2.03,1.89,2.03,1.9,2.07,1.86,2.02
4,E0,10/08/2019,15:00,Crystal Palace,Everton,0,0,D,0,0,...,1.71,0.25,1.82,2.08,1.97,1.96,2.03,2.08,1.96,1.93


FTR - Final Time Result
H = Home D = Draw A = Away
#HTGD - Home team goal difference
#ATGD - away team goal difference
#HTP - Home team points
#ATP - Away team points
#DiffFormPts Diff in points
#DiffLP - Differnece in last years prediction

In [5]:
df = pd.DataFrame(columns = ['HomeTeam','AwayTeam','FTHG','FTAG'])
df = pd.DataFrame.rename(data, columns = {'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})
df.columns.values

array(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'HomeGoals',
       'AwayGoals', 'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS',
       'HST', 'AST', 'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR',
       'B365H', 'B365D', 'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD',
       'IWA', 'PSH', 'PSD', 'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD',
       'VCA', 'MaxH', 'MaxD', 'MaxA', 'AvgH', 'AvgD', 'AvgA', 'B365>2.5',
       'B365<2.5', 'P>2.5', 'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5',
       'Avg<2.5', 'AHh', 'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH',
       'MaxAHA', 'AvgAHH', 'AvgAHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH',
       'BWCD', 'BWCA', 'IWCH', 'IWCD', 'IWCA', 'PSCH', 'PSCD', 'PSCA',
       'WHCH', 'WHCD', 'WHCA', 'VCCH', 'VCCD', 'VCCA', 'MaxCH', 'MaxCD',
       'MaxCA', 'AvgCH', 'AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5',
       'PC>2.5', 'PC<2.5', 'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5',
       'AHCh', 'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH',
    

In [18]:
columns_req = ['Date','HomeTeam','AwayTeam','HomeGoals','AwayGoals','FTR']
feature_table = df[columns_req]
feature_table
# feature_table = feature_table.drop(['Div','Date','Time','Referee','HTR'],axis=1)
# home_feature_table = feature_table.groupby('HomeTeam')
# away_feature_table = feature_table.groupby('AwayTeam')
# for home_team_name, group in home_feature_table:
#     print(home_team_name)
# print('=========')
# for away_team_name, group in away_feature_table:
#     print(away_team_name)
def get_goals_scored(feature_table):
    teams = {}
    for i in feature_table.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    for i in range(len(feature_table)):
        HTGS = feature_table.iloc[i]['HomeGoals']
        ATGS = feature_table.iloc[i]['AwayGoals']
        teams[feature_table.iloc[i].HomeTeam].append(HTGS)
        teams[feature_table.iloc[i].AwayTeam].append(ATGS)
    GoalsScored = pd.DataFrame(data=teams, index=list(range(1, 11, 1))).T
#     print(GoalsScored)
    GoalsScored[0] = 0
    for i in range(2,11):
        GoalsScored[i] = GoalsScored[i] + GoalsScored[i-1]
    return GoalsScored

def get_goals_conceded(feature_table):
    teams = {}
    for i in feature_table.groupby('HomeTeam').mean().T.columns:
        teams[i] = []
    # the value corresponding to keys is a list containing the match location.
    for i in range(len(feature_table)):
        ATGC = feature_table.iloc[i]['HomeGoals']
        HTGC = feature_table.iloc[i]['AwayGoals']
        teams[feature_table.iloc[i].HomeTeam].append(HTGC)
        teams[feature_table.iloc[i].AwayTeam].append(ATGC)
    GoalsConceded = pd.DataFrame(data=teams, index=list(range(1, 11, 1))).T
#     print(GoalsConceded)
    GoalsConceded[0] = 0
    for i in range(2,11):
        GoalsConceded[i] = GoalsConceded[i] + GoalsConceded[i-1]
    return GoalsConceded

def get_gss(feature_table):
    GC = get_goals_conceded(feature_table)
    GS = get_goals_scored(feature_table)
    j = 0
    HTGS = []
    ATGS = []
    HTGC = []
    ATGC = []
    for i in range(100):
        ht = feature_table.iloc[i].HomeTeam
        at = feature_table.iloc[i].AwayTeam
        HTGS.append(GS.loc[ht][j])
        ATGS.append(GS.loc[at][j])
        HTGC.append(GC.loc[ht][j])
        ATGC.append(GC.loc[at][j])
        
        if ((i + 1)% 10) == 0:
            j = j + 1
        
    feature_table['HTGS'] = HTGS
    feature_table['ATGS'] = ATGS
    feature_table['HTGC'] = HTGC
    feature_table['ATGC'] = ATGC
#     print(feature_table)
    return feature_table
e0_data = get_gss(feature_table)
print(e0_data)

          Date          HomeTeam          AwayTeam  HomeGoals  AwayGoals FTR  \
0   09/08/2019         Liverpool           Norwich          4          1   H   
1   10/08/2019          West Ham          Man City          0          5   A   
2   10/08/2019       Bournemouth  Sheffield United          1          1   D   
3   10/08/2019           Burnley       Southampton          3          0   H   
4   10/08/2019    Crystal Palace           Everton          0          0   D   
5   10/08/2019           Watford          Brighton          0          3   A   
6   10/08/2019         Tottenham       Aston Villa          3          1   H   
7   11/08/2019         Leicester            Wolves          0          0   D   
8   11/08/2019         Newcastle           Arsenal          0          1   A   
9   11/08/2019        Man United           Chelsea          4          0   H   
10  17/08/2019           Arsenal           Burnley          2          1   H   
11  17/08/2019       Aston Villa       B

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
