In [1]:
# In case iPython does not find our personalized modules and we want to import them manually
# import sys
# sys.path.append('my/path/to/module/folder')
# import module_of_interest

# We can also make sure what's the main directory iPhython consider for running
# import os
# os.getcwd()

import constants
import requests
import pandas as pd
import numpy as np
import psycopg2
import csv
import datetime

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)


In [2]:
website_frame = pd.read_csv(constants.CSV_SOURCE_URL)

In [3]:
website_frame.shape

(380, 106)

In [4]:
# Remove unwanted columns

unwanted_cols = ["Div", "BWH", "BWD", "BWA", "IWH", "IWD", "IWA", "PSH", "PSD", "PSA", "WHH", "WHD", "WHA", "VCH", "VCD", "VCA",
                 "P>2.5", "P<2.5","AHh", "B365AHH", "B365AHA", "PAHH", "PAHA", "MaxAHH", "MaxAHA", "AvgAHH", "AvgAHA", "B365CH",
                 "B365CD", "B365CA", "BWCH", "BWCD", "BWCA", "IWCH", "IWCD", "IWCA", "PSCH", "PSCD", "PSCA", "WHCH", "WHCD", "WHCA",
                 "VCCH", "VCCD", "VCCA", "MaxCH", "MaxCD", "MaxCA", "AvgCH", "AvgCD", "AvgCA", "B365C>2.5", "B365C<2.5", "PC>2.5",
                 "PC<2.5", "MaxC>2.5", "MaxC<2.5", "AvgC>2.5", "AvgC<2.5", "AHCh", "B365CAHH", "B365CAHA", "PCAHH", "PCAHA", "MaxCAHH",
                 "MaxCAHA", "AvgCAHH", "AvgCAHA"]

website_frame.drop(columns = unwanted_cols, inplace = True)

In [5]:
# Rename columns
website_frame.rename(columns = {"FTHG": "FullTimeHomeTeamGoals",
                               "FTAG": "FullTimeAwayTeamGoals",
                               "FTR": "FullTimeResult",
                               "HTHG": "HalfTimeHomeTeamGoals",
                               "HTAG": "HalfTimeAwayTeamGoals",
                               "HTR": "HalfTimeResult",
                               "HS": "HomeTeamShots",
                               "AS": "AwayTeamShots",
                               "HST": "HomeTeamShotsOnTarget",
                               "AST": "AwayTeamShotsOnTarget",
                               "HF": "HomeTeamFouls",
                               "AF": "AwayTeamFouls",
                               "HC": "HomeTeamCorners",
                               "AC": "AwayTeamCorners",
                               "HY": "HomeTeamYellowCards",
                               "AY": "AwayTeamYellowCards",
                               "HR": "HomeTeamRedCards",
                               "AR": "AwayTeamRedCards",
                               "B365H": "B365HomeTeam",
                               "B365D": "B365Draw",
                               "B365A": "B365AwayTeam",
                               "MaxH": "MarketMaxHomeTeam",
                               "MaxD": "MarketMaxDraw",
                               "MaxA": "MarketMaxAwayTeam",
                               "AvgH": "MarketAvgHomeTeam",
                               "AvgD": "MarketAvgDraw",
                               "AvgA": "MarketAvgAwayTeam",
                               "B365>2.5": "B365Over2.5Goals",
                               "B365<2.5": "B365Under2.5Goals",
                               "Max>2.5": "MarketMaxOver2.5Goals",
                               "Max<2.5": "MarketMaxUnder2.5Goals",
                               "Avg>2.5": "MarketAvgOver2.5Goals",
                               "Avg<2.5": "MarketAvgUnder2.5Goals"},
                   inplace = True)


In [6]:
# Add MatchID column

website_frame.insert(0, "MatchID", constants.CURRENT_SEASON_TAG + "_" + website_frame["HomeTeam"] + "_" + website_frame["AwayTeam"])

In [7]:
# Add season column

website_frame.insert(1, "Season", constants.CURRENT_SEASON_TAG)

In [8]:
# Add MatchWeek column

website_frame.insert(2, "MatchWeek", constants.DEFAULT_MATCHWEEK)

In [9]:
website_frame.head()

Unnamed: 0,MatchID,Season,MatchWeek,Date,Time,HomeTeam,AwayTeam,FullTimeHomeTeamGoals,FullTimeAwayTeamGoals,FullTimeResult,HalfTimeHomeTeamGoals,HalfTimeAwayTeamGoals,HalfTimeResult,Referee,HomeTeamShots,AwayTeamShots,HomeTeamShotsOnTarget,AwayTeamShotsOnTarget,HomeTeamFouls,AwayTeamFouls,HomeTeamCorners,AwayTeamCorners,HomeTeamYellowCards,AwayTeamYellowCards,HomeTeamRedCards,AwayTeamRedCards,B365HomeTeam,B365Draw,B365AwayTeam,MarketMaxHomeTeam,MarketMaxDraw,MarketMaxAwayTeam,MarketAvgHomeTeam,MarketAvgDraw,MarketAvgAwayTeam,B365Over2.5Goals,B365Under2.5Goals,MarketMaxOver2.5Goals,MarketMaxUnder2.5Goals,MarketAvgOver2.5Goals,MarketAvgUnder2.5Goals
0,2023-2024_Burnley_Man City,2023-2024,1,11/08/2023,20:00,Burnley,Man City,0,3,A,0,2,A,C Pawson,6,17,1,8,11,8,6,5,0,0,1,0,8.0,5.5,1.33,9.5,5.68,1.39,9.02,5.35,1.35,1.67,2.2,1.71,2.4,1.65,2.27
1,2023-2024_Arsenal_Nott'm Forest,2023-2024,1,12/08/2023,12:30,Arsenal,Nott'm Forest,2,1,H,2,0,H,M Oliver,15,6,7,2,12,12,8,3,2,2,0,0,1.18,7.0,15.0,1.21,8.5,17.5,1.18,7.64,15.67,1.44,2.75,1.45,2.98,1.42,2.85
2,2023-2024_Bournemouth_West Ham,2023-2024,1,12/08/2023,15:00,Bournemouth,West Ham,1,1,D,0,0,D,P Bankes,14,16,5,3,9,14,10,4,1,4,0,0,2.7,3.4,2.55,2.8,3.62,2.75,2.69,3.44,2.64,1.9,2.0,1.95,2.03,1.88,1.94
3,2023-2024_Brighton_Luton,2023-2024,1,12/08/2023,15:00,Brighton,Luton,4,1,H,1,0,H,D Coote,27,9,12,3,11,12,6,7,2,2,0,0,1.33,5.5,9.0,1.36,6.0,10.5,1.33,5.52,9.61,1.62,2.3,1.65,2.45,1.61,2.34
4,2023-2024_Everton_Fulham,2023-2024,1,12/08/2023,15:00,Everton,Fulham,0,1,A,0,0,D,S Attwell,19,9,9,2,12,6,10,4,0,2,0,0,2.2,3.4,3.3,2.3,3.57,3.45,2.24,3.43,3.3,2.01,1.89,2.04,1.92,1.97,1.86


In [10]:
# Add Points columns

conditions = [
     website_frame["FullTimeResult"] == 'H',
     website_frame["FullTimeResult"] == 'D',
     website_frame["FullTimeResult"] == 'A'
]

home_points = [ 3, 1, 0]
away_points = [ 0, 1, 3]

website_frame["HomeTeamPoints"] = np.select(conditions, home_points)
website_frame["AwayTeamPoints"] = np.select(conditions, away_points)

In [11]:
website_frame.shape

(380, 43)

In [12]:
# Stablish a connection to Database data source and fetch all matches stored from current season

try:
    connection = psycopg2.connect(
        host = constants.DB_SERVER,
        port = constants.DB_PORT,
        user = constants.DB_USER,
        password = constants.DB_PASSWORD,
        database = constants.DB_NAME
    )
except psycopg2.Error as e:
    print (f'Can not connect to the postgress database "{constants.DB_NAME}". Make sure database server is running')
    print (e)
else:
    print (f'Connection to database "{constants.DB_NAME}" stablished. Listening at port {constants.DB_PORT}')

season_query = f"SELECT * FROM public.match_history WHERE \"Season\" = '{constants.CURRENT_SEASON_TAG}'"

cursor = connection.cursor()
cursor.execute(season_query)
matches_in_db = cursor.fetchall()

Connection to database "premier_league" stablished. Listening at port 5432


In [13]:
# Copy cursor into a dataframe
postgres_frame = pd.DataFrame(data = matches_in_db, columns = website_frame.columns)

In [21]:
# !! Getting error --> ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects
# This does not work as our dataframes have identical lables, but different indexes as website_frame is always likely to have
# more entries that what we have persisted in DataBase
new_entries = website_frame.compare(postgres_frame)

In [22]:
new_entries

Unnamed: 0_level_0,MatchWeek,MatchWeek,Date,Date,Time,Time,HomeTeamFouls,HomeTeamFouls,AwayTeamFouls,AwayTeamFouls,HomeTeamCorners,HomeTeamCorners,AwayTeamCorners,AwayTeamCorners,MarketMaxHomeTeam,MarketMaxHomeTeam,MarketMaxDraw,MarketMaxDraw,MarketMaxAwayTeam,MarketMaxAwayTeam,MarketAvgHomeTeam,MarketAvgHomeTeam,MarketAvgDraw,MarketAvgDraw,MarketAvgAwayTeam,MarketAvgAwayTeam,B365Over2.5Goals,B365Over2.5Goals,B365Under2.5Goals,B365Under2.5Goals
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other,self,other
0,,,11/08/2023,2023-08-11,20:00,08:00:00 p. m.,11.0,6.0,8.0,5.0,6.0,11.0,5.0,8.0,9.50,1.67,5.68,2.20,1.39,9.50,9.02,5.68,5.35,1.39,1.35,9.02,1.67,5.35,2.20,1.35
1,,,12/08/2023,2023-08-12,12:30,12:30:00 p. m.,12.0,8.0,12.0,3.0,8.0,12.0,3.0,12.0,1.21,1.44,8.50,2.75,17.50,1.21,1.18,8.50,7.64,17.50,15.67,1.18,1.44,7.64,2.75,15.67
2,,,12/08/2023,2023-08-12,15:00,03:00:00 p. m.,9.0,10.0,14.0,4.0,10.0,9.0,4.0,14.0,2.80,1.90,3.62,2.00,2.75,2.80,2.69,3.62,3.44,2.75,2.64,2.69,1.90,3.44,2.00,2.64
3,,,12/08/2023,2023-08-12,15:00,03:00:00 p. m.,11.0,6.0,12.0,7.0,6.0,11.0,7.0,12.0,1.36,1.62,6.00,2.30,10.50,1.36,1.33,6.00,5.52,10.50,9.61,1.33,1.62,5.52,2.30,9.61
4,,,12/08/2023,2023-08-12,15:00,03:00:00 p. m.,12.0,10.0,6.0,4.0,10.0,12.0,4.0,6.0,2.30,2.01,3.57,1.89,3.45,2.30,2.24,3.57,3.43,3.45,3.30,2.24,2.01,3.43,1.89,3.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,1.0,38.0,19/05/2024,2024-05-19,16:00,04:00:00 p. m.,10.0,2.0,8.0,4.0,2.0,10.0,4.0,8.0,1.93,1.40,4.50,3.00,3.84,1.93,1.87,4.50,4.28,3.84,3.67,1.87,1.40,4.28,3.00,3.67
376,1.0,38.0,19/05/2024,2024-05-19,16:00,04:00:00 p. m.,14.0,10.0,11.0,2.0,10.0,14.0,2.0,11.0,1.18,1.22,10.00,4.33,17.00,1.18,1.16,10.00,8.71,17.00,14.55,1.16,1.22,8.71,4.33,14.55
377,1.0,38.0,19/05/2024,2024-05-19,16:00,04:00:00 p. m.,15.0,4.0,20.0,4.0,4.0,15.0,4.0,20.0,3.00,1.44,4.35,2.75,2.30,3.00,2.92,4.35,3.95,2.30,2.23,2.92,1.44,3.95,2.75,2.23
378,1.0,38.0,19/05/2024,2024-05-19,16:00,04:00:00 p. m.,3.0,11.0,12.0,2.0,11.0,3.0,2.0,12.0,1.11,1.20,14.00,4.50,25.00,1.11,1.10,14.00,11.72,25.00,21.81,1.10,1.20,11.72,4.50,21.81


In [15]:
# Copy depurated matchweeks from

website_frame.index


RangeIndex(start=0, stop=380, step=1)

In [16]:
postgres_frame.index

RangeIndex(start=0, stop=380, step=1)

In [17]:
# Now let's get only new match entries by contrasting what is comming from website datasource vs what DataBase has


In [18]:
type(matches_in_db)

list

In [19]:
website_frame.head()

Unnamed: 0,MatchID,Season,MatchWeek,Date,Time,HomeTeam,AwayTeam,FullTimeHomeTeamGoals,FullTimeAwayTeamGoals,FullTimeResult,HalfTimeHomeTeamGoals,HalfTimeAwayTeamGoals,HalfTimeResult,Referee,HomeTeamShots,AwayTeamShots,HomeTeamShotsOnTarget,AwayTeamShotsOnTarget,HomeTeamFouls,AwayTeamFouls,HomeTeamCorners,AwayTeamCorners,HomeTeamYellowCards,AwayTeamYellowCards,HomeTeamRedCards,AwayTeamRedCards,B365HomeTeam,B365Draw,B365AwayTeam,MarketMaxHomeTeam,MarketMaxDraw,MarketMaxAwayTeam,MarketAvgHomeTeam,MarketAvgDraw,MarketAvgAwayTeam,B365Over2.5Goals,B365Under2.5Goals,MarketMaxOver2.5Goals,MarketMaxUnder2.5Goals,MarketAvgOver2.5Goals,MarketAvgUnder2.5Goals,HomeTeamPoints,AwayTeamPoints
0,2023-2024_Burnley_Man City,2023-2024,1,11/08/2023,20:00,Burnley,Man City,0,3,A,0,2,A,C Pawson,6,17,1,8,11,8,6,5,0,0,1,0,8.0,5.5,1.33,9.5,5.68,1.39,9.02,5.35,1.35,1.67,2.2,1.71,2.4,1.65,2.27,0,3
1,2023-2024_Arsenal_Nott'm Forest,2023-2024,1,12/08/2023,12:30,Arsenal,Nott'm Forest,2,1,H,2,0,H,M Oliver,15,6,7,2,12,12,8,3,2,2,0,0,1.18,7.0,15.0,1.21,8.5,17.5,1.18,7.64,15.67,1.44,2.75,1.45,2.98,1.42,2.85,3,0
2,2023-2024_Bournemouth_West Ham,2023-2024,1,12/08/2023,15:00,Bournemouth,West Ham,1,1,D,0,0,D,P Bankes,14,16,5,3,9,14,10,4,1,4,0,0,2.7,3.4,2.55,2.8,3.62,2.75,2.69,3.44,2.64,1.9,2.0,1.95,2.03,1.88,1.94,1,1
3,2023-2024_Brighton_Luton,2023-2024,1,12/08/2023,15:00,Brighton,Luton,4,1,H,1,0,H,D Coote,27,9,12,3,11,12,6,7,2,2,0,0,1.33,5.5,9.0,1.36,6.0,10.5,1.33,5.52,9.61,1.62,2.3,1.65,2.45,1.61,2.34,3,0
4,2023-2024_Everton_Fulham,2023-2024,1,12/08/2023,15:00,Everton,Fulham,0,1,A,0,0,D,S Attwell,19,9,9,2,12,6,10,4,0,2,0,0,2.2,3.4,3.3,2.3,3.57,3.45,2.24,3.43,3.3,2.01,1.89,2.04,1.92,1.97,1.86,0,3
