In [1]:
from requests import get
from lxml import etree
import io
import pandas as pd
from pymongo import MongoClient
from bson import ObjectId
from bson.json_util import dumps


In [2]:
# Betclic listing pages, one per league.
url_serieA = "https://www.betclic.fr/football/italie-serie-a-e6"
url_liga = "https://www.betclic.fr/football/espagne-liga-primera-e7"
url_premierLeague = "https://www.betclic.fr/football/angl-premier-league-e3"
url_ligue1 = "https://www.betclic.fr/football/ligue-1-conforama-e4"
url_bundesliga = "https://www.betclic.fr/football/allemagne-bundesliga-e5"

# NOTE(review): "encoding" is not a standard HTTP request header; it is sent
# unchanged here so the requests look the same as before.
headers = {"encoding": "utf-8"}

# Upper bound (seconds) on each request so a stalled server cannot hang the notebook.
REQUEST_TIMEOUT = 30


def _download(url):
    """Return the raw body (bytes) of ``url``.

    Raises ``requests.HTTPError`` on a 4xx/5xx answer instead of handing an
    error page to the parser, and a ``requests`` timeout exception when the
    server does not answer within ``REQUEST_TIMEOUT`` seconds.
    """
    response = get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response.content


page_serieA = _download(url_serieA)
page_liga = _download(url_liga)
page_premierLeague = _download(url_premierLeague)
page_ligue1 = _download(url_ligue1)
page_bundesliga = _download(url_bundesliga)

In [3]:
def rreplace(s, old, new, occurrence):
    """Replace the last ``occurrence`` occurrences of ``old`` in ``s`` by ``new``.

    ``s`` is split on ``old`` starting from the right, at most ``occurrence``
    times, and the resulting pieces are joined back together with ``new``.
    """
    return new.join(s.rsplit(old, occurrence))

In [4]:
def clean_body(raw_page):
    """Cut the match listing out of a downloaded page and wrap it in ``<root>``.

    ``raw_page`` (bytes) is decoded with the default codec and sliced from the
    first occurrence of the listing's opening ``<div`` up to the last
    occurrence of the ``refreshTime`` script line. Ampersands and anchor tags
    are then removed, a few trailing closers are dropped with ``rreplace``,
    and the remainder is returned as a string wrapped in a ``<root>`` element.

    NOTE(review): a missing marker is not detected -- ``find``/``rfind``
    return -1 in that case and the slice is taken anyway.
    """
    listing_start = '<div class="entry day-entry grid-9 nm"'
    listing_end = 'var refreshTime = 20000;'

    html = raw_page.decode()
    fragment = html[html.find(listing_start):html.rfind(listing_end)]

    # Plain removals, applied in this exact order ("<a" before "</a>").
    for token in ("&", "<a", "</a>"):
        fragment = fragment.replace(token, "")

    # Trailing markup to drop, counted from the end of the fragment.
    trailing_tokens = (
        ("</div>", 2),
        ('<script type="text/javascript" language="javascript">', 1),
        ('</section>', 1),
    )
    for token, count in trailing_tokens:
        fragment = rreplace(fragment, token, "", count)

    return "<root>" + fragment + "</root>"

In [5]:
import numpy as np

def crawl_info(cleaned_body, league):
    """Parse a cleaned listing into a DataFrame with one row per match.

    Parameters
    ----------
    cleaned_body : str
        XML text such as the string returned by ``clean_body``.
    league : str
        Label copied verbatim into the ``league`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``match, teamA, teamH, date, hour, odd_H, odd_D, odd_A,
        pred_H, pred_D, pred_A, RI_H, RI_D, RI_A, league``. The three odds are
        floats parsed from comma-decimal text; the prediction and RI columns
        are filled with the placeholder ``-1.0``.

    Notes
    -----
    The parser is event driven: the most recent ``<time>``, ``hour`` and
    ``match-name`` elements seen before a ``match-odds`` block supply that
    row's date, hour and match name. A ``match-odds`` block that does not
    carry exactly three odds texts is skipped, so it can never inherit the
    odds of the previous match.
    """
    columns = ["match", "teamA", "teamH", "date", "hour", "odd_H", "odd_D", "odd_A",
               "pred_H", "pred_D", "pred_A", "RI_H", "RI_D", "RI_A", "league"]
    placeholder = -1.0  # "not computed yet" marker for predictions and RI values

    records = []
    match = ""
    date = ""
    hour = ""

    stream = io.BytesIO(cleaned_body.encode())

    for _event, element in etree.iterparse(stream):
        if element.tag == "time":
            date = element.get("datetime")
            continue
        if element.tag != "div":
            continue

        css_class = element.get("class")
        if css_class == "hour":
            hour = element.text
        elif css_class == "match-name":
            # The name sits on the third line of the element's text.
            match = element.text.splitlines()[2].lstrip()
        elif css_class == "match-odds":
            # Fresh list per match: the first grandchild of each child holds one odd.
            odds_text = [child[0].text for child in element]
            if len(odds_text) != 3 or any(text is None for text in odds_text):
                # Incomplete odds cannot fill the home/draw/away columns.
                continue

            teams = match.split(" - ")
            teamH = teams[0]
            teamA = teams[1]
            odd_H, odd_D, odd_A = (float(text.replace(',', '.')) for text in odds_text)

            records.append([match, teamA, teamH, date, hour,
                            odd_H, odd_D, odd_A,
                            placeholder, placeholder, placeholder,
                            placeholder, placeholder, placeholder,
                            league])

    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(records, columns=columns)

In [6]:
def get_odds_by_league(page, league):
    """Turn one downloaded league page into a DataFrame of matches.

    The page is first reduced to its listing by ``clean_body`` and the result
    is parsed by ``crawl_info``, which tags every row with ``league``.
    """
    cleaned = clean_body(page)
    return crawl_info(cleaned, league)

In [7]:
def give_id(odds):
    """Add an ``_id`` column built as ``date/hour/match`` and return ``odds``.

    The column is written onto the frame that was passed in; the returned
    object is that same frame.
    """
    separator = "/"
    odds["_id"] = odds["date"] + separator + odds["hour"] + separator + odds["match"]
    return odds

In [8]:
def fetch_odds():
    """Parse the five downloaded league pages into one DataFrame.

    The per-league frames are concatenated in the order ligue1, bundesliga,
    liga, serieA, premierLeague with a fresh integer index, sorted by the
    ``date`` column, and given an ``_id`` column by ``give_id``.

    ``pd.concat`` is used because ``DataFrame.append`` no longer exists in
    pandas 2.x.
    """
    league_pages = [
        (page_ligue1, "ligue1"),
        (page_bundesliga, "bundesliga"),
        (page_liga, "liga"),
        (page_serieA, "serieA"),
        (page_premierLeague, "premierLeague"),
    ]
    frames = [get_odds_by_league(page, league) for page, league in league_pages]

    odds = pd.concat(frames, ignore_index=True)
    odds = odds.sort_values(by='date')
    return give_id(odds)

# PERSISTENCE LAYER
==================================

In [9]:
# Address of the MongoDB server that stores the matches.
MONGO_URI = "mongodb://plp_mongodb:27017"
client = MongoClient(MONGO_URI)

# Collection "apibase" of database "plp"; the persistence helpers read and write it.
coll = client["plp"]["apibase"]

In [10]:
def update_db(data):
    """Write every row of ``data`` to the collection, keyed by its ``_id``.

    Each row replaces the stored document with the same ``_id``; when no such
    document exists the row is inserted (upsert).
    """
    for _, record in data.iterrows():
        document = record.to_dict()
        coll.replace_one({'_id': record['_id']}, document, upsert=True)

In [11]:
def fetch_db_matches():
    """Load every document of the collection into a DataFrame.

    The cursor is serialised with ``bson.json_util.dumps`` and read back with
    ``pd.read_json``. The JSON text is wrapped in ``io.StringIO`` because
    passing a literal JSON string to ``read_json`` is deprecated in recent
    pandas releases; a file-like object is accepted by old and new versions.
    """
    documents_json = dumps(coll.find())
    return pd.read_json(io.StringIO(documents_json))

In [12]:
def remove_match_db(id):
    """Delete the stored document whose ``_id`` equals ``id``."""
    coll.delete_one({"_id": id})

In [13]:
def remove_old_matches(old, new):
    """Delete from the database every match of ``old`` that is absent from ``new``.

    Parameters
    ----------
    old : pandas.DataFrame
        Matches currently stored; must have an ``_id`` column.
    new : pandas.DataFrame
        Freshly scraped matches; must have an ``_id`` column.

    The previous implementation masked the merged frame with
    ``match_new == match_old`` and then deleted every remaining ``_id``,
    i.e. it removed the matches that were still listed and left the vanished
    ones (whose ``_id`` had been masked to NaN) in place. The selection is now
    the set of stored ids that no longer appear in ``new``.
    """
    stale_ids = old.loc[~old["_id"].isin(new["_id"]), "_id"]
    for stale_id in stale_ids:
        remove_match_db(stale_id)

# PREDICTION FUNCTIONS
==================================

In [14]:
def predict_match(match):
    """Fill ``pred_H``, ``pred_D`` and ``pred_A`` of ``match`` in place.

    The three values come from ``predict_teams`` called with the match's
    ``teamH`` and ``teamA``. Nothing is returned.
    """
    # TO DO: replace with model.predict(teamH, teamA)
    home, draw, away = predict_teams(match.teamH, match.teamA)
    match.pred_H = home
    match.pred_D = draw
    match.pred_A = away

In [15]:
def predict_teams(teamH, teamA):
    """Return a (home, draw, away) tuple of predictions for the two teams.

    Placeholder (TO DO): the team names are ignored and each value is an
    independent draw from ``np.random.rand()``.
    """
    return tuple(np.random.rand() for _ in range(3))

In [16]:
def calculate_RI(match):
    """Fill ``RI_H``, ``RI_D`` and ``RI_A`` of ``match`` in place.

    For each outcome (home, draw, away, in that order) the value is
    ``RI_formula`` applied to the outcome's odd and prediction.
    """
    for outcome in ("H", "D", "A"):
        odd = getattr(match, "odd_" + outcome)
        prediction = getattr(match, "pred_" + outcome)
        setattr(match, "RI_" + outcome, RI_formula(odd, prediction))

In [17]:
def RI_formula(odd, prediction):
    """Return ``(1 - 1/odd) / (1 - prediction)``.

    Placeholder formula (TO DO). Division by zero is not guarded: ``odd == 0``
    or ``prediction == 1`` fails or yields inf/nan depending on the input type.
    """
    inverse_odd = 1 / odd
    numerator = 1 - inverse_odd
    denominator = 1 - prediction
    return numerator / denominator

In [18]:
# Smoke test of the prediction helpers on a single scraped match (4th row).
# The slice is copied so that writing the row back does not target a view of
# the temporary frame returned by fetch_odds().
test = fetch_odds().iloc[3:4].copy()
for index, match_row in test.iterrows():
    # Both helpers mutate the row in place; it is then stored back into `test`.
    predict_match(match_row)
    calculate_RI(match_row)
    test.loc[index] = match_row

# MAIN 
==================================

In [19]:
def main_odds():
    """Synchronise the database with the currently scraped odds.

    Steps:

    1. Load the stored matches (``fetch_db_matches``) and the scraped ones
       (``fetch_odds``) and outer-merge them on ``_id``.
    2. Matches only present in the scrape get predictions and RI values and
       are inserted.
    3. Matches only present in the database are deleted.
    4. Matches present on both sides keep their stored predictions, get their
       RI values recomputed from the fresh odds, and are written back.

    A short summary is printed at the end. Nothing is returned.

    Stored predictions are matched to scraped rows by ``_id``. The scraped
    frame is sorted by date while the stored frame comes in database order,
    so copying the values positionally could attach a prediction to the
    wrong match.
    """
    old = fetch_db_matches()
    new = fetch_odds()

    if len(old) == 0:
        # An empty collection yields a frame without columns; provide the ones used below.
        old = pd.DataFrame(columns=["_id", "match", "pred_H", "pred_D", "pred_A"])

    merge = new.merge(old, how='outer', on='_id', suffixes=('_new', '_old'))

    data_to_add = merge[merge["match_old"].isna()]
    data_to_delete = merge[merge["match_new"].isna()]
    data_to_update = merge[merge["match_new"] == merge["match_old"]]

    # Filter and add new data (copy: the rows are written back below)
    add_filtered = new[new["_id"].isin(data_to_add["_id"])].copy()
    for index, match_row in add_filtered.iterrows():
        predict_match(match_row)
        calculate_RI(match_row)
        add_filtered.loc[index] = match_row

    update_db(add_filtered)

    # Filter and delete old data
    delete_filtered = old[old["_id"].isin(data_to_delete["_id"])]
    for _id in delete_filtered["_id"].values:
        remove_match_db(_id)

    # Filter existing data, and update it
    update_filtered = new[new["_id"].isin(data_to_update["_id"])].copy()
    old_filtered = old[old["_id"].isin(data_to_update["_id"])]

    # Look the stored predictions up by _id rather than by row position.
    stored_predictions = old_filtered.set_index("_id")
    for column in ("pred_H", "pred_D", "pred_A"):
        update_filtered[column] = update_filtered["_id"].map(stored_predictions[column])

    for index, match_row in update_filtered.iterrows():
        calculate_RI(match_row)
        update_filtered.loc[index] = match_row

    update_db(update_filtered)

    print("Done.")
    print("New : "+str(len(add_filtered)))
    print("Updated : "+str(len(update_filtered)))
    print("Removed : "+str(len(delete_filtered)))
    print("Total matches in base : "+str(coll.estimated_document_count()))

In [20]:
# One-off manual cleanup: deletes this single stored match by its "_id".
# NOTE(review): runs on every "Run All"; drop the cell once it is no longer needed.
remove_match_db("2020-1-18/21:00/Eibar - Atletico Madrid")

In [22]:
### MAIN
# Run one synchronisation pass; main_odds() prints a summary of what changed.

main_odds()

Done.
New : 0
Updated : 107
Removed : 0
Total matches in base : 107
