In [886]:
from requests import get
from lxml import etree
import io
import pandas as pd
from pymongo import MongoClient
from bson import ObjectId
from bson.json_util import dumps
import os
import tensorflow as tf
import unicodedata

In [887]:
# Betclic landing page of each league that gets scraped.
url_serieA = "https://www.betclic.fr/football/italie-serie-a-e6"
url_liga = "https://www.betclic.fr/football/espagne-liga-primera-e7"
url_premierLeague = "https://www.betclic.fr/football/angl-premier-league-e3"
url_ligue1 = "https://www.betclic.fr/football/ligue-1-conforama-e4"
url_bundesliga = "https://www.betclic.fr/football/allemagne-bundesliga-e5"

# Sent as-is as an HTTP request header with every download below.
headers = {"encoding": "utf-8"}

# Download the raw bytes of every league page, in the same order as before
# (Serie A, Liga, Premier League, Ligue 1, Bundesliga).
page_serieA, page_liga, page_premierLeague, page_ligue1, page_bundesliga = [
    get(league_url, headers=headers).content
    for league_url in (url_serieA, url_liga, url_premierLeague, url_ligue1, url_bundesliga)
]

In [888]:
def rreplace(s, old, new, occurrence):
    """Replace occurrences of ``old`` in ``s`` with ``new``, starting from the right.

    Args:
        s: String to process.
        old: Substring to look for.
        new: Replacement text.
        occurrence: Maximum number of right-most occurrences to replace.

    Returns:
        The string with at most ``occurrence`` trailing occurrences replaced.
    """
    # Splitting from the right leaves the untouched prefix as the first piece.
    pieces = s.rsplit(old, occurrence)
    replaced = new.join(pieces)
    return replaced

In [889]:
def clean_body(raw_page):
    """Cut the odds listing out of a downloaded page and wrap it in ``<root>``.

    Args:
        raw_page: Page content as bytes; decoded with the default codec.

    Returns:
        A string ``"<root>" + fragment + "</root>"`` where ``fragment`` is the
        text between the first start marker and the last end marker, with
        ampersands, anchor tags and a few trailing tags stripped.
    """
    start_marker = '<div class="entry day-entry grid-9 nm"'
    end_marker = 'var refreshTime = 20000;'

    html = raw_page.decode()
    first = html.find(start_marker)
    last = html.rfind(end_marker)
    fragment = html[first:last]

    # Removed everywhere in the fragment: ampersands and anchor open/close tags.
    for token in ("&", "<a", "</a>"):
        fragment = fragment.replace(token, "")

    # Removed from the end only, in this exact order: (text, number of occurrences).
    trailing_markup = (
        ("</div>", 2),
        ('<script type="text/javascript" language="javascript">', 1),
        ('</section>', 1),
    )
    for token, count in trailing_markup:
        fragment = rreplace(fragment, token, "", count)

    return "<root>" + fragment + "</root>"

In [890]:
import numpy as np

def crawl_info(cleaned_body, league):
    """Parse a cleaned odds fragment into a DataFrame with one row per match.

    The fragment is parsed incrementally. The most recent ``<time datetime>``
    value, ``div.hour`` text and ``div.match-name`` text are remembered, and a
    row is emitted each time a ``div.match-odds`` element is completed.

    Args:
        cleaned_body: XML-like string as produced by ``clean_body``.
        league: Label stored in the ``league`` column of every row.

    Returns:
        A DataFrame with the columns match, teamA, teamH, date, hour,
        odd_H, odd_D, odd_A, pred_H, pred_D, pred_A, RI_H, RI_D, RI_A, league.
        The prediction and RI columns are initialised to -1.0. Matches whose
        odds block does not hold exactly three parsable odds are skipped
        instead of silently reusing the odds of the previous match.
    """
    columns = ["match", "teamA", "teamH", "date", "hour", "odd_H", "odd_D", "odd_A",
               "pred_H", "pred_D", "pred_A", "RI_H", "RI_D", "RI_A", "league"]

    source = io.BytesIO(str.encode(cleaned_body))

    match = ""
    date = ""
    hour = ""
    # Rows are collected in a list and turned into a DataFrame once, which
    # avoids re-allocating the frame for every match.
    rows = []

    for _event, element in etree.iterparse(source):
        if element.tag == "time":
            date = element.get("datetime")
            continue
        if element.tag != "div":
            continue

        css_class = element.get("class")
        if css_class == "hour":
            hour = element.text
        elif css_class == "match-name":
            # The match label sits on the third line of the element text.
            match = element.text.splitlines()[2].lstrip()
        elif css_class == "match-odds":
            odds = _parse_match_odds(element)
            if odds is None:
                continue

            teams = match.split(" - ")
            teamH = teams[0]
            teamA = teams[1]
            rows.append([match, teamA, teamH, date, hour,
                         odds[0], odds[1], odds[2],
                         -1.0, -1.0, -1.0,
                         -1.0, -1.0, -1.0,
                         league])

    return pd.DataFrame(rows, columns=columns)


def _parse_match_odds(odds_element):
    """Return the three odds under a ``match-odds`` element as floats, or None."""
    outcomes = list(odds_element)
    if len(outcomes) != 3:
        return None

    parsed = []
    for outcome in outcomes:
        # The odd is the text of the first child of each outcome element.
        if len(outcome) == 0 or outcome[0].text is None:
            return None
        try:
            # Odds use a decimal comma on the page.
            parsed.append(float(outcome[0].text.replace(',', '.')))
        except ValueError:
            return None
    return parsed

In [891]:
def get_odds_by_league(page, league):
    """Clean a raw league page and parse it into an odds DataFrame.

    Args:
        page: Raw page content, passed to ``clean_body``.
        league: League label, passed through to ``crawl_info``.

    Returns:
        Whatever ``crawl_info`` returns for the cleaned page.
    """
    cleaned = clean_body(page)
    return crawl_info(cleaned, league)

In [892]:
def give_id(odds):
    """Add an ``_id`` column built as ``date/hour/match`` and return the frame.

    The frame is modified in place; the same object is returned.
    """
    separator = "/"
    odds["_id"] = odds["date"] + separator + odds["hour"] + separator + odds["match"]
    return odds

In [893]:
def fetch_odds():
    """Parse the downloaded league pages into one DataFrame of odds.

    The per-league frames are concatenated with a fresh index, sorted by the
    ``date`` column and given an ``_id`` column by ``give_id``.

    Returns:
        The combined, date-sorted DataFrame including the ``_id`` column.
    """
    league_pages = [
        (page_ligue1, "ligue1"),
        (page_bundesliga, "bundesliga"),
        (page_liga, "liga"),
        (page_serieA, "serieA"),
        (page_premierLeague, "premierLeague"),
    ]
    frames = [get_odds_by_league(page, league) for page, league in league_pages]

    # pd.concat replaces the chained DataFrame.append calls, which no longer
    # exist in pandas 2.x.
    odds = pd.concat(frames, ignore_index=True)
    odds = odds.sort_values(by='date')
    return give_id(odds)

# PERSISTENCE LAYER
==================================

In [894]:
# Shared MongoDB connection used by the persistence helpers.
client = MongoClient("mongodb://192.168.99.100:27017")

# Collection "apibase" of database "plp".
coll = client.plp.apibase

In [895]:
# Lookup table loaded from team_matching.csv (path relative to the working
# directory); get_team_features() reads its "odds" and "id" columns.
matching = pd.read_csv("team_matching.csv")

In [896]:
def update_db(data):
    """Write every row of ``data`` to the collection, keyed on its ``_id``.

    A document with the same ``_id`` is replaced; otherwise the row is
    inserted (upsert).
    """
    for _, record in data.iterrows():
        selector = {'_id': record['_id']}
        document = record.to_dict()
        coll.replace_one(selector, document, upsert=True)

In [897]:
def fetch_db_matches():
    """Load every document of the collection into a DataFrame.

    The cursor is serialised to JSON with ``bson.json_util.dumps`` and parsed
    back by pandas.

    Returns:
        A DataFrame built from all documents returned by ``coll.find()``.
    """
    serialized = dumps(coll.find())
    # Recent pandas versions deprecate passing a literal JSON string to
    # read_json; a file-like wrapper works on old and new versions alike.
    return pd.read_json(io.StringIO(serialized))

In [898]:
def remove_match_db(id):
    """Delete the document whose ``_id`` equals ``id`` from the collection."""
    coll.delete_one({"_id": id})

In [899]:
def remove_old_matches(old, new):
    """Delete from the database the matches of ``old`` that are not in ``new``.

    The previous implementation masked the merged frame with ``where``, which
    blanked the ``_id`` of vanished matches and therefore deleted only the
    matches that were still present. The selection is now done directly on
    the ``_id`` column.

    Args:
        old: DataFrame of stored matches; must have an ``_id`` column.
        new: DataFrame of freshly scraped matches; must have an ``_id`` column.
    """
    passed_matches = old[~old["_id"].isin(new["_id"])]

    for match_id in passed_matches["_id"]:
        remove_match_db(match_id)

In [900]:
def get_team_features(team_name):
    """Look up the stored feature vector of a team.

    The team name is translated through the ``matching`` table ("odds" column
    to "id" column) and the first document of the ``team`` collection whose
    ``_id`` starts with that id is fetched.

    Args:
        team_name: Team name as it appears in the "odds" column of ``matching``.

    Returns:
        A 1-D float array of the document values (without ``_id``), or None
        when the team is missing from the mapping or from the database.
    """
    print(team_name)

    team_ids = matching["id"][matching["odds"] == team_name].values
    if len(team_ids) == 0:
        # Unknown team: same "no features" answer as a missing document,
        # instead of an IndexError.
        return None

    # Reuse the module-level client rather than opening a new connection on
    # every call.
    dic = client['plp']["team"].find_one({"_id": {'$regex': '^' + team_ids[0]}})
    if dic is None:
        return None

    del dic['_id']
    return np.fromiter(dic.values(), dtype=float)

In [901]:
def string_distance(s1, s2):
    """Count position-wise mismatches between two strings plus their length gap.

    Characters are compared pairwise over the shorter length; every extra
    character of the longer string counts as one more mismatch.
    """
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1

    length_gap = abs(len(s1) - len(s2))
    return mismatches + length_gap

# PREDICTION FUNCTIONS
==================================

In [902]:
def create_model():
    """Build and compile the match-outcome network.

    The network is a Sequential stack of three Dense layers (32 relu,
    16 relu, 3 softmax), compiled with the adam optimizer, categorical
    cross-entropy loss and accuracy / precision / recall / AUC metrics.

    Returns:
        The compiled ``tf.keras`` model.
    """
    keras_layers = tf.keras.layers
    keras_metrics = tf.keras.metrics

    network = tf.keras.models.Sequential([
        keras_layers.Dense(32, activation='relu'),
        keras_layers.Dense(16, activation='relu'),
        keras_layers.Dense(3, activation='softmax'),
    ])

    tracked_metrics = [
        keras_metrics.BinaryAccuracy(name='accuracy'),
        keras_metrics.Precision(name='precision'),
        keras_metrics.Recall(name='recall'),
        keras_metrics.AUC(name='auc'),
    ]
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=tracked_metrics,
    )

    return network


In [903]:
def names_to_features(names):
    """Build a single-row feature matrix for a pair of teams.

    Args:
        names: Sequence whose first item is the away team name and whose
            second item is the home team name.

    Returns:
        The away and home feature vectors stacked and flattened to shape
        ``(1, total_number_of_elements)``.
    """
    away_name, home_name = names[0], names[1]
    away = get_team_features(away_name)
    home = get_team_features(home_name)

    stacked = np.array([away, home])
    total = np.prod(stacked.shape)
    return stacked.reshape((1, total))

In [904]:
def remove_accents(s):
    """Return ``s`` without combining marks (accents).

    The string is decomposed with NFD normalisation and every character of
    Unicode category ``Mn`` is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(kept)

In [905]:
# Saved weights, addressed relative to the working directory.
checkpoint_path = "../models/model1.ckpt"
# Directory part of the checkpoint path (not used again in the visible cells).
checkpoint_dir = os.path.dirname(checkpoint_path)
# Build the network, then load the saved weights into it; this module-level
# `model` is the one predict_teams() calls.
model = create_model()
model.load_weights(checkpoint_path)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x1d7eec1c788>

In [906]:
def predict_match(match):
    """Fill ``pred_H``, ``pred_D`` and ``pred_A`` of a match row in place.

    The home and away team names are stripped of accents and handed to
    ``predict_teams``; its three results are stored on ``match``.
    """
    home_name = remove_accents(match.teamH)
    away_name = remove_accents(match.teamA)
    home_win, draw, away_win = predict_teams(home_name, away_name)

    match.pred_H = home_win
    match.pred_D = draw
    match.pred_A = away_win

In [907]:
def predict_teams(teamH, teamA):
    """Predict the outcome probabilities of a match between two teams.

    Both names are normalised, turned into a feature row by
    ``names_to_features`` and fed to the module-level ``model``.

    Args:
        teamH: Home team name.
        teamA: Away team name.

    Returns:
        A tuple of the three values of the first prediction row, or
        ``(0.0, 0.0, 0.0)`` when the prediction fails.
    """
    teamA = _normalise_team_name(teamA)
    teamH = _normalise_team_name(teamH)

    X = names_to_features([teamA, teamH])
    print(np.shape(X))
    try:
        Y = model.predict(x=X, batch_size=1)
        print(Y)
        return Y[0][0], Y[0][1], Y[0][2]
    except Exception as error:
        # Best-effort fallback is kept, but the failure is reported and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print("Prediction failed: " + repr(error))
        return 0.0, 0.0, 0.0


def _normalise_team_name(name):
    """Rewrite a team name: "Saint" becomes "St", "-" and " " become "_", lower-cased."""
    return name.replace("Saint", "St").replace("-", "_").replace(" ", "_").lower()

In [908]:
def calculate_RI(match):
    """Fill ``RI_H``, ``RI_D`` and ``RI_A`` of a match row in place.

    For each outcome suffix (H, D, A) the matching ``odd_*`` and ``pred_*``
    values are passed to ``RI_formula`` and the result is stored in ``RI_*``.
    """
    for outcome in ("H", "D", "A"):
        odd = getattr(match, "odd_" + outcome)
        prediction = getattr(match, "pred_" + outcome)
        setattr(match, "RI_" + outcome, RI_formula(odd, prediction))

In [909]:
def RI_formula(odd, prediction):
    """Return the squared difference between ``prediction`` and ``1 / odd``.

    Args:
        odd: Odd of the outcome (presumably a decimal bookmaker odd, whose
            inverse would be the implied probability; confirm with callers).
        prediction: Predicted value for the same outcome.
    """
    inverse_odd = 1 / odd
    deviation = prediction - inverse_odd
    return deviation ** 2

# MAIN 
==================================

In [910]:
def main_odds():
    """Synchronise the stored matches with the freshly scraped odds.

    Steps:
      1. Load the stored matches and scrape the current odds.
      2. Outer-merge both on ``_id`` to split matches into new, vanished and
         still-listed ones.
      3. New matches: predict, compute RI, upsert.
      4. Vanished matches: delete from the database.
      5. Still-listed matches: keep the stored predictions, recompute RI with
         the current odds, upsert.

    A short summary is printed at the end.
    """
    old = fetch_db_matches()
    new = fetch_odds()

    if len(old) == 0:
        # Empty collection: give the frame the columns the merge below needs.
        old = pd.DataFrame(columns=["_id", "match", "pred_H", "pred_D", "pred_A"])

    merge = new.merge(old, how='outer', on='_id', suffixes=('_new', '_old'))

    data_to_add = merge[merge["match_old"].isna()]
    data_to_delete = merge[merge["match_new"].isna()]
    data_to_update = merge[merge["match_new"] == merge["match_old"]]

    # Filter and add new data. The copy makes the frame independent of `new`
    # so the row write-back below is not a write into a slice.
    add_filtered = new[new["_id"].isin(data_to_add["_id"])].copy()
    for index, match_row in add_filtered.iterrows():
        predict_match(match_row)
        calculate_RI(match_row)
        add_filtered.loc[index] = match_row

    update_db(add_filtered)

    # Filter and delete old data.
    delete_filtered = old[old["_id"].isin(data_to_delete["_id"])]
    for _id in delete_filtered["_id"].values:
        remove_match_db(_id)

    # Filter existing data, and update it.
    update_filtered = new[new["_id"].isin(data_to_update["_id"])].copy()

    # Carry over the stored predictions by `_id`. `new` is sorted by date
    # while `old` comes back in database order, so a positional copy could
    # attach predictions to the wrong match.
    stored = old.drop_duplicates(subset="_id").set_index("_id")
    for column in ("pred_H", "pred_D", "pred_A"):
        update_filtered[column] = update_filtered["_id"].map(stored[column])

    for index, match_row in update_filtered.iterrows():
        calculate_RI(match_row)
        update_filtered.loc[index] = match_row

    update_db(update_filtered)

    print("Done.")
    print("New : "+str(len(add_filtered)))
    print("Updated : "+str(len(update_filtered)))
    print("Removed : "+str(len(delete_filtered)))
    print("Total matches in base : "+str(coll.estimated_document_count()))

In [911]:
### MAIN
# Run one synchronisation pass: scrape the odds, then add, update and remove
# matches in the database; the summary is printed by main_odds() itself.

main_odds()

Done.
New : 0
Updated : 88
Removed : 0
Total matches in base : 88
