In [12]:
import pandas as pd
import numpy as np
import collections

Была использована статья https://habr.com/ru/post/456226/

In [13]:
# Team statistics table (source: https://fbref.com/en/comps/1/World-Cup-Stats)
data = pd.read_csv("data.csv", sep=',', encoding='cp1251')

# Match list (needed for the win/loss outcomes)
matches = pd.read_csv("matches.csv", sep=',', encoding='cp1251')

# Preview the first rows of the statistics table
data.head()


Unnamed: 0,Country,Wins,Draws,Losses,Scored,Missed,Differance,Points,xG,xGA,xGD,xGD/90
0,Netherlands,2,1,0,5,1,4,7,2.4,2.7,-0.3,-0.11
1,Senegal,2,0,1,5,4,1,6,3.8,2.5,1.3,0.43
2,Ecuador,1,1,1,4,3,1,4,3.7,2.6,1.2,0.39
3,Qatar,0,0,3,1,7,-6,0,1.4,3.5,-2.1,-0.71
4,England,2,1,0,9,2,7,7,5.2,2.3,2.9,0.97


In [14]:
# Keep the column of team names in its own variable
teams = data['Country']

# Return the list of statistic values for one team - the team's "data vector"
def GetTeamStat(team):
    """Return the statistics of ``team`` from the global ``data`` table.

    Args:
        team: Value to look up in the ``'Country'`` column.

    Returns:
        A list with the team's values for, in order: Wins, Draws, Losses,
        Scored, Missed, Differance, Points, xG, xGA, xGD, xGD/90.
        If several rows match, the first one is used.
        ``None`` if no row matches ``team``.
    """
    # 'Differance' is spelled the way the column is named in the CSV.
    stat_columns = ('Wins', 'Draws', 'Losses', 'Scored', 'Missed', 'Differance',
                    'Points', 'xG', 'xGA', 'xGD', 'xGD/90')

    # Find matching rows by position, so the lookup does not depend on the
    # DataFrame's index labels being exactly 0..n-1.
    row_positions = np.flatnonzero((data['Country'] == team).to_numpy())
    if row_positions.size == 0:
        return None  # unknown team

    first_row = row_positions[0]
    # Read column by column so every value keeps its own column dtype
    # (integer counts are not upcast to float).
    return [data[column].iloc[first_row] for column in stat_columns]

In [15]:
GetTeamStat('Qatar')

# The meaning of each statistic is explained at https://fbref.com/en/comps/1/World-Cup-Stats

[0, 0, 3, 1, 7, -6, 0, 1.4, 3.5, -2.1, -0.71]

In [16]:
# Gather the data vectors of all teams into one shared collection
def GetAllTeamStat():
    """Map every name in the global ``teams`` to its ``GetTeamStat`` vector.

    Returns:
        A ``collections.defaultdict(list)`` keyed by team name. Because of
        the ``list`` default factory, looking up a name that is not present
        yields an empty list instead of raising ``KeyError``.
    """
    return collections.defaultdict(
        list,
        {name: GetTeamStat(name) for name in teams},
    )


In [17]:
# Display the team -> data vector mapping
GetAllTeamStat()

defaultdict(list,
            {'Netherlands': [2, 1, 0, 5, 1, 4, 7, 2.4, 2.7, -0.3, -0.11],
             'Senegal': [2, 0, 1, 5, 4, 1, 6, 3.8, 2.5, 1.3, 0.43],
             'Ecuador': [1, 1, 1, 4, 3, 1, 4, 3.7, 2.6, 1.2, 0.39],
             'Qatar': [0, 0, 3, 1, 7, -6, 0, 1.4, 3.5, -2.1, -0.71],
             'England': [2, 1, 0, 9, 2, 7, 7, 5.2, 2.3, 2.9, 0.97],
             'United States': [1, 2, 0, 2, 1, 1, 5, 2.6, 2.7, -0.1, -0.03],
             'Iran': [1, 0, 2, 4, 7, -3, 3, 3.2, 4.2, -0.9, -0.32],
             'Wales': [0, 1, 2, 1, 6, -5, 1, 2.7, 4.6, -1.9, -0.63],
             'Argentina': [2, 0, 1, 5, 2, 3, 6, 6.0, 0.7, 5.2, 1.74],
             'Poland': [1, 1, 1, 2, 2, 0, 4, 2.8, 5.9, -3.1, -1.04],
             'Mexico': [1, 1, 1, 2, 3, -1, 4, 3.3, 2.4, 0.9, 0.3],
             'Saudi Arabia': [1, 0, 2, 3, 5, -2, 3, 3.0, 6.0, -3.0, -1.01],
             'France': [2, 0, 1, 6, 3, 3, 6, 7.2, 1.6, 5.5, 1.85],
             'Australia': [2, 0, 1, 3, 4, -1, 6, 1.7, 5.6, -3.9, -1.29],


In [18]:
def GetTrainingData():
    """Build the training arrays from the global ``matches`` table.

    For every row of ``matches`` the feature row is the element-wise
    difference between the data vector of the team in column ``'Команда'``
    and the data vector of its opponent in column ``'Соперник'``. The label
    is 1 when that team equals the value in ``'Победитель'`` and 0 otherwise.

    If either team has no data vector (``GetAllTeamStat`` returns a
    ``defaultdict(list)``, so an unknown name gives ``[]``), the difference is
    empty and the feature row stays all zeros; the label is still set.

    Returns:
        (xTrain, yTrain): arrays of shape ``(n_matches, n_features)`` and
        ``(n_matches,)``.
    """
    team_vectors = GetAllTeamStat()
    totalNumGames = len(matches.index)

    # Take the feature count from any collected vector, not from one
    # hard-coded team that may be missing from the statistics table.
    numFeatures = len(next(iter(team_vectors.values()), []))

    xTrain = np.zeros((totalNumGames, numFeatures))
    yTrain = np.zeros(totalNumGames)

    fixtures = zip(matches['Команда'], matches['Соперник'], matches['Победитель'])
    for game_idx, (team, rival, winner) in enumerate(fixtures):
        # Difference vector: this team's stats minus the opponent's stats
        diff = [a - b for a, b in zip(team_vectors[team], team_vectors[rival])]
        if diff:
            xTrain[game_idx] = diff
        # yTrain is pre-filled with zeros (0 = not a win), so only wins are written
        if team == winner:
            yTrain[game_idx] = 1
    return xTrain, yTrain

# Build the feature matrix and the label vector used to fit the model
xTrain, yTrain = GetTrainingData()  

In [19]:
# Fit a linear regression on the stat-difference vectors (xTrain);
# the 0/1 labels in yTrain are the regression target.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xTrain, yTrain)

In [20]:
def createGamePrediction(team1_vector, team2_vector):
    """Score a match-up from the point of view of the first team.

    The element-wise difference ``team1_vector - team2_vector`` (truncated to
    the shorter vector) is passed as a single sample to the global
    ``model.predict``.

    Returns:
        Whatever ``model.predict`` returns for that one-sample batch. With the
        linear regression fitted above, the value is not limited to [0, 1].
    """
    feature_delta = []
    for own_stat, rival_stat in zip(team1_vector, team2_vector):
        feature_delta.append(own_stat - rival_stat)
    # predict() expects a 2-D batch, so wrap the single sample in a list
    return model.predict([feature_delta])

In [21]:
# Head-to-head prediction for a single pairing
team1_name, team2_name = "France", "Argentina"
team1_vector, team2_vector = GetTeamStat(team1_name), GetTeamStat(team2_name)

# Score the match once from each side's perspective
print('Вероятность, что выиграет ' + team1_name + ':', createGamePrediction(team1_vector, team2_vector))
print('Вероятность, что выиграет ' + team2_name + ':', createGamePrediction(team2_vector, team1_vector))

Вероятность, что выиграет France: [0.43551142]
Вероятность, что выиграет Argentina: [0.43547457]


In [22]:
# Score Argentina against every other team in the statistics table
for team_name in teams:
    team1_name, team2_name = "Argentina", team_name

    # A team cannot play against itself
    if team1_name == team2_name:
        continue

    team1_vector = GetTeamStat(team1_name)
    team2_vector = GetTeamStat(team2_name)

    print(
        team1_name,
        createGamePrediction(team1_vector, team2_vector),
        " - ",
        team2_name,
        createGamePrediction(team2_vector, team1_vector),
    )

Argentina [0.48094063]  -  Netherlands [0.39004536]
Argentina [0.57942093]  -  Senegal [0.29156507]
Argentina [0.79504265]  -  Ecuador [0.07594334]
Argentina [1.06519052]  -  Qatar [-0.19420453]
Argentina [0.52519101]  -  England [0.34579498]
Argentina [0.60797646]  -  United States [0.26300953]
Argentina [0.85329621]  -  Iran [0.01768978]
Argentina [0.92407602]  -  Wales [-0.05309003]
Argentina [0.5981024]  -  Poland [0.27288359]
Argentina [0.68890342]  -  Mexico [0.18208257]
Argentina [0.71763409]  -  Saudi Arabia [0.15335191]
Argentina [0.43547457]  -  France [0.43551142]
Argentina [0.63187854]  -  Australia [0.23910745]
Argentina [0.68616897]  -  Tunisia [0.18481702]
Argentina [0.97714828]  -  Denmark [-0.10616228]
Argentina [0.49913806]  -  Japan [0.37184793]
Argentina [0.69069432]  -  Spain [0.18029167]
Argentina [0.36756049]  -  Germany [0.50342551]
Argentina [0.75155839]  -  Costa Rica [0.1194276]
Argentina [0.46269336]  -  Morocco [0.40829264]
Argentina [0.50315125]  -  Croati