In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_absolute_error

In [None]:
# Read both season files (2022/23 and 2023/24, judging by the file names)
# and stack them into a single frame with a fresh 0..n-1 index.
season_files = ('pl_22_23.csv', 'pl_23_24.csv')
df, df_22 = [pd.read_csv(path, encoding='utf-8') for path in season_files]
df_22_24 = pd.concat((df, df_22), ignore_index=True)

In [None]:
# Peek at the first two rows loaded from 'pl_22_23.csv'.
df.head(2)

In [None]:
# Peek at the first two rows loaded from 'pl_23_24.csv'.
df_22.head(2)

In [None]:
# Peek at the first two rows of the concatenated frame.
df_22_24.head(2)

In [None]:
# Number of rows (matches) per home team in the combined frame.
df_22_24['HomeTeam'].value_counts()

In [None]:
# (rows, columns) of the combined frame.
df_22_24.shape

In [None]:
# Column dtypes, inspected before the Date and Time columns are converted below.
df_22_24.dtypes

In [None]:
# Turn the textual Date column into real datetimes (day comes first in the text).
parsed_dates = pd.to_datetime(df_22_24['Date'], dayfirst=True)
df_22_24['Date'] = parsed_dates
df_22_24['Date'].head(2)

In [None]:
# Day of the week as an integer, Monday = 0 ... Sunday = 6.
df_22_24['Day_code'] = df_22_24['Date'].dt.weekday
df_22_24['Day_code'].head(2)

In [None]:
# Keep only the first two characters of the kick-off time and store them as an integer.
kickoff_prefix = df_22_24['Time'].str[:2]
df_22_24['Time_code'] = kickoff_prefix.astype('int64')
df_22_24['Time_code'].head(2)

In [None]:
# Fit ONE encoder on every team name seen in either column, so a club gets the
# same integer code whether it plays at home or away. Fitting separately per
# column only yields matching codes when both columns happen to contain exactly
# the same set of clubs, and the team lookup table built later is derived from
# the home codes alone.
encoder = LabelEncoder()
encoder.fit(pd.concat([df_22_24.HomeTeam, df_22_24.AwayTeam], ignore_index=True))
df_22_24['H_Team_code'] = encoder.transform(df_22_24.HomeTeam)
df_22_24['A_Team_code'] = encoder.transform(df_22_24.AwayTeam)
#df_22_24[['HomeTeam','H_Team_code']].sort_values('H_Team_code', ascending=False)

In [None]:
# Binary outcome: 1 when the full-time result is 'H', 0 for anything else.
home_won = df_22_24['FTR'] == 'H'
df_22_24['Win'] = home_won.astype(int)
df_22_24['Win'].head(2)

In [None]:
# Total goals in the match: FTHG plus FTAG.
df_22_24['Goals'] = df_22_24['FTHG'].add(df_22_24['FTAG'])
df_22_24['Goals'].unique()

`Building the predictors and the outcome we would like to predict`

In [None]:
# Columns used for modelling. 'Win' is the target, not a predictor: it is dropped
# when X is built, and the goals model later takes features[:-1], so it must stay last.
features = ['Day_code', 'Time_code', 'H_Team_code', 'A_Team_code', 'Win']

In [None]:
# Display the selected predictor and target columns.
df_22_24[features]

In [None]:
# Split the modelling columns into the outcome (y) and the predictors (X).
target = 'Win'
y = df_22_24[target]
X = df_22_24[features].drop(columns=[target])
X.loc[577]

In [526]:
# Lookup table with one row per distinct (HomeTeam, H_Team_code) pair, sorted by name.
team_code_cols = ['HomeTeam', 'H_Team_code']
teams = (
    df_22_24[team_code_cols]
    .drop_duplicates()
    .sort_values(team_code_cols)
    .reset_index(drop=True)
)
teams

Unnamed: 0,HomeTeam,H_Team_code
0,Arsenal,0
1,Aston Villa,1
2,Bournemouth,2
3,Brentford,3
4,Brighton,4
5,Burnley,5
6,Chelsea,6
7,Crystal Palace,7
8,Everton,8
9,Fulham,9


In [None]:
def decoder(team: str) -> int:
    """Return the integer code assigned to a team name.

    The name is looked up in the module-level ``teams`` table (columns
    ``HomeTeam`` and ``H_Team_code``).

    Args:
        team: Team name, spelled exactly as in ``teams.HomeTeam``.

    Returns:
        The ``H_Team_code`` value of the first matching row.

    Raises:
        IndexError: If ``team`` does not appear in ``teams.HomeTeam``.
    """
    codes = teams.loc[teams.HomeTeam == team, 'H_Team_code'].values
    if len(codes) == 0:
        # Indexing the empty result would raise IndexError anyway; raise it
        # explicitly so the message names the offending input.
        raise IndexError(f'Unknown team name: {team!r}')
    return codes[0]

`Adding new observations to the dataframe together with their actual results`

In [None]:
def adding_new_data(day: int, time: int, team_h: int, team_a: int, result: int) -> None:
    """Append one match to the module-level ``X`` (predictors) and ``y`` (outcome).

    Args:
        day: Day-of-week code, stored in the first column of ``X``.
        time: Kick-off hour; ``time - 1`` is what gets stored
            (presumably a time-zone adjustment to match the dataset; confirm).
        team_h: Integer code of the home team (e.g. from ``decoder``).
        team_a: Integer code of the away team (e.g. from ``decoder``).
        result: Outcome value appended to ``y`` for this match.

    Returns:
        None. ``X`` and ``y`` are modified in place.
    """
    # Compute the new label ONCE, before X grows. Re-evaluating len(X.index)
    # after the append would put the outcome one label ahead of its feature row.
    # Assumes X carries a 0..n-1 index, so len() is the next free label. TODO confirm
    new_label = len(X.index)
    X.loc[new_label] = [day, time - 1, team_h, team_a]
    y.loc[new_label] = result

In [None]:
# Row count of X before the new matches are appended below.
len(X)

In [None]:
# Inspect the feature row labelled 577.
X.loc[577]

In [None]:
# Matches to append, one tuple per fixture:
# (day code, kick-off hour, home team, away team, home win flag)
new_matches = [
    (1, 20, "Nott'm Forest", "Arsenal", 0),            # 2024-01-30, Tuesday, 1-2
    (1, 20, "Luton", "Brighton", 1),                   # 2024-01-30, Tuesday, 4-0
    (1, 20, "Fulham", "Everton", 0),                   # 2024-01-30, Tuesday, 0-0
    (1, 21, "Crystal Palace", "Sheffield United", 1),  # 2024-01-30, Tuesday, 3-2
    (1, 21, "Aston Villa", "Newcastle", 0),            # 2024-01-30, Tuesday, 1-3
    (2, 20, "Man City", "Burnley", 1),                 # 2024-01-31, Wednesday, 3-1
    (2, 20, "Tottenham", "Brentford", 1),              # 2024-01-31, Wednesday, 3-2
    (2, 21, "Liverpool", "Chelsea", 1),                # 2024-01-31, Wednesday, 4-1
    (3, 20, "West Ham", "Bournemouth", 0),             # 2024-02-01, Thursday, 1-1
    (3, 21, "Wolves", "Man United", 0),                # 2024-02-01, Thursday, 3-4
]
for day_code, kickoff_hour, home_name, away_name, home_win in new_matches:
    adding_new_data(day_code, kickoff_hour, decoder(home_name), decoder(away_name), home_win)

In [None]:
# Feature row with label 578 and the outcome at position 578; presumably the
# first match appended above (confirm that X had 578 rows before appending).
X.loc[578], y.iloc[578]

In [None]:
# Row count of X after the new matches were appended.
len(X)

In [None]:
# From here on the models work with plain NumPy arrays instead of pandas objects.
X, y = np.array(X), np.array(y)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=42)

In [None]:
# Split first, so the scaler never sees the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=15)

# Learn mean/std on the training rows only and apply that same transform to the
# test rows. The scaled data lives in separate arrays on purpose: X_train / X_test
# stay unscaled because the random-forest cells further down fit on them directly.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_log = LogisticRegression()
model_log.fit(X_train_scaled, y_train)
y_pred = model_log.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
f'Accuracy is {accuracy:.2f}'

In [None]:
# threshold = 0.5
# y_pred_proba = model_log.predict_proba(X_test)[:, 1]
# y_pred = np.where(y_pred_proba > threshold, 1, 0)
# y_test[:5], y_pred[:5]

In [None]:
# Hand-count the four confusion-matrix cells for the current y_test / y_pred pair.
actual_win, actual_loss = (y_test == 1), (y_test == 0)
predicted_win, predicted_loss = (y_pred == 1), (y_pred == 0)

tp = int(np.sum(actual_win & predicted_win))
tn = int(np.sum(actual_loss & predicted_loss))
fp = int(np.sum(actual_loss & predicted_win))
# Every remaining sample is counted as a false negative.
fn = len(y_test) - (tp + tn + fp)

tn, fp, fn, tp

In [None]:
# Same counts via scikit-learn: rows are actual classes, columns are predicted classes.
conf = confusion_matrix(y_true=y_test, y_pred=y_pred)
conf

In [None]:
# Unpack the 2x2 matrix row by row: first row is (tn, fp), second row is (fn, tp).
(tn, fp), (fn, tp) = conf
tn, fp, fn, tp

In [None]:
# Plot the logistic-regression confusion matrix.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would render any count of 100 or more as e.g. '1e+02'.
sns.heatmap(conf,
            annot=True,
            fmt='d',
            linecolor='white',
            linewidths=.1,
            xticklabels=['Predicted lose', 'Predicted win'],
            yticklabels=['Actual lose', 'Actual win'])
plt.title('Confusion matrix of LogisticRegression model')
plt.show()
plt.clf()

In [None]:
# Sweep max_depth 1..19 and record train/test accuracy for each forest.
# The forests get no random_state, so they draw from NumPy's global RNG seeded here.
# NOTE(review): the "best" depth is picked by test-set accuracy, so that score is
# an optimistic estimate.
np.random.seed(42)
depths = range(1, 20)
accuracy_train, accuracy_test = [], []
for depth in depths:
    model = RandomForestClassifier(max_depth=depth).fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    accuracy_train.append(accuracy_score(y_train, y_pred_train))
    accuracy_test.append(accuracy_score(y_test, y_pred_test))

best_position = int(np.argmax(accuracy_test))  # first index holding the top test accuracy
best_acc = accuracy_test[best_position]
best_depth = depths[best_position]
f'Best accuracy with using Randomforest is: {best_acc:.3f} and the best depth is: {best_depth} '

In [None]:
# plt.plot(depths, accuracy_train, 'r', depths, accuracy_test, 'b')
# plt.title('RandomForest model accuracy score vs depth')
# plt.legend(['Train accuracy', 'Test accuracy'])
# plt.show()

In [None]:
# Refit a forest at the selected depth with 105 trees and score it on the test split.
model = RandomForestClassifier(n_estimators=105, max_depth=best_depth).fit(X_train, y_train)
y_pred = model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
f'{rf_accuracy:.3f}'

In [None]:
# Random-forest predictions for X_test.
y_pred

In [None]:
# Plot the random-forest confusion matrix.
# fmt='d' prints the cell counts as plain integers; the default annotation
# format ('.2g') would render any count of 100 or more as e.g. '1e+02'.
sns.heatmap(confusion_matrix(y_test, y_pred),
            annot=True,
            fmt='d',
            linecolor='white',
            linewidths=.1,
            xticklabels=['Predicted lose', 'Predicted win'],
            yticklabels=['Actual lose', 'Actual win'])
plt.title('Confusion matrix of RandomForest model')
plt.show()
plt.clf()

`Predicting the total number of full-time goals`

In [None]:
# Predictors for the goals model: every entry of `features` except the last one.
new_features = list(features[:-1])
linear_model = LinearRegression()

In [None]:
# Build the list from scratch instead of extending it in place, so re-running
# this cell cannot append 'Goals' a second time.
new_features = features[:-1] + ['Goals']
new_features

In [None]:
# Fit a linear regression of total goals on the predictors.
# X / y and the train/test names are rebound here to the goals task.
target = 'Goals'
goal_predictors = df_22_24[new_features].drop(columns=[target])
X = np.array(goal_predictors)
y = np.array(df_22_24[target])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=20)
linear_model.fit(X_train, y_train)
y_pred = linear_model.predict(X_test)

In [None]:
# Mean absolute error of the goals regression on its test split.
f'Mean absolute error: {mean_absolute_error(y_test, y_pred)}'

In [None]:
#Tuesday, 7 pm, Nott vs Arsenal
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_1 = np.array([1, 19, decoder("Nott'm Forest"), decoder("Arsenal")]).reshape(1, -1)
test_1, model.predict(test_1), linear_model.predict(test_1)

In [None]:
#Tuesday, 7 pm, Luton vs Brighton
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_2 = np.array([[1, 19, decoder("Luton"), decoder("Brighton")]])
test_2, model.predict(test_2)

In [None]:
#Tuesday, 7 pm, Fulham vs Everton
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_3 = np.array([[1, 19, decoder("Fulham"), decoder("Everton")]])
test_3, model.predict(test_3)

In [None]:
#Tuesday, 8 pm, Crystal vs Sheffield
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_4 = np.array([[1, 20, decoder("Crystal Palace"), decoder("Sheffield United")]])
test_4, model.predict(test_4)

In [None]:
#Tuesday, 8 pm, Aston Villa vs Newcastle
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_5 = np.array([[1, 20, decoder("Aston Villa"), decoder("Newcastle")]])
test_5, model.predict(test_5)

In [None]:
#Wednesday, 7 pm, Man City vs Burnley
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_6 = np.array([[2, 19, decoder("Man City"), decoder("Burnley")]])
test_6, model.predict(test_6)

In [None]:
#Wednesday, 7 pm, Tottenham vs Brentford
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_7 = np.array([[2, 19, decoder("Tottenham"), decoder("Brentford")]])
test_7, model.predict(test_7)

In [None]:
#Wednesday, 8 pm, Liverpool vs Chelsea
# Feature row [day code, hour code, home code, away code] as a 1 x 4 array.
test_8 = np.array([[2, 20, decoder("Liverpool"), decoder("Chelsea")]])
test_8, model.predict(test_8)

In [None]:
def prediction(day: str, time: int, team_h: str, team_a: str) -> tuple:
    """Predict the home-win flag and the total goals for a single fixture.

    Args:
        day: Weekday name ('Monday' ... 'Sunday'); surrounding whitespace and
            letter case are ignored. A non-string value (e.g. an integer day
            code) is passed through unchanged.
        time: Kick-off hour; ``time - 1`` is what is fed to the models
            (presumably a time-zone adjustment to match the dataset; confirm).
        team_h: Home team name, resolved with ``decoder``.
        team_a: Away team name, resolved with ``decoder``.

    Returns:
        ``(result, num_goals)``: the outputs of ``model.predict`` and
        ``linear_model.predict`` for the single feature row.

    Raises:
        ValueError: If ``day`` is a string that is not a weekday name.
    """
    days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    if isinstance(day, str):
        day_name = day.strip().capitalize()
        if day_name not in days:
            # Fail here with a clear message instead of letting a text value
            # slip into the numeric feature array.
            raise ValueError(f'Unknown day {day!r}; expected one of {days}')
        day = days.index(day_name)
    # Build the feature row once and reuse it for both models.
    match_features = np.array([[day, time - 1, decoder(team_h), decoder(team_a)]])
    result = model.predict(match_features)
    num_goals = linear_model.predict(match_features)
    return result, num_goals
# Example call: returns the (result, num_goals) pair for this fixture.
prediction('Tuesday', 19, "Nott'm Forest", 'Arsenal')

In [None]:
# Interactive front end: ask for a fixture, then report the predicted outcome.
# NOTE: this cell blocks on input(), so it stops an unattended "Run All".
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
home_teams = list(df_22_24.HomeTeam.unique())  # computed once, shown in three chunks

# Strip stray whitespace so a trailing space does not break the name lookups.
day = input(f'Enter the day from these options -> {days} when the match will be played: ').strip()
time = int(input('\nEnter the time (e.g. if the match starts at 7 pm you may enter 19) when it will begin: '))
team_h = input(f'''\nEnter the name of the "Home Team" from the following teams:
               \n{home_teams[:8]} or
               \n{home_teams[8:16]} or
               \n{home_teams[16:]}: ''').strip()
team_a = input('\nEnter the name of the "Away Team". The options are the same as above: ').strip()

# Run the models once and reuse both outputs.
result, num_goals = prediction(day, time, team_h, team_a)
if result[0] == 0:
    print(f'\n{team_h} will NOT WIN, meaning the match will be a DRAW or a LOSS.')
else:
    print(f'\n{team_h} will WIN.')
print(f'Expected goals: {num_goals[0]:.2f}')