In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import joblib


In [2]:
df = pd.read_csv('matches.csv')
df.info



<bound method DataFrame.info of       Unnamed: 0        date   time            comp         round  day venue  \
0              1  2021-08-15  16:30  Premier League   Matchweek 1  Sun  Away   
1              2  2021-08-21  15:00  Premier League   Matchweek 2  Sat  Home   
2              3  2021-08-28  12:30  Premier League   Matchweek 3  Sat  Home   
3              4  2021-09-11  15:00  Premier League   Matchweek 4  Sat  Away   
4              6  2021-09-18  15:00  Premier League   Matchweek 5  Sat  Home   
...          ...         ...    ...             ...           ...  ...   ...   
1384          38  2021-05-02  19:15  Premier League  Matchweek 34  Sun  Away   
1385          39  2021-05-08  15:00  Premier League  Matchweek 35  Sat  Home   
1386          40  2021-05-16  19:00  Premier League  Matchweek 36  Sun  Away   
1387          41  2021-05-19  18:00  Premier League  Matchweek 37  Wed  Away   
1388          42  2021-05-23  16:00  Premier League  Matchweek 38  Sun  Home   

     re

In [23]:
# df['result']
# df.head(10)
df.sample(10)


Unnamed: 0,team,opponent,venue,gf,ga,xg,xga,poss,winner
939,Leeds United,Leicester City,Home,1.0,4.0,1.0,3.5,67.0,Away
973,Everton,Crystal Palace,Away,2.0,1.0,2.0,0.4,58.0,Away
1350,West Bromwich Albion,Leeds United,Away,1.0,3.0,1.7,2.0,37.0,Home
91,Arsenal,Liverpool,Home,0.0,2.0,0.5,0.8,50.0,Away
1146,Crystal Palace,Burnley,Home,0.0,3.0,0.6,1.3,53.0,Away
835,West Ham United,Everton,Away,1.0,0.0,1.3,0.3,40.0,Away
174,West Ham United,Aston Villa,Away,4.0,1.0,2.3,0.7,58.0,Away
1144,Crystal Palace,Newcastle Utd,Away,2.0,1.0,0.9,1.8,39.0,Away
1008,Everton,Manchester City,Away,0.0,5.0,1.2,2.6,32.0,Home
349,Brentford,Manchester City,Home,0.0,1.0,0.3,0.8,24.0,Away


In [4]:
def decide_winner(row):
    if row['result'] == 'W':
        return 'Home' if row['venue'] == 'Home' else 'Away'
    elif row['result'] == 'L':
        return 'Away' if row['venue'] == 'Home' else 'Home'
    else:
        return 'Draw'

df['winner'] = df.apply(decide_winner, axis=1)


df = df[df['winner'] != 'Draw']

In [5]:
df = df[['team', 'opponent', 'venue', 'gf', 'ga', 'xg', 'xga', 'poss', 'winner']]

# Drop rows with missing important fields
df.dropna(subset=['team', 'opponent', 'venue', 'winner'], inplace=True)


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1074 entries, 0 to 1388
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   team      1074 non-null   object 
 1   opponent  1074 non-null   object 
 2   venue     1074 non-null   object 
 3   gf        1074 non-null   float64
 4   ga        1074 non-null   float64
 5   xg        1074 non-null   float64
 6   xga       1074 non-null   float64
 7   poss      1074 non-null   float64
 8   winner    1074 non-null   object 
dtypes: float64(5), object(4)
memory usage: 83.9+ KB


In [7]:
df.shape

(1074, 9)

In [8]:
X = df[['team', 'opponent', 'venue', 'gf', 'ga', 'xg', 'xga', 'poss']]
y = df['winner']


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [10]:
# Categorical features
categorical_features = ['team', 'opponent', 'venue']

# Numerical features
numerical_features = ['gf', 'ga', 'xg', 'xga', 'poss']

# Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ])


In [11]:

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])


In [None]:

pipeline.fit(X_train, y_train)


In [13]:

joblib.dump(pipeline, 'model.pkl')

teams = sorted(df['team'].unique())
joblib.dump(teams, 'teams.pkl')


['teams.pkl']

In [14]:
print(pipeline.score(X_test, y_test))


0.9953488372093023


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
log_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

log_model.fit(X_train, y_train)

# 5. Evaluate
print(log_model.score(X_test, y_test))

0.4511627906976744


In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier())
])

knn_model.fit(X_train, y_train)
print(knn_model.score(X_test, y_test))


0.958139534883721


In [17]:
burnley_matches = df[df['team'] == 'Burnley']
burnley_away_matches = burnley_matches[burnley_matches['venue'] == 'Away']

print("Total away matches:", len(burnley_away_matches))
print("\nBurnley Away Results:")
print(burnley_away_matches['winner'].value_counts())


Total away matches: 25

Burnley Away Results:
winner
Home    18
Away     7
Name: count, dtype: int64


In [18]:

team_stats = df.groupby('team').agg(
    avg_gf=('gf', 'mean'),
    avg_ga=('ga', 'mean'),
    avg_xg=('xg', 'mean'),
    avg_xga=('xga', 'mean'),
    avg_poss=('poss', 'mean')
).reset_index()


team_stats = team_stats.round(2)


team_stats.to_csv('team_stats.csv', index=False)




In [None]:
home_team = 'Liverpool'
away_team = 'Burnley'
venue = 'Home'  

# Create a new sample for prediction
sample = pd.DataFrame([{
    'team': home_team,
    'opponent': away_team,
    'venue': venue,
    'gf': 0.92,    
    'ga': 1.69,
    'xg': 1.05,
    'xga': 1.55,
    'poss': 40.73
}])


predicted_winner = pipeline.predict(sample)[0]
predicted_proba = pipeline.predict_proba(sample).max()

print(f"Manual Prediction Result:")
print(f"Match: {home_team} vs {away_team} ({venue})")
print(f"Predicted Winner: {predicted_winner} ({predicted_proba*100:.1f}% confidence)")



🏆 Manual Prediction Result:
Match: Liverpool vs Burnley (Home)
Predicted Winner: Away (71.0% confidence)


In [20]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict on X_test
y_pred = pipeline.predict(X_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[104   0]
 [  1 110]]

Classification Report:
               precision    recall  f1-score   support

        Away       0.99      1.00      1.00       104
        Home       1.00      0.99      1.00       111

    accuracy                           1.00       215
   macro avg       1.00      1.00      1.00       215
weighted avg       1.00      1.00      1.00       215

