In [5]:
import pandas as pd

# 1. Load the dataset from our data/raw folder
# The '../' means 'go up one directory level' from notebooks/, so the path is
# resolved relative to this notebook instead of being tied to one user's
# machine (the previous absolute C:\Users\... path only worked on that PC).
RAW_MATCHES_CSV = '../data/raw/premier_league_matches.csv'
df = pd.read_csv(RAW_MATCHES_CSV)

# 2. Convert 'date' column to datetime objects
df['date'] = pd.to_datetime(df['date'])

# 3. Sort matches chronologically
# A stable sort keeps matches that kick off at the same instant in their
# original file order, so re-running the cell always yields the same row order.
df = df.sort_values('date', kind='stable')

# 4. Create our numerical target variable
# We'll map HOME_TEAM win to 1, DRAW to 0, and AWAY_TEAM win to 2.
# Any 'result' value outside this mapping becomes NaN in 'target'.
df['target'] = df['result'].map({'HOME_TEAM': 1, 'DRAW': 0, 'AWAY_TEAM': 2})

# --- Verification ---
# Use df.info() to see the data types of our columns (Date should be datetime64)
print("DataFrame Info:")
df.info()

# Display the first few rows with our new 'target' column
print("\nDataFrame Head:")
print(df.head())

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 0 to 29
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   date        30 non-null     datetime64[ns, UTC]
 1   home_team   30 non-null     object             
 2   away_team   30 non-null     object             
 3   home_goals  30 non-null     int64              
 4   away_goals  30 non-null     int64              
 5   result      30 non-null     object             
 6   target      30 non-null     int64              
dtypes: datetime64[ns, UTC](1), int64(3), object(3)
memory usage: 1.9+ KB

DataFrame Head:
                       date                  home_team            away_team  \
0 2025-08-15 19:00:00+00:00               Liverpool FC      AFC Bournemouth   
1 2025-08-16 11:30:00+00:00             Aston Villa FC  Newcastle United FC   
2 2025-08-16 14:00:00+00:00  Brighton & Hove Albion FC            Fulham FC   
3 2

In [6]:
# Define the number of past games to consider for form
ROLLING_WINDOW = 5

# A list of all unique team names
teams = pd.unique(df[['home_team', 'away_team']].values.ravel('K'))

# Create empty columns for our new form features, initializing them with 0.0
form_features = [
    'form_points', 'form_goals_scored', 'form_goals_conceded', 'form_goal_difference'
]
form_columns = [f'{side}_{feature}' for side in ['home', 'away'] for feature in form_features]
for column in form_columns:
    df[column] = 0.0


def _form_before_match(per_match_stat):
    """Return the rolling mean of a team's stat over its previous matches.

    .shift(1) excludes the current match so the feature only uses information
    available before kick-off (prevents data leakage). min_periods=1 yields a
    value as soon as one previous match exists; the team's very first match
    has no history and is therefore NaN.
    """
    return per_match_stat.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean()


# --- Calculate Form for Each Team ---
# NOTE: the rolling window follows row order, so this relies on df already
# being sorted chronologically by the earlier cell.
for team in teams:
    # Filter for all matches played by the current team
    team_matches = df[(df['home_team'] == team) | (df['away_team'] == team)]
    played_home = team_matches['home_team'] == team
    played_away = team_matches['away_team'] == team

    # Points awarded: 3 for a win, 1 for a draw, 0 for a loss
    won = (played_home & (team_matches['result'] == 'HOME_TEAM')) | \
          (played_away & (team_matches['result'] == 'AWAY_TEAM'))
    drew = team_matches['result'] == 'DRAW'
    points = won.astype(int) * 3 + (drew & ~won).astype(int)

    # Goals from this team's point of view, whichever side it played on
    goals_scored = team_matches['home_goals'].where(played_home, team_matches['away_goals'])
    goals_conceded = team_matches['away_goals'].where(played_home, team_matches['home_goals'])

    # --- Calculate Rolling Averages ---
    rolling_stats = {
        'form_points': _form_before_match(points),
        'form_goals_scored': _form_before_match(goals_scored),
        'form_goals_conceded': _form_before_match(goals_conceded),
        'form_goal_difference': _form_before_match(goals_scored - goals_conceded),
    }

    # --- Update the main DataFrame ---
    # Write the team's form into the home_* columns for matches it hosted and
    # into the away_* columns for matches it played away.
    home_rows = team_matches.index[played_home.to_numpy()]
    away_rows = team_matches.index[played_away.to_numpy()]
    for feature, values in rolling_stats.items():
        df.loc[home_rows, f'home_{feature}'] = values.loc[home_rows]
        df.loc[away_rows, f'away_{feature}'] = values.loc[away_rows]

# Each team's first match has no previous games, creating 'NaN' (Not a Number) values.
# We'll fill these with 0 -- but only in the form columns, so missing values in
# any other column are not silently turned into zeros.
df[form_columns] = df[form_columns].fillna(0.0)

# --- Verification ---
# Display the last 5 rows to see the new form features for recent games
print("DataFrame with Form Features (Last 5 matches):")
print(df.tail())

DataFrame with Form Features (Last 5 matches):
                        date                  home_team            away_team  \
25 2025-08-30 16:30:00+00:00            Leeds United FC  Newcastle United FC   
26 2025-08-31 13:00:00+00:00  Brighton & Hove Albion FC   Manchester City FC   
27 2025-08-31 13:00:00+00:00       Nottingham Forest FC   West Ham United FC   
28 2025-08-31 15:30:00+00:00               Liverpool FC           Arsenal FC   
29 2025-08-31 18:00:00+00:00             Aston Villa FC    Crystal Palace FC   

    home_goals  away_goals     result  target  home_form_points  \
25           0           0       DRAW       0               1.5   
26           2           1  HOME_TEAM       1               0.5   
27           0           3  AWAY_TEAM       2               2.0   
28           1           0  HOME_TEAM       1               3.0   
29           0           3  AWAY_TEAM       2               0.5   

    home_form_goals_scored  home_form_goals_conceded  \
25           

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# NOTE: scikit-learn depends on scipy. The ModuleNotFoundError recorded for this
# cell means scipy is missing from the kernel's environment; install it there
# before running.

# 1. Select our features (the 'clues') and the target (the 'answer')
features = [
    'home_form_points', 'home_form_goals_scored', 'home_form_goals_conceded', 'home_form_goal_difference',
    'away_form_points', 'away_form_goals_scored', 'away_form_goals_conceded', 'away_form_goal_difference'
]

X = df[features]
y = df['target']

# 2. Split data into training and testing sets (80% train, 20% test)
# Matches are a time series: a shuffled split would train on games played AFTER
# some of the test games, leaking future information into the model. With
# shuffle=False the first 80% of rows (earliest matches) are used for training
# and the final 20% (most recent matches) for testing.
assert df['date'].is_monotonic_increasing, "df must be sorted by 'date' before a chronological split"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# 3. Initialize and train our Logistic Regression model
# The model 'learns' the relationship between X_train and y_train
model = LogisticRegression()
model.fit(X_train, y_train)

# 4. Evaluate the model's accuracy on the unseen test data
accuracy = model.score(X_test, y_test)

print(f"Our first model's accuracy on the test set is: {accuracy*100:.2f}%")

ModuleNotFoundError: No module named 'scipy'