# SC1015 Project - Premier League Dataset

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics

from sklearn.model_selection import train_test_split

In [None]:
# Read the two CSV files from the working directory
statsData = pd.read_csv('stats.csv')
resultsData = pd.read_csv('results.csv')

# Preview the first rows of the stats table
statsData.head()

## Total of 240 Rows and 42 Columns in stats.CSV file

### 2 Categorical Variables, 40 Numeric Variables

In [None]:
# Print column dtypes and non-null counts, then evaluate (rows, columns)
statsData.info()
statsData.shape

# Data Cleaning

## Check for any missing values in the stats Dataframe

In [None]:
# Work on an independent copy: pd.DataFrame(statsData) does not copy the
# underlying data, so later in-place writes could leak into the raw frame.
statsDataFrame = statsData.copy()

# Number of missing values in each column
missing_values = statsDataFrame.isnull().sum()
print(missing_values)
print()

# Report only the columns that actually contain missing values
missing_values_filtered = missing_values[missing_values > 0]
if not missing_values_filtered.empty:
    print("Columns with missing values and their count:")
    print(missing_values_filtered)
else:
    print("There are no missing values in any column.")

## Check the skewness for each of the columns with missing values

### Total of 6 columns with missing values will be placed in a pd DataFrame

In [None]:
# The six columns that contain missing values
columns_with_gaps = [
    'saves',
    'head_clearance',
    'total_through_ball',
    'backward_pass',
    'big_chance_missed',
    'dispossessed',
]
missingValueColumns = statsDataFrame[columns_with_gaps]

# Skewness of each of those columns
missingValueColumns.skew()

### Analysis:

saves (0.392526): This shows a moderate positive skew, suggesting that most of the data are concentrated on the lower end, with fewer high values extending the tail to the right.

head_clearance (0.519139): Also indicates a moderate positive skew. Similar to "saves", most data points are lower, with some high values stretching the distribution to the right.

total_through_ball (1.725295): This has a high positive skewness, indicating a significant number of lower values and a long tail towards the higher values. This suggests that very high values are relatively rare but significantly impact the distribution's shape.

backward_pass (0.713525): Shows a positive skew but less extreme than total_through_ball. It indicates a concentration of data towards lower values with a tail of higher values.

big_chance_missed (1.065818): With a positive skewness greater than 1, this distribution has a long right tail. There are significantly more lower values, with the higher values stretching the distribution.

dispossessed (0.639976): Exhibits a moderate positive skew, indicating a concentration of lower values with a tail of higher values, but not as pronounced as total_through_ball or big_chance_missed.

Due to the existence of missing values, we decided to exclude the columns for head_clearance, total_through_ball, backward_pass, dispossessed, and saves. These columns pertain to actions such as clearing the ball using the head, executing passes in dynamic situations, making passes back towards one's own side, losing possession to an opponent, and preventing shots from scoring.

As a result, we are now concentrating on the 'big_chance_missed' column, which we consider to be a strong predictor of a team's goals scored and its standing. The 'big_chance_missed' metric records significant scoring opportunities that were not capitalized on, potentially altering the outcome of a game.

Because the data are positively skewed, median imputation is generally more robust than mean imputation: the median is less affected by outliers and skew, making it a more representative measure of central tendency for skewed distributions. We therefore fill in the missing values of big_chance_missed with the median value for each club (20 clubs per season).

## Dropping of columns: head_clearance, total_through_ball, backward_pass, dispossessed, saves

In [None]:
# Columns removed from the working frame
columns_to_drop = [
    'head_clearance',
    'total_through_ball',
    'backward_pass',
    'dispossessed',
    'saves',
]
statsDataFrame.drop(labels=columns_to_drop, axis=1, inplace=True)
statsDataFrame.head()

### statsDataFrame now has 37 columns

## Fill in missing values for big_chance_missed using Median

In [None]:
# Impute missing big_chance_missed values with each club's own median.
# The grouped median skips NaN, so a club's median is itself NaN only when
# none of its rows has a recorded value.
team_medians = statsDataFrame.groupby('team')['big_chance_missed'].median()
print("Median of big_chance_missed per team:")
print(team_medians)

# Map every row to its club's median and use it only where the value is missing
statsDataFrame['big_chance_missed'] = statsDataFrame['big_chance_missed'].fillna(
    statsDataFrame['team'].map(team_medians)
)

# Clubs without any recorded value cannot be imputed from their own data and
# keep NaN; list them so the remaining gap is visible instead of silent.
still_missing = (
    statsDataFrame.loc[statsDataFrame['big_chance_missed'].isna(), 'team']
    .dropna()
    .unique()
)
if len(still_missing) > 0:
    print("Teams whose big_chance_missed is still missing:", ', '.join(sorted(still_missing)))

## Adding of new column "Draw" to the statsDataFrame

Upon reviewing the statsDataFrame, we noticed the absence of a "draw" column that would reflect the number of draws a football team had during the season. Considering each team plays 38 matches annually, we computed the "draw" figures by deducting the sum of wins and losses from 38. After calculating this data, we would integrate this new "draw" column into the existing dataFrame.

In [None]:
# Draws are what remains of the 38 matches after wins and losses
draws = 38 - statsDataFrame['wins'] - statsDataFrame['losses']

# Discard a 'draw' column left over from an earlier run of this cell
if 'draw' in statsDataFrame.columns:
    statsDataFrame.drop(columns='draw', inplace=True)

# Place 'draw' immediately to the right of 'losses'
position = statsDataFrame.columns.get_loc('losses') + 1
statsDataFrame.insert(position, 'draw', draws)

statsDataFrame.shape

## Adding of new column "total_points" to the statsDataFrame

Using wins and draw to tally the total points; 1 win = 3 points, 1 draw = 1 point. 
At the end of each season, team with most points would win.

In [None]:
# Three points per win plus one point per draw
points_tally = statsDataFrame['wins'] * 3 + statsDataFrame['draw']

# Discard a 'total_points' column left over from an earlier run of this cell
if 'total_points' in statsDataFrame.columns:
    statsDataFrame.drop(columns='total_points', inplace=True)

# Place 'total_points' immediately to the right of 'draw'
position = statsDataFrame.columns.get_loc('draw') + 1
statsDataFrame.insert(position, 'total_points', points_tally)

statsDataFrame.shape

## Adding of new column "goal_difference" to the statsDataFrame

In [None]:
# Goal difference = goals scored minus goals conceded, per row
goal_difference = statsDataFrame['goals'] - statsDataFrame['goals_conceded']

# Remove a copy left by an earlier run of this cell. Without this, re-running
# would duplicate 'goal_difference' and push the real last column out.
if 'goal_difference' in statsDataFrame.columns:
    statsDataFrame.drop(columns='goal_difference', inplace=True)

# Place 'goal_difference' immediately to the right of 'total_points'
total_points_index = statsDataFrame.columns.get_loc('total_points') + 1
statsDataFrame.insert(total_points_index, 'goal_difference', goal_difference)

statsDataFrame.info()

## Adding of new column "placing" to the statsDataFrame
"Placing" will mean the position the team finished in for the season.

In [None]:
# Columns used to order teams: total points first, goal difference second
column_to_sort = 'total_points'
secondary_sort_column = 'goal_difference'
# Function to sort every block of rows based on two columns and modify the main DataFrame
def sort_every_20(df, column, secondary_column, group_size=20):
    """Sort each consecutive block of rows in place, highest values first.

    The frame is treated as consecutive blocks of ``group_size`` rows (the
    last block may be shorter). Inside each block the rows are ordered by
    ``column`` descending, then by ``secondary_column`` descending. Only the
    cell values move: the index labels stay where they are.

    Rows are rearranged one column at a time so that every column keeps its
    dtype; writing the block back through ``.values`` would push a single
    object array into all columns of a mixed-dtype frame.

    Parameters:
        df: DataFrame to reorder; it is modified in place.
        column: name of the primary sort column.
        secondary_column: name of the tie-breaking sort column.
        group_size: number of rows per block (default 20).

    Returns:
        The same ``df`` object, after reordering.
    """
    # Positional permutation: order[i] is the old position of the row that
    # ends up at position i.
    order = []
    for start in range(0, len(df), group_size):
        block = df.iloc[start:start + group_size].reset_index(drop=True)
        ranked = block.sort_values(by=[column, secondary_column], ascending=[False, False])
        order.extend(start + ranked.index.to_numpy())

    for name in df.columns:
        # .array drops the old index labels, so the values are assigned by position
        df[name] = df[name].iloc[order].array
    return df

# Order the rows inside every 20-row block by points, then goal difference.
# sort_every_20 modifies statsDataFrame itself and returns that same object.
sorted_statsDataFrame = sort_every_20(statsDataFrame, column_to_sort, secondary_sort_column)

print(sorted_statsDataFrame)

# Finishing position = 1-based rank inside each 20-row block. Derived from the
# row count, so it works for any number of blocks rather than exactly 12.
placing = np.arange(len(statsDataFrame)) % 20 + 1

# Discard a 'placing' column left over from an earlier run of this cell
if 'placing' in statsDataFrame.columns:
    statsDataFrame.drop(columns='placing', inplace=True)

# Place 'placing' immediately to the right of 'total_points'
position = statsDataFrame.columns.get_loc('total_points') + 1
statsDataFrame.insert(position, 'placing', placing)

# Exploratory Data Analysis

In [None]:
# One single-column DataFrame per variable examined below
points = statsDataFrame['total_points'].to_frame()  # Response
goals = statsDataFrame['goals'].to_frame()
goals_conceded = statsDataFrame['goals_conceded'].to_frame()
touches = statsDataFrame['touches'].to_frame()
clean_sheet = statsDataFrame['clean_sheet'].to_frame()
total_scoring_att = statsDataFrame['total_scoring_att'].to_frame()

In [None]:
# Grid of 6 rows (one per variable) by 3 columns (box, histogram, violin)
f, axes = plt.subplots(6, 3, figsize=(28, 28))

# Variables in the order they appear from the top row to the bottom row
variables_to_plot = [points, goals, goals_conceded, touches, clean_sheet, total_scoring_att]

# Draw the three basic uni-variate figures for every variable
for row, frame in enumerate(variables_to_plot):
    sb.boxplot(data=frame, orient="h", ax=axes[row, 0])
    sb.histplot(data=frame, ax=axes[row, 1])
    sb.violinplot(data=frame, orient="h", ax=axes[row, 2])

## Correlation Matrix between variables

In [None]:
# Put the response and the five candidate predictors side by side
frames_to_join = [points, goals, goals_conceded, touches, clean_sheet, total_scoring_att]
jointDF = pd.concat(frames_to_join, axis=1).reindex(points.index)

# Annotated heatmap of the pairwise correlations, colour scale fixed to [-1, 1]
f = plt.figure(figsize=(10, 8))
sb.heatmap(
    jointDF.corr(),
    vmin=-1,
    vmax=1,
    linewidths=1,
    annot=True,
    fmt=".2f",
    annot_kws={"size": 18},
)

# Show the same correlations as a table
jointDF.corr()

### Analysis:

Goals: Highly positively correlated with total points (0.90), making it a strong candidate for inclusion in the model.

Goals Conceded: Strongly negatively correlated with total points (-0.84), so it is another candidate for the model, serving as an indicator of defensive strength.

Touches and Clean Sheet: Both are strongly correlated with total points (0.71 and 0.79 respectively), which indicates that they are important. Note, however, that they are also correlated with each other (coefficient 0.56), which suggests possible multicollinearity if both are included.

Total Scoring Att: Strongly positively correlated with total points (0.76), which shows that this factor on its own is closely tied to the overall team result.

In [None]:
# Print the current columns, non-null counts and dtypes of the working frame
statsDataFrame.info()

# Linear Regression

As we aim to derive valuable insights that contribute to football betting, we plan to explore the relationship between total points and the variables 'goals', 'goals_conceded', 'clean_sheet', 'total_scoring_att', and 'goal_fastbreak' using multivariate linear regression. We intend to structure our data to model it as a time series. For instance, we'll use data from 2006-2015 as training data to test against the 2016 season, and data from 2006-2016 as training data to test against the 2017 season, and so on. The model will predict the total points based on the predictors mentioned above. By testing the model against the data from 2016, 2017, and 2018, we will evaluate whether the model’s prediction accuracy is sufficiently reliable to forecast subsequent season winners.

## Bivariate Linear Regression

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


# Expanding-window splits: train on the first k blocks of 20 rows and test on
# the block that follows. The test slice starts exactly where the training
# slice ends; starting one row later would skip the first row of the test
# block and pull in a row from the block after it.
# NOTE(review): assumes each season occupies 20 consecutive rows - confirm.
season_splits = {
    "9_seasons": (statsDataFrame.iloc[:180], statsDataFrame.iloc[180:200]),
    "10_seasons": (statsDataFrame.iloc[:200], statsDataFrame.iloc[200:220]),
    "11_seasons": (statsDataFrame.iloc[:220], statsDataFrame.iloc[220:240]),
}

predictors = ["goals", "goals_conceded", "clean_sheet", "total_scoring_att", "goal_fastbreak"]

# One estimator, re-fitted for every (split, predictor) pair
linreg = LinearRegression()

# One row of plots per split, one column per predictor
fig, axes = plt.subplots(nrows=len(season_splits), ncols=len(predictors), figsize=(20, 15), constrained_layout=True)
if len(season_splits) == 1:  # Handling the case where there's only one row
    axes = [axes]

# Fit a single-predictor model for every split and every predictor
for season_index, (season, (train_data, test_data)) in enumerate(season_splits.items()):
    print(f"Model results for training on {season} and testing on the next season:")
    for predictor_index, predictor in enumerate(predictors):
        # Fit on the training rows using this predictor only
        X_train = train_data[[predictor]]
        y_train = train_data["total_points"]
        linreg.fit(X_train, y_train)

        # Goodness of fit on the training rows
        train_score = linreg.score(X_train, y_train)
        y_train_pred = linreg.predict(X_train)
        train_mse = mean_squared_error(y_train, y_train_pred)

        # Goodness of fit on the held-out rows
        X_test = test_data[[predictor]]
        y_test = test_data["total_points"]
        test_score = linreg.score(X_test, y_test)
        y_test_pred = linreg.predict(X_test)
        test_mse = mean_squared_error(y_test, y_test_pred)

        # Training points, fitted line and test points on one axis
        ax = axes[season_index][predictor_index]
        ax.scatter(X_train, y_train, color="blue", label="Training Data")
        ax.plot(X_train, y_train_pred, color="black", label="Model Prediction")
        ax.scatter(X_test, y_test, color="red", label="Test Data")
        ax.set_title(f"{season} - {predictor}")
        ax.set_xlabel(predictor)
        ax.set_ylabel("Total Points")
        ax.legend()

        # Print the results including intercept and coefficients
        print(f"Model details for predictor: {predictor}")
        print(f" Intercept of Regression: {linreg.intercept_}")
        print(f" Coefficient of Regression for {predictor}: {linreg.coef_[0]}")
        print(" Goodness of Fit - Train Dataset:")
        print(f"  R^2 = {train_score:.4f}")
        print(f"  MSE = {train_mse:.4f}")
        print(" Goodness of Fit - Test Dataset:")
        print(f"  R^2 = {test_score:.4f}")
        print(f"  MSE = {test_mse:.4f}")
        print()
    print("-----------------------------------------------------------------")

plt.show()

In [None]:
# Show the first 181 rows, i.e. up to and including row position 180
statsDataFrame.head(181)

## Multivariate Linear Regression

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


# Each test slice starts exactly where its training slice ends. Starting one
# row later would skip the first row of the test block and pull in a row from
# the block after it.
# NOTE(review): assumes each season occupies 20 consecutive rows - confirm.
train_data_9_seasons = statsDataFrame.iloc[:180]   # First 9 seasons samples for training
test_data_10th_season = statsDataFrame.iloc[180:200]   # 10th season samples for testing

train_data_10_seasons = statsDataFrame.iloc[:200]   # First 10 seasons samples for training
test_data_11th_season = statsDataFrame.iloc[200:220]   # 11th season samples for testing

train_data_11_seasons = statsDataFrame.iloc[:220]   # First 11 seasons samples for training
test_data_12th_season = statsDataFrame.iloc[220:240]   # 12th season samples for testing


# Function to train the model and plot results
def train_and_plot(X_train, y_train, X_test, y_test, title_suffix):
    """Fit a linear regression, print its parameters and fit quality, and plot it.

    The model is fitted on (X_train, y_train). The intercept and the
    per-predictor coefficients are printed, then true-versus-predicted
    scatter plots are shown for the train and test sets, followed by
    R^2 and MSE for both sets. ``title_suffix`` labels the printed output
    and the plot titles. Nothing is returned.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Flatten a 2-D coefficient array to the list for its first row
    coefficients = model.coef_
    if len(coefficients.shape) > 1:
        coefficients = coefficients[0]
    coef_list = coefficients.tolist()

    # Report the fitted parameters
    print(f'Intercept of Regression for {title_suffix}: b = ', model.intercept_)
    print(f'Coefficients of Regression for {title_suffix}:')
    coefficient_table = pd.DataFrame(
        list(zip(X_train.columns, coef_list)),
        columns=["Predictors", "Coefficients"],
    )
    print(coefficient_table)
    print()

    # Predictions for both data sets
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # One panel per data set: predictions against true values, plus the
    # line on which predicted equals true
    panels = (
        ("Train", y_train, y_train_pred, "blue"),
        ("Test", y_test, y_test_pred, "green"),
    )
    f, axes = plt.subplots(1, 2, figsize=(24, 12))
    for ax, (split, actual, predicted, colour) in zip(axes, panels):
        ax.scatter(actual, predicted, color=colour)
        ax.plot(actual, actual, 'w-', linewidth=1)
        ax.set_xlabel(f"True values of the Response Variable ({split})")
        ax.set_ylabel(f"Predicted values of the Response Variable ({split})")
        ax.set_title(f"{split} Predictions for {title_suffix}")
    plt.show()

    # Goodness of fit for both data sets
    evaluations = (
        ("Train", X_train, y_train, y_train_pred),
        ("Test", X_test, y_test, y_test_pred),
    )
    for split, features, actual, predicted in evaluations:
        print(f"Goodness of Fit of Model \t{split} Dataset ({title_suffix})")
        print("Explained Variance (R^2) \t:", model.score(features, actual))
        print("Mean Squared Error (MSE) \t:", mean_squared_error(actual, predicted))
        print()
    
# Predictors shared by all three models
feature_columns = ["goals", "goals_conceded", "clean_sheet", "total_scoring_att", "goal_fastbreak"]

# (training rows, testing rows, label) for each expanding-window run
evaluation_runs = (
    (train_data_9_seasons, test_data_10th_season, "9 Seasons Training, Test on 10th Season"),
    (train_data_10_seasons, test_data_11th_season, "10 Seasons Training, Test on 11th Season"),
    (train_data_11_seasons, test_data_12th_season, "11 Seasons Training, Test on 12th Season"),
)

# Fit, report and plot one model per run
for training_rows, testing_rows, run_label in evaluation_runs:
    X_train = training_rows[feature_columns]
    y_train = training_rows["total_points"]
    X_test = testing_rows[feature_columns]
    y_test = testing_rows["total_points"]

    train_and_plot(X_train, y_train, X_test, y_test, run_label)

# Classification

# Extra code

In [None]:
# Every season that occurs in the data
all_seasons = set(statsDataFrame['season'].unique())

# For each team, the set of seasons in which it has a row
teams_seasons = statsDataFrame.groupby('team')['season'].agg(set)

# Keep only the teams whose season set equals the full set of seasons
played_every_season = [seasons == all_seasons for seasons in teams_seasons]
consistent_teams = teams_seasons[played_every_season]

# Names of those teams
consistent_team_list = list(consistent_teams.index)

# Rows of the original DataFrame that belong to those teams
is_consistent_team = statsDataFrame['team'].isin(consistent_team_list)
filtered_df = statsDataFrame[is_consistent_team]
print("Teams that appeared in all seasons:", consistent_team_list)

## Adding of new column "qualifications" to the statsDataFrame

Positions 1-4 qualify for the Champions League | Position 5 qualifies for the Europa League | Positions 18-20 are relegated | The remaining positions earn no qualification

In [None]:
def determine_qualification(placing):
    """Map a finishing position to its qualification label.

    Positions 1-4 give 'Champions League', position 5 gives 'Europa League',
    positions 18-20 give 'Relegation'; any other value gives
    'No qualifications'.
    """
    # (lowest position, highest position, label), both bounds inclusive
    bands = (
        (1, 4, 'Champions League'),
        (5, 5, 'Europa League'),
        (18, 20, 'Relegation'),
    )
    for lowest, highest, label in bands:
        if lowest <= placing <= highest:
            return label
    return 'No qualifications'

    
# Qualification label for every row, derived from its 'placing'
qualification_labels = statsDataFrame['placing'].apply(determine_qualification)

# Discard a 'qualifications' column left over from an earlier run of this cell
if 'qualifications' in statsDataFrame.columns:
    statsDataFrame.drop(columns='qualifications', inplace=True)

# Place 'qualifications' immediately to the right of 'placing'
placement_index = statsDataFrame.columns.get_loc('placing')
statsDataFrame.insert(placement_index + 1, 'qualifications', qualification_labels)

## Adding of new column "presence" to the statsDataFrame

In [None]:
# Discard a 'presence' column left over from an earlier run of this cell
if 'presence' in statsDataFrame.columns:
    statsDataFrame.drop(columns='presence', inplace=True)

# Mark every existing row as 'Present', in a column placed immediately to the
# right of 'team'
team_index = statsDataFrame.columns.get_loc('team')
statsDataFrame.insert(team_index + 1, 'presence', 'Present')

In [None]:
#statsDataFrame.to_csv('test.csv')

## Adding of new entries for teams that were absent throughout the 2006-2018 seasons

In [None]:
# Every club and every season that occurs anywhere in the data
number_of_unique_clubs = statsDataFrame['team'].nunique()
all_clubs = set(statsDataFrame['team'].unique())
unique_seasons = statsDataFrame['season'].unique()

# Season -> set of clubs that have no row for that season
missing_clubs_by_season = {}

for season in unique_seasons:
    # Clubs that do have a row in this season
    clubs_in_season = set(statsDataFrame[statsDataFrame['season'] == season]['team'])

    # Everything else is missing from this season
    missing_clubs = all_clubs - clubs_in_season
    missing_clubs_by_season[season] = missing_clubs

    if missing_clubs:
        # sorted(): set iteration order can differ between interpreter runs
        print(f"Missing clubs in {season}: {', '.join(sorted(missing_clubs))}")
    else:
        print(f"All clubs were present in {season}.")


###########################################################################

# Every column except the identifying ones gets a placeholder value of zero
stats_columns = [col for col in statsDataFrame.columns if col not in ['team', 'season', 'presence']]
default_values = {col: 0 for col in stats_columns}

# One placeholder row per (season, missing club). The clubs are visited in
# sorted order so that the resulting row order is the same on every run.
new_rows = []
for season, clubs in missing_clubs_by_season.items():
    for club in sorted(clubs):
        new_row = {'team': club, 'season': season, 'presence': 'Absent'}
        new_row.update(default_values)
        new_rows.append(new_row)

# Append the placeholder rows to the existing statsDataFrame
new_entries_df = pd.DataFrame(new_rows)
statsDataFrame = pd.concat([statsDataFrame, new_entries_df], ignore_index=True)

###################################################################

# Helper column that orders 'Present' rows before 'Absent' rows
statsDataFrame['presence_order'] = statsDataFrame['presence'].map({'Present': 1, 'Absent': 2})  # Smaller numbers sort first
# Order by season, and within a season by presence
statsDataFrame.sort_values(by=['season', 'presence_order'], ascending=[True, True], inplace=True)
# Renumber the rows after sorting
statsDataFrame.reset_index(drop=True, inplace=True)
# The helper column is only needed for the sort
statsDataFrame.drop('presence_order', axis=1, inplace=True)

## Fill up rows with "absent" status with NaN values for their stats

In [None]:
# Every column other than the identifying ones holds a statistic
identifying_columns = ['team', 'season', 'presence']
stats_columns = [col for col in statsDataFrame.columns if col not in identifying_columns]

# Blank out the statistics of the rows marked 'Absent'
absent_rows = statsDataFrame['presence'] == 'Absent'
statsDataFrame.loc[absent_rows, stats_columns] = np.nan

statsDataFrame.head(40)
statsDataFrame.to_csv('abc.csv')