In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

In [2]:
# Location of the combined EPL season-summary CSV (relative to the notebook)
path = "EPL_all_seasons_sum.csv"

# Read the CSV into a DataFrame and preview its first five rows
EPL_df = pd.read_csv(filepath_or_buffer=path)
EPL_df.head(5)

Unnamed: 0.1,Unnamed: 0,Season,League Rank,Squad,# of Players,Avg Age,Avg Poss,MP,W,D,...,GD,Pts,Pts/MP,xG,xGA,xGD,xGD/90,Attendance,Annual Wages ($),League
0,0,2017-2018,1,Manchester City,32,26.4,71.0,38,32,4,...,79,100,2.63,78.6,23.8,54.8,1.44,54070,184103578.0,EPL
1,1,2017-2018,2,Manchester Utd,35,26.7,54.7,38,25,6,...,40,81,2.13,55.7,40.7,15.0,0.4,74976,201934520.0,EPL
2,2,2017-2018,3,Tottenham,34,25.8,61.8,38,23,8,...,38,77,2.03,64.7,33.9,30.8,0.81,67953,97579832.0,EPL
3,3,2017-2018,4,Liverpool,34,25.6,60.3,38,21,12,...,46,75,1.97,72.9,33.8,39.1,1.03,53049,116263857.0,EPL
4,4,2017-2018,5,Chelsea,45,26.7,55.6,38,21,7,...,24,70,1.84,54.4,33.8,20.6,0.54,41282,147846240.0,EPL


In [3]:
# List the column labels of the loaded DataFrame
EPL_df.columns

Index(['Unnamed: 0', 'Season', 'League Rank', 'Squad', '# of Players',
       'Avg Age', 'Avg Poss', 'MP', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Pts',
       'Pts/MP', 'xG', 'xGA', 'xGD', 'xGD/90', 'Attendance',
       'Annual Wages ($)', 'League'],
      dtype='object')

In [4]:
# Group the season rows by team, keyed on the 'Squad' column
squad_groupby = EPL_df.groupby(by=['Squad'])

In [5]:
# Average each stat per squad over all of that squad's rows in the grouped data.
# The means are computed in a single pass and then split into one Series per stat.
_stat_columns = [
    'Pts', 'MP', 'W', 'D', 'L', 'Avg Poss',
    'GF', 'GA', 'GD', 'xG', 'xGA', 'xGD',
    'Annual Wages ($)',
]
_squad_means = squad_groupby[_stat_columns].mean()

# Results table: points, matches played, wins / draws / losses
avg_pts = _squad_means['Pts']
avg_mp = _squad_means['MP']
avg_wins = _squad_means['W']
avg_draws = _squad_means['D']
avg_losses = _squad_means['L']

# Possession
avg_poss = _squad_means['Avg Poss']

# Goals for / against / differential
avg_GF = _squad_means['GF']
avg_GA = _squad_means['GA']
avg_GD = _squad_means['GD']

# Expected goals for / against / differential
avg_xG = _squad_means['xG']
avg_xGA = _squad_means['xGA']
avg_xGD = _squad_means['xGD']

# Wage bill
avg_wages = _squad_means['Annual Wages ($)']

In [6]:
# Collect the per-squad averages under their summary column names
EPL_sum = dict(
    Matches_Played=avg_mp,
    Wins=avg_wins,
    Draws=avg_draws,
    Losses=avg_losses,
    Points=avg_pts,
    Goal_Differential=avg_GD,
    xG_Differential=avg_xGD,
    Possession=avg_poss,
    Wages=avg_wages,
)

# Build the summary DataFrame (one row per squad) and display it
EPL_sum_df = pd.DataFrame(data=EPL_sum)

EPL_sum_df

Unnamed: 0_level_0,Matches_Played,Wins,Draws,Losses,Points,Goal_Differential,xG_Differential,Possession,Wages
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Arsenal,38.0,18.8,7.4,11.8,63.8,16.4,8.36,55.92,163806000.0
Aston Villa,38.0,12.666667,7.0,18.333333,45.0,-6.333333,-9.0,46.366667,74838600.0
Bournemouth,38.0,11.0,8.0,19.0,41.0,-18.333333,-13.933333,46.366667,55941090.0
Brentford,38.0,13.0,7.0,18.0,46.0,-8.0,-2.7,44.8,25677630.0
Brighton,38.0,9.6,13.0,15.4,41.8,-13.6,-5.64,48.76,55778390.0
Burnley,38.0,11.4,10.2,16.4,44.4,-14.8,-14.36,41.82,48016070.0
Cardiff City,38.0,10.0,4.0,24.0,34.0,-35.0,-19.1,35.4,40090150.0
Chelsea,38.0,20.4,8.6,9.0,69.8,25.6,27.14,60.32,175604600.0
Crystal Palace,38.0,11.8,10.2,16.0,45.6,-10.4,-7.4,45.38,92044990.0
Everton,38.0,13.8,8.6,15.6,50.0,-8.4,-4.82,46.48,107874700.0


In [7]:
# Split the summary table into the target (y) and the feature matrix (X)

# Target: the average 'Points' column
y = EPL_sum_df.loc[:, 'Points']

# Features: every remaining column
# NOTE(review): in the rows displayed above, Points equals 3 * Wins + Draws and
# Matches_Played is 38.0 throughout, so Wins/Draws/Losses leak the target into
# the features - confirm whether they should stay in X.
X = EPL_sum_df.drop(labels='Points', axis=1)

In [8]:
# Review the y variable Series (the 'Points' column of EPL_sum_df, indexed by Squad)
y

Squad
Arsenal            63.8
Aston Villa        45.0
Bournemouth        41.0
Brentford          46.0
Brighton           41.8
Burnley            44.4
Cardiff City       34.0
Chelsea            69.8
Crystal Palace     45.6
Everton            50.0
Fulham             27.0
Huddersfield       26.5
Leeds United       48.5
Leicester City     55.8
Liverpool          86.4
Manchester City    91.6
Manchester Utd     69.0
Newcastle Utd      45.4
Norwich City       21.5
Sheffield Utd      38.5
Southampton        42.0
Stoke City         33.0
Swansea City       33.0
Tottenham          68.0
Watford            37.0
West Brom          28.5
West Ham           50.8
Wolves             53.0
Name: Points, dtype: float64

In [9]:
# Review the X variable DataFrame (every EPL_sum_df column except 'Points')
X

Unnamed: 0_level_0,Matches_Played,Wins,Draws,Losses,Goal_Differential,xG_Differential,Possession,Wages
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Arsenal,38.0,18.8,7.4,11.8,16.4,8.36,55.92,163806000.0
Aston Villa,38.0,12.666667,7.0,18.333333,-6.333333,-9.0,46.366667,74838600.0
Bournemouth,38.0,11.0,8.0,19.0,-18.333333,-13.933333,46.366667,55941090.0
Brentford,38.0,13.0,7.0,18.0,-8.0,-2.7,44.8,25677630.0
Brighton,38.0,9.6,13.0,15.4,-13.6,-5.64,48.76,55778390.0
Burnley,38.0,11.4,10.2,16.4,-14.8,-14.36,41.82,48016070.0
Cardiff City,38.0,10.0,4.0,24.0,-35.0,-19.1,35.4,40090150.0
Chelsea,38.0,20.4,8.6,9.0,25.6,27.14,60.32,175604600.0
Crystal Palace,38.0,11.8,10.2,16.0,-10.4,-7.4,45.38,92044990.0
Everton,38.0,13.8,8.6,15.6,-8.4,-4.82,46.48,107874700.0


In [10]:
# Check the balance of our target values
# NOTE(review): y holds per-squad averages (float64), so nearly every value is
# expected to appear only once; these counts do not describe class balance.
# A continuous target like this is usually summarised instead (e.g. y.describe()).
y.value_counts()

33.0    2
63.8    1
86.4    1
50.8    1
28.5    1
37.0    1
68.0    1
42.0    1
38.5    1
21.5    1
45.4    1
69.0    1
91.6    1
55.8    1
45.0    1
48.5    1
26.5    1
27.0    1
50.0    1
45.6    1
69.8    1
34.0    1
44.4    1
41.8    1
46.0    1
41.0    1
53.0    1
Name: Points, dtype: int64

In [11]:
# Import the train_test_split helper from scikit-learn
from sklearn.model_selection import train_test_split

# Partition features and target into training and testing subsets.
# random_state is fixed at 1 so the shuffle is reproducible between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

In [12]:
# Import the LinearRegression estimator from scikit-learn
from sklearn.linear_model import LinearRegression

# The target `y` (average Points per squad) is a continuous float64 Series, so
# this is a regression problem. LogisticRegression is a classifier and rejected
# it with "ValueError: Unknown label type: 'continuous'".
# LinearRegression has no random_state parameter, so none is passed here.
model = LinearRegression()

# Fit the model using training data
model.fit(X_train, y_train)