In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.dummy import DummyRegressor

from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.compose import TransformedTargetRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Load the match table; the path is resolved relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="Wide_WithAreaDensity_V5.csv")

## Duplicating: adding a mirrored copy of each match with Team1 and Team2 swapped

In [3]:
# Build a mirrored copy of every match with the two sides swapped: each column
# ending in "1" trades places with its "2" counterpart, while 'Date/Time' and
# 'Year' are copied unchanged.
paired_stems = ['Team', 'Score', 'nb_matches', 'goals_past', 'ratio_goals_past',
                'nb_wins', 'ratio_wins', 'nb_losses', 'ratio_losses', 'nb_draws',
                'ratio_draws', 'nb_wins_opp', 'ratio_wins_opp', 'nb_losses_opp',
                'ratio_losses_opp', 'nb_draws_opp', 'ratio_draws_opp', 'Ranks',
                'Points', 'Population', 'Surface', 'Density']
shared_cols = ['Date/Time', 'Year']

# Deriving the mapping from the stems avoids maintaining two hand-typed lists
# that must stay in exactly matching order.
swap_suffix = {}
for stem in paired_stems:
    swap_suffix[stem + '1'] = stem + '2'
    swap_suffix[stem + '2'] = stem + '1'

data_dup = data[list(swap_suffix) + shared_cols].rename(columns=swap_suffix)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# aligns the two frames on column names the same way, so the mirrored rows are
# stacked below the originals under the column order of `data`.
data = pd.concat([data, data_dup], ignore_index=True)

## Split into Train and Test

In [4]:
# Hold-out by date: rows whose "Date/Time" string compares >= '2018' form the
# test set, rows comparing < '2018' the training set. Both get a fresh 0..n-1 index.
df_train = data[data["Date/Time"].lt('2018')].reset_index(drop=True)
df_test = data[data["Date/Time"].ge('2018')].reset_index(drop=True)

## Get Target Vector

In [5]:
# Training target: Team1's share of the goals scored in the match, with 0 used
# when the two scores sum to zero (the share would otherwise be 0/0).
y_train = []
for goals1, goals2 in zip(df_train['Score1'], df_train['Score2']):
    total_goals = goals1 + goals2
    y_train.append(0 if total_goals == 0 else goals1 / total_goals)

# Test target: outcome label seen from Team1's side.
#   1 -> Score1 > Score2, 2 -> Score1 < Score2, 0 -> equal scores.
y_test = []
for goals1, goals2 in zip(df_test['Score1'], df_test['Score2']):
    if goals1 > goals2:
        y_test.append(1)
    elif goals1 < goals2:
        y_test.append(2)
    elif goals1 == goals2:
        y_test.append(0)

## Deleting Useless Columns: Getting X

In [6]:
# Keep only the model inputs: team names, raw scores and the date fields are dropped
# from both the training and the test frame.
X_train, X_test = (
    frame.drop(columns=['Team1', 'Team2', 'Score1', 'Score2', 'Date/Time', 'Year'])
    for frame in (df_train, df_test)
)

In [7]:
# Display the column labels of `data` as it stands after the mirrored rows were appended.
data.columns

Index(['Team1', 'Team2', 'Score1', 'Score2', 'Date/Time', 'nb_matches1',
       'nb_matches2', 'goals_past1', 'goals_past2', 'ratio_goals_past1',
       'ratio_goals_past2', 'nb_wins1', 'nb_wins2', 'ratio_wins1',
       'ratio_wins2', 'nb_losses1', 'nb_losses2', 'ratio_losses1',
       'ratio_losses2', 'nb_draws1', 'nb_draws2', 'ratio_draws1',
       'ratio_draws2', 'nb_wins_opp1', 'nb_wins_opp2', 'ratio_wins_opp1',
       'ratio_wins_opp2', 'nb_losses_opp1', 'nb_losses_opp2',
       'ratio_losses_opp1', 'ratio_losses_opp2', 'nb_draws_opp1',
       'nb_draws_opp2', 'ratio_draws_opp1', 'ratio_draws_opp2', 'Ranks1',
       'Ranks2', 'Points1', 'Points2', 'Year', 'Population1', 'Population2',
       'Surface1', 'Surface2', 'Density1', 'Density2'],
      dtype='object')

## 2-class Classification: Removing the observations corresponding to Draw

In [8]:
def reg_to_class(y, threshold=0.03):
    """Turn mirrored regression outputs into outcome labels.

    The input is read as two halves of equal length: element ``i`` of the first
    half is compared with element ``i + len(y) // 2``. If ``len(y)`` is odd the
    last element is ignored.

    Parameters
    ----------
    y : sequence of numbers
        Values to classify; anything supporting ``len`` and integer indexing.
    threshold : float, default 0.03
        A pair whose absolute difference is strictly below this value is
        labelled 0. The default reproduces the previously hard-coded value.

    Returns
    -------
    list of int
        Labels for the first half followed by labels for the second half.
        First-half labels are 0 (difference below ``threshold``), 1 (first
        value larger) or 2 (first value smaller); the second half repeats them
        with 1 and 2 exchanged.
    """
    half = len(y) // 2

    first_half = []
    for i in range(half):
        own, mirrored = y[i], y[i + half]
        if abs(own - mirrored) < threshold:
            first_half.append(0)
        elif own > mirrored:
            first_half.append(1)
        elif own < mirrored:
            first_half.append(2)
        # No else on purpose: a pair that satisfies none of the comparisons
        # (e.g. NaN) contributes no label, exactly as before.

    # The mirrored row sees the same match from the other side: 1 <-> 2, 0 stays.
    flipped = {0: 0, 1: 2, 2: 1}
    second_half = [flipped[label] for label in first_half]

    return first_half + second_half

In [9]:
# Classify the training targets once. reg_to_class walks the whole list, so
# calling it inside the loop repeated that full pass for every row.
train_labels = reg_to_class(y_train)

# Positions labelled 0 are the rows to remove.
l_del = [i for i, label in enumerate(train_labels) if label == 0]

# X_train still carries the 0..n-1 index produced when the split reset it, so
# these positions are also its row labels. reset_index returns a new frame; the
# result is assigned here because calling it without assignment has no effect.
X_train = X_train.drop(l_del, axis=0).reset_index(drop=True)

# Remove the same positions from the target list; a set keeps each membership
# test constant-time instead of scanning l_del.
rows_to_drop = set(l_del)
y_train2 = [target for i, target in enumerate(y_train) if i not in rows_to_drop]
y_train = y_train2

In [10]:
# Classify the test targets once instead of once per row. y_test holds outcome
# labels (0/1/2) rather than scores; reg_to_class still returns 0 exactly for
# the pairs whose two labels are equal, which is how draws appear here.
test_labels = reg_to_class(y_test)

# Positions labelled 0 are the rows to remove.
l_del = [i for i, label in enumerate(test_labels) if label == 0]

# X_test still carries the 0..n-1 index produced when the split reset it, so
# these positions are also its row labels. reset_index returns a new frame; the
# result is assigned here because calling it without assignment has no effect.
X_test = X_test.drop(l_del, axis=0).reset_index(drop=True)

# Remove the same positions from the label list; a set keeps each membership
# test constant-time instead of scanning l_del.
rows_to_drop = set(l_del)
y_test2 = [label for i, label in enumerate(y_test) if i not in rows_to_drop]
y_test = y_test2

In [11]:
# Keep the feature names now: the scaling cell reassigns X_train, and these
# labels are needed later to print the feature importances.
cols = X_train.columns

## Scaling

In [12]:
# Fit the scaler on the training rows only, then apply that same fitted
# transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  This is separate from the ipykernel package so we can avoid doing imports until


In [13]:
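## To try the 2-class classification instead, uncomment this cell: it redefines reg_to_class without the draw class (a first-half value that is greater than or equal to its mirrored value counts as class 1).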

# def reg_to_class (y):
#     y_class = []
    
#     for i in range(len(y)//2):
#         if y[i] >= y[i+len(y)//2]:
#             y_class.append(1)
#         elif y[i] < y[i+len(y)//2]:
#             y_class.append(2)
            
#     y_class2 = []
#     for i in y_class:
#         if i == 1:
#             y_class2.append(2)
#         elif i == 2:
#             y_class2.append(1)
        
#     return(y_class+y_class2)

## Trying out Regression Algorithms

In [14]:
# Baseline: a regressor that ignores the features. Its predictions are mapped to
# outcome labels and scored against the test labels.
model = DummyRegressor().fit(X_train, y_train)
accuracy_score(y_true=y_test, y_pred=reg_to_class(model.predict(X_test)))

0.0

In [15]:
# Per-class precision / recall / F1 for the model fitted in the previous cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=reg_to_class(model.predict(X_test)),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00       503
           2       0.00      0.00      0.00       503

   micro avg       0.00      0.00      0.00      1006
   macro avg       0.00      0.00      0.00      1006
weighted avg       0.00      0.00      0.00      1006



  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [16]:
# Multi-layer perceptron regressor. The weight initialisation is random, so the
# seed is fixed to make the score reproducible across kernel restarts.
# NOTE: the output stored with this cell came from an unseeded run and will
# differ slightly after re-running.
model = MLPRegressor(random_state=0)
model.fit(X_train, y_train)

# Predictions are mapped to outcome labels before scoring; y_true comes first to
# match scikit-learn's metric signature (and the classification_report calls).
accuracy_score(y_test, reg_to_class(model.predict(X_test)))

0.562624254473161

In [17]:
# Per-class precision / recall / F1 for the model fitted in the previous cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=reg_to_class(model.predict(X_test)),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.62      0.56      0.59       503
           2       0.62      0.56      0.59       503

   micro avg       0.56      0.56      0.56      1006
   macro avg       0.41      0.38      0.39      1006
weighted avg       0.62      0.56      0.59      1006



  'recall', 'true', average, warn_for)


In [18]:
# Gradient-boosted trees. The seed is fixed so the fitted model is the same on
# every run, matching the random_state=0 used by the threshold-sweep cells at
# the end of the notebook.
# NOTE: the output stored with this cell came from an unseeded run and may
# differ slightly after re-running.
model = GradientBoostingRegressor(random_state=0)
model.fit(X_train, y_train)

# Predictions are mapped to outcome labels before scoring; y_true comes first to
# match scikit-learn's metric signature (and the classification_report calls).
accuracy_score(y_test, reg_to_class(model.predict(X_test)))

0.6163021868787276

In [19]:
# Per-class precision / recall / F1 for the model fitted in the previous cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=reg_to_class(model.predict(X_test)),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.67      0.62      0.64       503
           2       0.67      0.62      0.64       503

   micro avg       0.62      0.62      0.62      1006
   macro avg       0.45      0.41      0.43      1006
weighted avg       0.67      0.62      0.64      1006



  'recall', 'true', average, warn_for)


In [20]:
# Random forest. Bootstrap sampling and feature selection are random, so the
# seed is fixed to make both the score and the feature importances printed
# further down reproducible.
# NOTE: the outputs stored with these cells came from an unseeded run and will
# differ slightly after re-running.
model = RandomForestRegressor(random_state=0)
model.fit(X_train, y_train)

# Predictions are mapped to outcome labels before scoring; y_true comes first to
# match scikit-learn's metric signature (and the classification_report calls).
accuracy_score(y_test, reg_to_class(model.predict(X_test)))



0.5725646123260437

In [21]:
# Per-class precision / recall / F1 for the model fitted in the previous cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=reg_to_class(model.predict(X_test)),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.64      0.57      0.60       503
           2       0.64      0.57      0.60       503

   micro avg       0.57      0.57      0.57      1006
   macro avg       0.43      0.38      0.40      1006
weighted avg       0.64      0.57      0.60      1006



  'recall', 'true', average, warn_for)


In [22]:
# Print each feature name next to the importance the current model assigns to it.
for feature_name, importance in zip(cols, model.feature_importances_):
    print(feature_name, ' --> ', importance)

nb_matches1  -->  0.021914959040027458
nb_matches2  -->  0.024952730512294415
goals_past1  -->  0.02702473170123937
goals_past2  -->  0.028255470871859468
ratio_goals_past1  -->  0.05003144338177383
ratio_goals_past2  -->  0.056887490273537886
nb_wins1  -->  0.007760844221412017
nb_wins2  -->  0.007920731678405154
ratio_wins1  -->  0.03075102188173474
ratio_wins2  -->  0.029609859738588967
nb_losses1  -->  0.008253885942685406
nb_losses2  -->  0.007471461481307944
ratio_losses1  -->  0.03240188322399161
ratio_losses2  -->  0.032115803138667263
nb_draws1  -->  0.012501462015065254
nb_draws2  -->  0.0133389411274314
ratio_draws1  -->  0.0316082649705399
ratio_draws2  -->  0.02839804370587932
nb_wins_opp1  -->  0.0020628551072736545
nb_wins_opp2  -->  0.0018424523208705281
ratio_wins_opp1  -->  0.002614061349884087
ratio_wins_opp2  -->  0.0022335614916429818
nb_losses_opp1  -->  0.001409279513063175
nb_losses_opp2  -->  0.0021964319517308144
ratio_losses_opp1  -->  0.0026719676118895355
r

In [23]:
# AdaBoost. Each boosting round resamples the training set at random, so the
# seed is fixed to make the score reproducible.
# NOTE: the output stored with this cell came from an unseeded run and will
# differ slightly after re-running.
model = AdaBoostRegressor(random_state=0)
model.fit(X_train, y_train)

# Predictions are mapped to outcome labels before scoring; y_true comes first to
# match scikit-learn's metric signature (and the classification_report calls).
accuracy_score(y_test, reg_to_class(model.predict(X_test)))

0.536779324055666

In [24]:
# Per-class precision / recall / F1 for the model fitted in the previous cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=reg_to_class(model.predict(X_test)),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.67      0.54      0.60       503
           2       0.67      0.54      0.60       503

   micro avg       0.54      0.54      0.54      1006
   macro avg       0.45      0.36      0.40      1006
weighted avg       0.67      0.54      0.60      1006



  'recall', 'true', average, warn_for)


In [25]:
# Bagging ensemble. Every base estimator is trained on a random bootstrap
# sample, so the seed is fixed to make the score reproducible.
# NOTE: the output stored with this cell came from an unseeded run and will
# differ slightly after re-running.
model = BaggingRegressor(random_state=0)
model.fit(X_train, y_train)

# Predictions are mapped to outcome labels before scoring; y_true comes first to
# match scikit-learn's metric signature (and the classification_report calls).
accuracy_score(y_test, reg_to_class(model.predict(X_test)))

0.5685884691848907

In [26]:
# Per-class precision / recall / F1 for the model fitted in the previous cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=reg_to_class(model.predict(X_test)),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.61      0.57      0.59       503
           2       0.61      0.57      0.59       503

   micro avg       0.57      0.57      0.57      1006
   macro avg       0.41      0.38      0.39      1006
weighted avg       0.61      0.57      0.59      1006



  'recall', 'true', average, warn_for)


In [27]:
# TransformedTargetRegressor with all-default arguments: fit it, map its
# predictions to outcome labels and score them against the test labels.
model = TransformedTargetRegressor().fit(X_train, y_train)
accuracy_score(y_true=y_test, y_pred=reg_to_class(model.predict(X_test)))

0.6202783300198808

In [28]:
# Per-class precision / recall / F1 for the model fitted in the previous cell.
print(
    classification_report(
        y_true=y_test,
        y_pred=reg_to_class(model.predict(X_test)),
    )
)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.68      0.62      0.65       503
           2       0.68      0.62      0.65       503

   micro avg       0.62      0.62      0.62      1006
   macro avg       0.45      0.41      0.43      1006
weighted avg       0.68      0.62      0.65      1006



  'recall', 'true', average, warn_for)


In [29]:
# model = GradientBoostingRegressor(random_state=0)
# model.fit(X_train, y_train)
# T = [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1,0.11,0.12,0.13,0.14,0.15,0.16,0.17,0.18,0.19,0.2]
# R = []
# for t in T:
#     R.append(accuracy_score(reg_to_class(model.predict(X_test),t),y_test))

In [30]:
# plt.plot(T,R)
# plt.xlabel("Threshold")
# plt.ylabel("Accuracy of the Gradient Boosting")

In [31]:
# model = GradientBoostingRegressor(random_state=0)
# model.fit(X_train, y_train)
# T = [0.01,0.015,0.02,0.025,0.03,0.035,0.04]
# R = []
# for t in T:
#     R.append(accuracy_score(reg_to_class(model.predict(X_test),t),y_test))

In [32]:
# plt.plot(T,R)
# plt.xlabel("Threshold")
# plt.ylabel("Accuracy of the Gradient Boosting")

In [33]:
# print(classification_report(y_test, reg_to_class(model.predict(X_test),0.03)))