In [1]:
import pandas as pd,numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score,cohen_kappa_score,confusion_matrix,recall_score,precision_score
from sklearn.linear_model import LogisticRegression
import plotly.express as px
from datetime import datetime
import time, random
import xgboost as xgb
#Set Seed
random.seed(69)
#Read dataset
train = pd.read_csv("/Users/jojoel/Google Drive/Github/bet369/Datasets/train2.csv", header = 0)
#Remove columns with lots of missing values
del train["BettingCuoteHT"]
del train["BettingCuoteFT"]
del train["HostGoalsFT"]
del train["GuestGoalsFT"]
#Filter matches with HT
train = train[train['Status']== "HT"]
train = train.drop_duplicates(subset = "MatchID")
#Remove last row wich are the features again
train = train.iloc[:-1]
#Remove matches with no posession
train = train.loc[(train['HostPossessionFT'] != "0") & (train['GuestPossessionFT'] != "0")]
# Indicies of each class' observations
i_class0 = np.where(train["Goal"] == "0")[0]
i_class1 = np.where(train["Goal"] == "1")[0]

#Get size of the underrepresented class
n_class0 = len(i_class0)

#Create new train with classes equally represented
i_class1_downsampled = np.random.choice(i_class1, size=n_class0, replace=False)
train = train.iloc[i_class0,:].append(train.iloc[i_class1_downsampled])
#Feature-engineering
X = train.iloc[:,12:32]
stand = MinMaxScaler().fit(X)
X = pd.DataFrame(stand.transform(X))
pca = PCA(0.99)
pca.fit(X)
pcX = pca.transform(X)
y = train["Goal"]
#Transform variables to numeric
for i in X.columns:
    X[i] = pd.to_numeric(X[i])
y = pd.to_numeric(y)
#Train-test split
X_train, X_test, y_train, y_test = train_test_split(pcX,y,train_size = 0.7)
#Models
rf = RandomForestClassifier(n_estimators = 200)
rf.fit(X_train, y_train)
lr = LogisticRegression()
lr.fit(X_train, y_train)
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}
xgb = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1)
xgb.fit(X_train,y_train)
#Predictions
rfpreds = rf.predict(X_test)
lrpreds = lr.predict(X_test)
xgbpreds = xgb.predict(X_test)
#Probabilities
rfprobs = rf.predict_proba(X_test)[:,1]
lrprobs = lr.predict_proba(X_test)[:,1]
xgbprobs = xgb.predict_proba(X_test)[:,1]
#Results
print("Metric   "," RF "," LR ","XGB")
print("Accuracy ", round(accuracy_score(rfpreds,y_test),2), round(accuracy_score(lrpreds,y_test),2), 
      round(accuracy_score(xgbpreds,y_test),2))
print("Kappa    ",round(cohen_kappa_score(rfpreds,y_test),2),round(cohen_kappa_score(lrpreds,y_test),2),
      round(cohen_kappa_score(xgbpreds,y_test),2))
print("Precision",round(precision_score(rfpreds,y_test),2),round(precision_score(lrpreds,y_test),2),
      round(precision_score(xgbpreds,y_test),2))
print("Recall   ", round(recall_score(rfpreds,y_test),2),round(recall_score(lrpreds,y_test),2),
      round(recall_score(xgbpreds,y_test),2))




The default of the `iid` parameter will change from True to False in version 0.22 and will be removed in 0.24. This will change numeric results when test-set sizes are unequal.



Metric     RF   LR  XGB
Accuracy  0.62 0.69 0.67
Kappa     0.24 0.38 0.35
Precision 0.71 0.73 0.71
Recall    0.58 0.65 0.64
