In [1]:
# Models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import pickle

# Data
import pandas as pd
import numpy as np

# Dataviz
import plotly.express as px

# ML
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

Training-Test set

In [2]:
data = pd.read_csv('dataset.csv')
X = data.drop('win', axis=1)
y = data.win

In [4]:
data.xG

0      -0.004728
1       0.004728
2       0.394395
3      -0.394395
4       2.561767
          ...   
5329   -1.073942
5330    1.432983
5331   -1.432983
5332    0.552426
5333   -0.552426
Name: xG, Length: 5334, dtype: float64

In [8]:
# Split your data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_test_scaled = std_scaler.fit_transform(X_test)

# SVM

In [32]:
# Create the SVM model
# Using a linear kernel for simplicity
SVM_clf = svm.SVC(kernel='linear')

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
SVM_scores = cross_val_score(SVM_clf, X_train_scaled, y_train, cv=5, scoring='accuracy', n_jobs=-1)

# Print the cross-validation scores
print("Cross-validation scores:", SVM_scores)
print("Mean cross-validation score:", np.mean(SVM_scores))

# Train the model on the entire training set
SVM_clf.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
SVM_pred = SVM_clf.predict(X_test_scaled)
print("Accuracy on test set:", accuracy_score(y_test, SVM_pred))
print(classification_report(y_test, SVM_pred))
print(confusion_matrix(y_test, SVM_pred))

Cross-validation scores: [0.95673403 0.9525941  0.95591726 0.95279756 0.94967786]
Mean cross-validation score: 0.9535441609490733
Accuracy on test set: 0.9543728298611112
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     11643
           1       0.95      0.93      0.94      6789

    accuracy                           0.95     18432
   macro avg       0.95      0.95      0.95     18432
weighted avg       0.95      0.95      0.95     18432

[[11287   356]
 [  485  6304]]


In [33]:
# Your coefficients array
coefficients = SVM_clf.coef_[0]

# Example column names, replace these with your actual column names
column_names = X_train.columns

# Ensure the lengths match
assert len(coefficients) == len(column_names), "Coefficients and column names must have the same length."

# Sort coefficients and column names by the coefficients
sorted_indices = np.argsort((coefficients))
sorted_coefficients = coefficients[sorted_indices]
sorted_column_names = [column_names[i] for i in sorted_indices]

# Plot using Plotly Express
fig = px.bar(x=sorted_column_names, y=sorted_coefficients)
fig.show()

# LOGISTIC

In [10]:
# Create the Logistic Regression model
LOG_clf = LogisticRegression(max_iter=1000)

# Perform cross-validation
# Here, we use 5-fold cross-validation, but you can adjust the number of folds as needed
LOG_scores = cross_val_score(LOG_clf, X_train_scaled, y_train, cv=5, scoring='accuracy', n_jobs=8)

# Print the cross-validation scores
print("Cross-validation scores:", LOG_scores)
print("Mean cross-validation score:", np.mean(LOG_scores))

# Train the model on the entire training set
LOG_clf.fit(X_train_scaled, y_train)

# Evaluate the model on the test set
LOG_pred = LOG_clf.predict(X_test_scaled)
print("Accuracy on test set:", accuracy_score(y_test, LOG_pred))
print(classification_report(y_test, LOG_pred))
print(confusion_matrix(y_test, LOG_pred))


Cross-validation scores: [0.95442832 0.95103425 0.9557138  0.95266192 0.94574432]
Mean cross-validation score: 0.9519165218267949
Accuracy on test set: 0.9530707465277778
              precision    recall  f1-score   support

           0       0.96      0.97      0.96     11643
           1       0.95      0.92      0.94      6789

    accuracy                           0.95     18432
   macro avg       0.95      0.95      0.95     18432
weighted avg       0.95      0.95      0.95     18432

[[11293   350]
 [  515  6274]]


In [11]:
# Your coefficients array
coefficients = LOG_clf.coef_[0]

# Example column names, replace these with your actual column names
column_names = X_train.columns

# Ensure the lengths match
assert len(coefficients) == len(column_names), "Coefficients and column names must have the same length."

# Sort coefficients and column names by the coefficients
sorted_indices = np.argsort((coefficients))
sorted_coefficients = coefficients[sorted_indices]
sorted_column_names = [column_names[i] for i in sorted_indices]

# Plot using Plotly Express
fig = px.bar(x=sorted_column_names, y=sorted_coefficients)
fig.show()

In [12]:
# Pickling the LOG_clf model
with open('logistic.pkl', 'wb') as file:
    pickle.dump(LOG_clf, file)

# Pickling the SVM_clf model
# with open('SVM.pkl', 'wb') as file:
#     pickle.dump(SVM_clf, file)