# Support Vector Classifier (SVC)
## Josiah's Traditional ML Comparison Model

In [None]:
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# for SVC Model
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.svm import SVC

## Extract Premier League Soccer Data

In [None]:
# Load the match-results CSV (path is relative to the working directory).
# Later cells narrow this frame to Division == 'E0' rows and drop draws.
premier_results = pd.read_csv("premier_data/premier_matches.csv")

#### Clean/Encode Data!

In [None]:
# Columns removed before modelling (betting-odds variants and the match
# time); none of them is selected as a model feature later on.
#
# Intentionally NOT dropped for now:
#   "C_LTH", "C_LTA", "C_VHD", "C_VAD", "C_PHB", "C_HTB"  -> undecided
#   "OddHome", "OddDraw", "OddAway"                        -> usable as features
some_cols_to_drop = [
    "MaxOver25",
    "MaxUnder25",
    "HandiSize",
    "HandiHome",
    "HandiAway",
    "MaxHome",
    "MaxDraw",
    "MaxAway",
    "Over25",
    "Under25",
    "MatchTime",
]
premier_results = premier_results.drop(some_cols_to_drop, axis=1)

In [None]:
# Show every column whenever a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Keep only Premier League rows (Division code 'E0'). The .copy() gives an
# independent frame so the MatchDate assignment below is safe.
premier_results = premier_results[premier_results['Division'] == 'E0'].copy()

# Reduce MatchDate to its calendar year
premier_results['MatchDate'] = pd.to_datetime(premier_results['MatchDate']).dt.year

# Keep the 20 seasons' worth of years 2005-2024 (both bounds inclusive) and
# drop draws so the target becomes binary (home win vs. away win).
in_window = premier_results['MatchDate'].between(2005, 2024)
decisive = premier_results['FTResult'] != 'D'
premier_results = premier_results[in_window & decisive].copy()

premier_results.head(15)

In [None]:
print(f"W/L ROWS: {premier_results.shape[0]}")

# Encode the full-time result as the binary target:
#   H (Home Win) = 1
#   A (Away Win) = 0
# Any other value would become NaN (draws were already removed above).
# Open question from the original author: should away wins be weighted higher?
premier_results['FTResult'] = premier_results['FTResult'].map({'H': 1, 'A': 0})

premier_results.head(20)

In [None]:
# Drop any row that is missing a value in a column the model uses as a
# feature. SVC cannot be fit on NaN, so the form columns are checked in
# addition to Odds and Elo.
cols_to_check = [
    'HomeElo', 'AwayElo', 'OddHome', 'OddAway',
    'Form3Home', 'Form5Home', 'Form3Away', 'Form5Away',
]
premier_results = premier_results.dropna(subset=cols_to_check)

In [None]:
# Report the number of rows, then hold out 20% of them as the test set.
# The fixed random_state keeps the split reproducible between runs.
print(f"Rows: {len(premier_results)}")
train, test = train_test_split(
    premier_results,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Pre-match information used as model inputs (known before the game starts)
feature_cols = [
    'HomeElo', 'AwayElo',
    'OddHome', 'OddAway',
    'Form3Home', 'Form5Home',
    'Form3Away', 'Form5Away',
]
# Post-match outcome used as the label
target_col = 'FTResult'

X_train, y_train = train[feature_cols], train[target_col]
X_test, y_test = test[feature_cols], test[target_col]

In [None]:
# Standardise the features. The scaler learns its mean/std from the
# TRAINING data only; the test set is then transformed with those same
# statistics. Re-fitting on the test set would scale train and test
# differently and leak test-set information into the evaluation.
data_scaler = StandardScaler()

X_train_scaled = data_scaler.fit_transform(X_train)
X_test_scaled = data_scaler.transform(X_test)

## Create the (non-linear) SVC Model!

In [None]:
# Baseline: RBF-kernel (non-linear) SVC trained on the UNSCALED features,
# kept for comparison with the scaled runs further down.
svc_model = SVC(C=1.0, kernel='rbf')
svc_model.fit(X_train, y_train)  # supervised fit: the labels drive the margin

predicted = svc_model.predict(X_test)

# Fraction of test rows predicted correctly (shown as the cell output)
metrics.accuracy_score(y_test, predicted)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Hyper-parameter candidates for the RBF SVC.
#   C     - penalty on margin violations (low = softer margin, high = harder)
#   gamma - reach of a single training example's influence
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}

# 16 parameter combinations x 5 CV folds = 80 fits, followed by one refit of
# the best combination on the whole training set (refit=True).
# verbose=2 prints progress for each fit.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    refit=True,
    verbose=2,
)
grid.fit(X_train_scaled, y_train)

# Report the winning combination and its mean cross-validated accuracy
print(f"Best Parameters: {grid.best_params_}")
print(f"Best Accuracy: {grid.best_score_:.4f}")

# Predict the held-out set with the refitted best estimator
grid_predictions = grid.predict(X_test_scaled)

In [None]:
# RBF-kernel (non-linear) SVC on the SCALED features with hand-picked
# hyper-parameters. NOTE(review): C=50.0 is not one of the values in the
# grid searched above - presumably tuned manually; confirm.
svc_model = SVC(C=50.0, gamma=0.1, kernel='rbf')
svc_model.fit(X_train_scaled, y_train)  # supervised fit on the training labels

predicted = svc_model.predict(X_test_scaled)

# Fraction of test rows predicted correctly (shown as the cell output)
metrics.accuracy_score(y_test, predicted)

In [None]:
# DecisionBoundaryDisplay can only draw estimators trained on exactly two
# features, while svc_model was fit on eight. For the picture, fit a second
# SVC with the same hyper-parameters on just the first two scaled columns
# (HomeElo and AwayElo, per the feature order used to build X_train).
X_vis = X_train_scaled[:, :2]
vis_model = SVC(**svc_model.get_params())
vis_model.fit(X_vis, y_train)

boundary = DecisionBoundaryDisplay.from_estimator(
    vis_model,
    X_vis,
    response_method="predict",
    alpha=0.8,
    cmap="Pastel1",
    xlabel='HomeElo (scaled)',
    ylabel='AwayElo (scaled)',
)

# Overlay the training points, coloured by their true label, on the same axes
boundary.ax_.scatter(
    X_vis[:, 0], X_vis[:, 1],
    c=y_train,
    s=20, edgecolors="k",
)
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# 1. Feature engineering: home-vs-away difference/ratio features.
# The draw filter is defensive (draws are already removed earlier). The
# .copy() makes df_pca an independent frame so adding columns below does not
# raise pandas' SettingWithCopyWarning.
df_pca = premier_results[premier_results['FTResult'] != 'D'].copy()

df_pca = df_pca.assign(
    Elo_Diff=df_pca['HomeElo'] - df_pca['AwayElo'],
    Form_Diff=df_pca['Form5Home'] - df_pca['Form5Away'],
    Odds_Ratio=df_pca['OddAway'] / df_pca['OddHome'],
)

# Keep only rows where every selected feature is present
features = ['Elo_Diff', 'Form_Diff', 'Odds_Ratio', 'OddHome', 'OddAway']
df_pca = df_pca.dropna(subset=features)

X_pca_data = df_pca[features]
y_pca_data = df_pca['FTResult']

# 2. Standardise, then project onto the first two principal components.
# PCA is variance-based, so the features are put on a common scale first.
scaler = StandardScaler()
X_scaled_pca = scaler.fit_transform(X_pca_data)

pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled_pca)

# 3. Build the plotting frame.
# pca_df gets a fresh 0..n-1 index, so the labels are attached as a plain
# array (by position) rather than as a Series (which would align on the old
# row index and produce NaNs).
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
raw_results = y_pca_data.values
pca_df['Result'] = raw_results

# Normalise the label to 'H'/'A'. Going through str lets one mapping cover
# integer (1/0), float (1.0/0.0) and already-string ('H'/'A') encodings.
result_labels = {
    '1': 'H', '1.0': 'H', 'H': 'H',
    '0': 'A', '0.0': 'A', 'A': 'A'
}
pca_df['Result'] = pca_df['Result'].astype(str).map(result_labels)

# Discard rows whose label could not be mapped
pca_df = pca_df.dropna(subset=['Result'])

# 4. Scatter the two classes in PC space to show how much they overlap
plt.figure(figsize=(10, 8))
sns.scatterplot(
    data=pca_df,
    x='PC1',
    y='PC2',
    hue='Result',
    palette={'H': 'red', 'A': 'blue'},
    alpha=0.5
)

# Mark each class centroid; a class with no rows is simply skipped
centroids = pca_df.groupby('Result')[['PC1', 'PC2']].mean()
centroid_styles = (
    ('H', 'darkred', 'Home Centroid'),
    ('A', 'darkblue', 'Away Centroid'),
)
for result_key, colour, legend_text in centroid_styles:
    if result_key in centroids.index:
        plt.scatter(centroids.loc[result_key, 'PC1'], centroids.loc[result_key, 'PC2'],
                    c=colour, s=200, marker='X', edgecolors='black', label=legend_text)

plt.title('PCA Analysis: Visualizing Class Overlap', fontsize=14)
plt.legend()
plt.grid(True, linestyle='--', alpha=0.3)
plt.show()