Import the necessary packages 

In [1]:
import pandas as pd 
import numpy as np 
import os
import plotly.express as px
import plotly.graph_objects as go

Import csv containing boxing data that was cleaned on python and excel

In [2]:
box_d4 = pd.read_csv('box_d4.csv')
box_d4.dropna()


Unnamed: 0,Rank,Name,Division,Beginning Year,Longevity,Origin,Wins,Losses,Draws,Birth Year,Starting Age,Total Bouts,Win %
0,1,Ray Robinson,middle,1940,25,"Harlem, New York, USA",174,19,6,1921,19,199,0.874372
1,2,Jimmy McLarnin,welter,1923,13,"Glendale, California, USA",55,11,3,1907,16,69,0.797101
2,3,Floyd Mayweather Jr,welter,1996,21,"Las Vegas, Nevada, USA",50,0,0,1977,19,50,1.000000
3,4,Harry Greb,middle,1913,13,"Pittsburgh, Pennsylvania, USA",265,19,17,1894,19,301,0.880399
4,5,Carlos Monzon,middle,1963,14,"Santa Fe, Santa Fe, Argentina",87,3,9,1942,21,99,0.878788
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,221,Roman Gonzalez,super fly,2005,19,"Managua, Nicaragua",52,4,0,1987,18,56,0.928571
221,222,Sergio Martinez,middle,1997,26,Spain,57,3,2,1975,22,62,0.919355
222,223,Victor Galindez,light heavy,1969,11,"Moron, Buenos Aires, Argentina",55,9,4,1948,21,68,0.808824
223,224,California Jackie,welter,1936,13,"Los Angeles, California, USA",63,19,6,1918,18,88,0.715909


In [3]:
box_d4['Starting Age'].min(), box_d4['Starting Age'].max()

(10, 31)

Initial Data Visualization using Plotly

In [4]:
fig = px.bar(
    box_d4,
    x="Beginning Year",
    y="Longevity",
    color="Division",
    hover_name="Name",
    title="Boxing Career Longevity by Division",
    labels={"Longevity": "Longevity (Years)", "Beginning Year": "Career Start Year"},
    color_discrete_sequence=px.colors.qualitative.Alphabet
)

fig.update_layout(
    plot_bgcolor='whitesmoke',
    paper_bgcolor='whitesmoke',
    font=dict(color='black'),
    bargap=0.3
)

fig.update_layout(bargap=0.3)
fig.show()

Using Wikipedia to find the birthdates of the fighters.

In [None]:
import wikipedia
import re

#Function for extracting birth year from Wikipedia summary
def get_birth_year(name):
    try:
        search_results = wikipedia.search(name)
        if not search_results:
            return None

        page_title = search_results[0]
        summary = wikipedia.summary(page_title, sentences=3, auto_suggest=False)

        # Comprehensive set of patterns to extract 4-digit birth years
        patterns = [
            r"\(born\s+[A-Za-z]+\s+\d{1,2},\s+(\d{4})",           # (born January 1, 1940)
            r"\(born\s+(\d{4})",                                  # (born 1940)
            r"\bborn\s+[A-Za-z]+\s+\d{1,2},\s+(\d{4})",           # born January 1, 1940
            r"\bborn\s+in\s+(\d{4})",                             # born in 1940
            r"\bborn\s+on\s+[A-Za-z]+\s+\d{1,2},\s+(\d{4})",      # born on January 1, 1940
            r"\(b\.\s+(\d{4})",                                   # (b. 1940)
            r"\(b\.\s+[A-Za-z]+\s+\d{1,2},\s+(\d{4})",            # (b. January 1, 1940)
            r"\bb\.\s+(\d{4})",                                   # b. 1940
            r"\bb\.\s+[A-Za-z]+\s+\d{1,2},\s+(\d{4})",            # b. January 1, 1940
            r"\*\s*(\d{4})",                                      # * 1940 (some lists use this for birth)
            r"\b[Dd]ate\s+of\s+[Bb]irth.*?(\d{4})",               # Date of Birth ... 1940
            r"\(.*?(\d{4})\s*–",                                  # (1940 – 2021)
            r"\(.*?(\d{4})\)",                                    # (1940)
            r"\b(\d{4})\s*–\s*\d{4}",                             # 1940–2021
            r"\bborn.*?(\d{4})",                                  # loose fallback: born ... 1940
            r"\b(\d{4})\s*–",                                     # 1940– (common in intro)
        ]

        for pattern in patterns:
            match = re.search(pattern, summary)
            if match:
                year = match.group(1)
                if year.isdigit() and 1800 < int(year) < 2025:
                    return int(year)

        return None

    except Exception as e:
        print(f"{name}: {e}")
        return None

In [6]:
print(box_d4)



     Rank                 Name     Division  Beginning Year  Longevity  \
0       1         Ray Robinson       middle            1940         25   
1       2       Jimmy McLarnin       welter            1923         13   
2       3  Floyd Mayweather Jr       welter            1996         21   
3       4           Harry Greb       middle            1913         13   
4       5        Carlos Monzon       middle            1963         14   
..    ...                  ...          ...             ...        ...   
220   221       Roman Gonzalez    super fly            2005         19   
221   222      Sergio Martinez       middle            1997         26   
222   223      Victor Galindez  light heavy            1969         11   
223   224    California Jackie       welter            1936         13   
224   225         Jimmy Carter        light            1946         14   

                             Origin  Wins  Losses  Draws  Birth Year  \
0             Harlem, New York, USA   1

Encoding Features

In [None]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Create a list of divisions
divisions = ['minimum', 'super fly','bantam','super bantam', 'feather', 'super feather',
             'light', 'super light', 'welter','super welter', 'middle', 'super middle', 'light heavy', 'heavy']
ord_features = ['Division']
ordEnc = OrdinalEncoder(categories = [divisions])


# Create a ColumnTransformer to apply the OrdinalEncoder to the specified columns
coltrans = ColumnTransformer(
    transformers=[
        ("ord", ordEnc, ord_features),
        ],
    remainder = 'passthrough',
    verbose_feature_names_out=False)


X_trans = coltrans.fit_transform(box_d4)
X_trans

array([[10.0, 1, 'Ray Robinson', ..., 19, 199, 0.874371859],
       [8.0, 2, 'Jimmy McLarnin', ..., 16, 69, 0.797101449],
       [8.0, 3, 'Floyd Mayweather Jr', ..., 19, 50, 1.0],
       ...,
       [12.0, 223, 'Victor Galindez', ..., 21, 68, 0.808823529],
       [8.0, 224, 'California Jackie', ..., 18, 88, 0.715909091],
       [6.0, 225, 'Jimmy Carter', ..., 22, 125, 0.68]], dtype=object)

In [8]:
new_feature_names = coltrans.get_feature_names_out()
new_feature_names

array(['Division', 'Rank', 'Name', 'Beginning Year', 'Longevity',
       'Origin', 'Wins', 'Losses', 'Draws', 'Birth Year', 'Starting Age',
       'Total Bouts', 'Win %'], dtype=object)

Create a New Dataframe with the encoded features

In [9]:
boxing_d2 = pd.DataFrame(X_trans, columns = new_feature_names)
boxing_d2

Unnamed: 0,Division,Rank,Name,Beginning Year,Longevity,Origin,Wins,Losses,Draws,Birth Year,Starting Age,Total Bouts,Win %
0,10.0,1,Ray Robinson,1940,25,"Harlem, New York, USA",174,19,6,1921,19,199,0.874372
1,8.0,2,Jimmy McLarnin,1923,13,"Glendale, California, USA",55,11,3,1907,16,69,0.797101
2,8.0,3,Floyd Mayweather Jr,1996,21,"Las Vegas, Nevada, USA",50,0,0,1977,19,50,1.0
3,10.0,4,Harry Greb,1913,13,"Pittsburgh, Pennsylvania, USA",265,19,17,1894,19,301,0.880399
4,10.0,5,Carlos Monzon,1963,14,"Santa Fe, Santa Fe, Argentina",87,3,9,1942,21,99,0.878788
...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,1.0,221,Roman Gonzalez,2005,19,"Managua, Nicaragua",52,4,0,1987,18,56,0.928571
221,10.0,222,Sergio Martinez,1997,26,Spain,57,3,2,1975,22,62,0.919355
222,12.0,223,Victor Galindez,1969,11,"Moron, Buenos Aires, Argentina",55,9,4,1948,21,68,0.808824
223,8.0,224,California Jackie,1936,13,"Los Angeles, California, USA",63,19,6,1918,18,88,0.715909


In [10]:
print(boxing_d2)

    Division Rank                 Name Beginning Year Longevity  \
0       10.0    1         Ray Robinson           1940        25   
1        8.0    2       Jimmy McLarnin           1923        13   
2        8.0    3  Floyd Mayweather Jr           1996        21   
3       10.0    4           Harry Greb           1913        13   
4       10.0    5        Carlos Monzon           1963        14   
..       ...  ...                  ...            ...       ...   
220      1.0  221       Roman Gonzalez           2005        19   
221     10.0  222      Sergio Martinez           1997        26   
222     12.0  223      Victor Galindez           1969        11   
223      8.0  224    California Jackie           1936        13   
224      6.0  225         Jimmy Carter           1946        14   

                             Origin Wins Losses Draws Birth Year Starting Age  \
0             Harlem, New York, USA  174     19     6       1921           19   
1         Glendale, California, U

Check the minimum and maximum values for outliers 

In [11]:
boxing_d2['Starting Age'].min(), boxing_d2['Starting Age'].max()

(10, 31)

Model validation/ Selection

Standardization and Scaling 

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = boxing_d2.drop(columns=["Rank","Win %", "Name", "Origin","Wins","Losses","Draws","Total Bouts"])  # replace with your actual target
y = boxing_d2["Win %"]

feature_names = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X

Unnamed: 0,Division,Beginning Year,Longevity,Birth Year,Starting Age
0,10.0,1940,25,1921,19
1,8.0,1923,13,1907,16
2,8.0,1996,21,1977,19
3,10.0,1913,13,1894,19
4,10.0,1963,14,1942,21
...,...,...,...,...,...
220,1.0,2005,19,1987,18
221,10.0,1997,26,1975,22
222,12.0,1969,11,1948,21
223,8.0,1936,13,1918,18


ElasticNet

In [39]:
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score

enet = ElasticNetCV(cv=5, random_state=42)
enet.fit(X_train_scaled, y_train)

y_pred_enet = enet.predict(X_test_scaled)

print("ElasticNet R²:", r2_score(y_test, y_pred_enet))
print("ElasticNet RMSE:", mean_squared_error(y_test, y_pred_enet, squared=False))

for feature, coef in zip(feature_names, enet.coef_):
    print(f"Feature: {feature}, Coefficient: {coef}")

ElasticNet R²: 0.3913066558716891
ElasticNet RMSE: 0.07893752155448257
Feature: Division, Coefficient: 0.0
Feature: Beginning Year, Coefficient: 0.05227019374638879
Feature: Longevity, Coefficient: -0.015237887588237516
Feature: Birth Year, Coefficient: 0.0
Feature: Starting Age, Coefficient: 0.00264571108050764



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



ElasticNet Feature Importance

In [45]:
import plotly.express as px
import pandas as pd

# Create a DataFrame of coefficients
coef_df = pd.Series(enet.coef_, index=X.columns).sort_values(ascending=False).reset_index()
coef_df.columns = ['Feature', 'Coefficient']

# Plot with Plotly
fig = px.bar(
    coef_df,
    x='Coefficient',
    y='Feature',
    orientation='h',
    title='ElasticNet Feature Importance',
    labels={'Coefficient': 'Model Coefficient', 'Feature': 'Feature'},
    text='Coefficient'
)
fig.update_layout(yaxis=dict(autorange="reversed"))

fig.show()

GridSearch to find best parameters

In [41]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

# Define the grid
param_grid = {
    'alpha': [0.01, 0.1, 1.0, 10.0],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9, 1.0]
}

# Create model
elastic_net = ElasticNet(max_iter=10000, random_state=42)

# Grid Search
grid_search_enet = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    verbose=1
)

# Fit model
grid_search_enet.fit(X_train_scaled, y_train)

# Best model
best_enet = grid_search_enet.best_estimator_

print("Best Params:", grid_search_enet.best_params_)
print("Train R²:", grid_search_enet.best_score_)

# Evaluate on test set
from sklearn.metrics import r2_score, mean_squared_error

y_pred_enet = best_enet.predict(X_test_scaled)
print("Test R²:", r2_score(y_test, y_pred_enet))
print("Test RMSE:", mean_squared_error(y_test, y_pred_enet, squared=False))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Params: {'alpha': 0.01, 'l1_ratio': 0.5}
Train R²: 0.34845892872189344
Test R²: 0.38534811262898183
Test RMSE: 0.07932294318011471



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Model Summary 

In [30]:
# Print a summary of the best ElasticNet model
print("ElasticNet Model Summary:")
print(f"Alpha: {best_enet.alpha}")
print(f"L1 Ratio: {best_enet.l1_ratio}")
print(f"Coefficients: {best_enet.coef_}")
print(f"Intercept: {best_enet.intercept_}")

ElasticNet Model Summary:
Alpha: 0.01
L1 Ratio: 0.5
Coefficients: [ 0.          0.04994623 -0.01266566  0.          0.00143711]
Intercept: 0.8496191143277777


Model Summary with Features 

In [None]:
for feature, coef in zip(feature_names, best_enet.coef_):
    print(f"Feature: {feature}, Coefficient: {coef}")

Feature: Division, Coefficient: 0.0
Feature: Beginning Year, Coefficient: 0.05227019374638879
Feature: Longevity, Coefficient: -0.015237887588237516
Feature: Birth Year, Coefficient: 0.0
Feature: Starting Age, Coefficient: 0.00264571108050764


In [16]:
X = X.apply(pd.to_numeric, errors='ignore')


errors='ignore' is deprecated and will raise in a future version. Use to_numeric without passing `errors` and catch exceptions explicitly instead



In [17]:
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = X[col].astype(float)
        except:
            pass

XGBoost Model

In [18]:
from xgboost import XGBRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb = XGBRegressor(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=42)
xgb.fit(X_train, y_train)

y_pred_xgb = xgb.predict(X_test)

print("XGBoost R²:", r2_score(y_test, y_pred_xgb))
print("XGBoost RMSE:", mean_squared_error(y_test, y_pred_xgb, squared=False))

XGBoost R²: 0.2942644618827319
XGBoost RMSE: 0.08499731437810541



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



XGBoost Feature Importance

In [19]:
importance_df = pd.Series(xgb.feature_importances_, index=X.columns).sort_values(ascending=False).reset_index()
importance_df.columns = ['Feature', 'Importance']
# Plot feature importances
fig_importance = px.bar(
    importance_df,
    x='Importance',
    y='Feature',
    orientation='h',
    title="XGBoost Feature Importance",
    text='Importance'
)
fig_importance.update_layout(yaxis=dict(autorange="reversed"))
fig_importance.show()


XGBoost GridSearch

In [20]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# Define parameter grid
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Initialize base model
xgb = XGBRegressor(random_state=42)

# Grid search with 5-fold CV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='r2',  # You can change this to 'neg_mean_squared_error', etc.
    verbose=1,
    n_jobs=-1
)

# Fit search
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_

print("Best Parameters:", grid_search.best_params_)
print("Best R² Score on Training:", grid_search.best_score_)

# Evaluate on test set
from sklearn.metrics import mean_squared_error, r2_score

y_pred = best_model.predict(X_test)
print("Test R²:", r2_score(y_test, y_pred))
print("Test RMSE:", mean_squared_error(y_test, y_pred, squared=False))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best R² Score on Training: 0.2524061913335897
Test R²: 0.2871765449314059
Test RMSE: 0.0854230749956562



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Assessment 

In [42]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(name, y_true, y_pred):
    print(f"🔍 {name} Evaluation")
    print(f"R² Score: {r2_score(y_true, y_pred):.3f}")
    print(f"RMSE: {mean_squared_error(y_true, y_pred, squared=False):.3f}")
    print(f"MAE: {mean_absolute_error(y_true, y_pred):.3f}")
    print('-' * 40)

print(evaluate_model("ElasticNet", y_test, y_pred_enet))
print(evaluate_model("XGBoost", y_test, y_pred_xgb))

🔍 ElasticNet Evaluation
R² Score: 0.385
RMSE: 0.079
MAE: 0.061
----------------------------------------
None
🔍 XGBoost Evaluation
R² Score: 0.294
RMSE: 0.085
MAE: 0.068
----------------------------------------
None



'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.


'squared' is deprecated in version 1.4 and will be removed in 1.6. To calculate the root mean squared error, use the function'root_mean_squared_error'.



Plotting Predictions vs actual for both models

In [23]:
import plotly.graph_objects as go
import numpy as np

def plot_predictions_plotly(y_true, y_pred, model_name):
    fig = go.Figure()

    # Scatter plot: predicted vs actual
    fig.add_trace(go.Scatter(
        x=y_true,
        y=y_pred,
        mode='markers',
        name='Predictions',
        marker=dict(size=8, opacity=0.7)
    ))

    # Diagonal reference line (perfect prediction)
    min_val = np.min([y_true.min(), y_pred.min()])
    max_val = np.max([y_true.max(), y_pred.max()])
    fig.add_trace(go.Scatter(
        x=[min_val, max_val],
        y=[min_val, max_val],
        mode='lines',
        line=dict(dash='dash', color='red'),
        name='Ideal Fit'
    ))

    fig.update_layout(
        title=f"{model_name} Predictions vs Actual",
        xaxis_title="Actual",
        yaxis_title="Predicted",
        width=600,
        height=600,
        template="plotly_white"
    )

    fig.show()

# Usage
plot_predictions_plotly(y_test, y_pred_enet, "ElasticNet")
plot_predictions_plotly(y_test, y_pred_xgb, "XGBoost")

Scatter of Longevity vs Starting Age 

In [46]:
fig_tradeoff = px.scatter(
    boxing_d2,
    x='Starting Age',
    y='Longevity',
    trendline='ols',
    title='Longevity vs. Starting Age',
    labels={'Starting Age': 'Starting Age', 'Win %': 'Win Percentage'}
)
fig_tradeoff.show()