In [2]:
# Import necessary libraries
from pathlib import Path

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

In [5]:
# Load the "toughest sport" CSV (dataset obtained through the Kaggle API).
# NOTE(review): the path is anchored at the user's home directory; adjust it if
# the project is checked out somewhere else.
train_data_csv = pd.read_csv('~/projects/pytorch/examples/Sport_Difficulty_Lin_Reg/data/toughestsport.csv')
#print(train_data_csv.head())

# Features: every column from 'Endurance' through 'Analytical Aptitude'
# (label-based slice, both ends inclusive).
inputs = torch.tensor(train_data_csv.loc[:, 'Endurance':'Analytical Aptitude'].values, dtype=torch.float32)
# Target: the 'Total' column, selected with a list so it stays 2-D (n_rows, 1)
# and lines up with the model's (n_rows, 1) output.
targets = torch.tensor(train_data_csv[['Total']].values, dtype=torch.float32)

# Keep the Rank and SPORT columns for later reference
sports_rank = np.empty((len(train_data_csv), 2), dtype=object)
sports_rank[:, 0] = train_data_csv['Rank'].values   # first column = Rank
sports_rank[:, 1] = train_data_csv['SPORT'].values  # second column = SPORT

# Standardize the targets (zero mean, unit std) to improve training.
# The mean/std are kept so predictions can be mapped back to the 'Total' scale.
targets_mean = targets.mean()
targets_std = targets.std()
targets_normalized = (targets - targets_mean) / targets_std

# Define Tensor dataset (features paired with the *normalized* targets)
train_ds = TensorDataset(inputs, targets_normalized)

# Hyperparameters
SEED = 42
lr = 1e-3
num_epochs = 10000
batch_size = 8

# Seed torch's global RNG so that weight initialisation and the DataLoader's
# shuffling order are reproducible across "Restart Kernel -> Run All".
torch.manual_seed(SEED)

# Define DataLoader
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Define the model
class LinearRegressionModel(nn.Module):
    """Linear regression expressed as a single ``nn.Linear`` layer.

    Args:
        input_dim: number of input features per sample.
        output_dim: number of values predicted per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Stored as `self.linear` so the learned weight and bias can be read
        # back from the layer after training.
        self.linear = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Return the layer's affine transform of ``x``."""
        prediction = self.linear(x)
        return prediction
    
# Model, loss, optimizer
model = LinearRegressionModel(input_dim=inputs.shape[1], output_dim=1)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)

# Initial predictions and loss of the untrained model (baseline).
# The model is trained against `targets_normalized`, so the baseline loss is
# measured against those same normalized targets; comparing against the raw
# `targets` would put it on a different scale from every training loss.
with torch.no_grad():
    preds_init = model(inputs)
    loss_init = criterion(preds_init, targets_normalized)

In [6]:
# Training loop: mini-batch SGD against the normalized targets served by train_dl
for epoch in range(num_epochs):
    epoch_loss = 0.0  # running sum of per-sample losses for this epoch

    for x_batch, y_batch in train_dl:
        # Forward pass
        preds = model(x_batch)

        # Compute loss (criterion returns the mean over the batch)
        loss = criterion(preds, y_batch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight the batch mean by the batch size so that a smaller final
        # batch does not count as much as a full one in the epoch average.
        epoch_loss += loss.item() * x_batch.size(0)

    # Average per-sample loss for the epoch
    avg_loss = epoch_loss / len(train_dl.dataset)

    #if (epoch + 1) % 1000 == 0 or epoch == 0:
        #print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')

# After training: print final linear regression formula.
# NOTE: the model was fit on standardized targets, so `y` below is the
# *normalized* Total; multiply by targets_std and add targets_mean to get Total.
weights = model.linear.weight.detach().reshape(-1)  # shape: (num_features,), also for a single feature
bias = model.linear.bias.item()                     # scalar

# Build formula string
input_features = train_data_csv.loc[:, 'Endurance':'Analytical Aptitude'].columns.tolist()
coefficients = weights.tolist()

# Each entry is (signed value, unsigned text); the sign is rendered as the
# separator so negative values read "- 3.9636" instead of "+ -3.9636".
signed_terms = [(w, f"{abs(w):.4f}*{name}") for w, name in zip(coefficients, input_features)]
signed_terms.append((bias, f"{abs(bias):.4f}"))

formula = ""
for position, (value, text) in enumerate(signed_terms):
    if position == 0:
        formula += f"-{text}" if value < 0 else text
    else:
        formula += f" - {text}" if value < 0 else f" + {text}"

print("\nFinal Linear Regression Model:")
print(f"y = {formula}")


Final Linear Regression Model:
y = 0.0811*Endurance + 0.0730*Strength + 0.0864*Power + 0.0800*Speed + 0.0860*Agility + 0.0776*Flexibility + 0.0807*Nerve + 0.0872*Durability + 0.0785*Hand-eye coordination + 0.0788*Analytical Aptitude + -3.9636


In [7]:
# Score every sport with the trained model, without tracking gradients.
with torch.no_grad():
    predicted_totals = model(inputs) * targets_std + targets_mean  # back to the original 'Total' scale

# Row indices ordered from the highest predicted total to the lowest.
sorted_indices = predicted_totals.squeeze().argsort(descending=True)

# Turn that ordering into per-row ranks: the row at position k of
# sorted_indices receives rank k + 1 (1 = highest predicted total).
rank_values = torch.arange(1, len(sorted_indices) + 1)
predicted_ranks = torch.zeros_like(sorted_indices).scatter_(0, sorted_indices, rank_values)

# Predictions
#print("Predicted Totals:", predicted_totals[:10])
#print("\nPredicted Ranks:", predicted_ranks[:10])
#print("\nPredicted Sports:", sports_rank[sorted_indices[:10], 1])

# Answers
#print("\nActual targets:", targets[:10])
#print("\nActual Ranks:", sports_rank[:10, 0])
#print("Actual Sports:", sports_rank[:10, 1])

In [8]:
# Make sure the export directory exists; DataFrame.to_csv does not create
# missing parent directories and would fail on a fresh checkout.
output_dir = Path('outputs')
output_dir.mkdir(parents=True, exist_ok=True)

# TABLE 1: All sports with actual and predicted rankings
results_df = pd.DataFrame({
    'Sport': sports_rank[:, 1],
    'Actual Rank': sports_rank[:, 0].astype(int),
    'Actual Total': targets.squeeze().numpy(),
    'Predicted Rank': predicted_ranks.numpy().astype(int),
    'Predicted Total': predicted_totals.squeeze().numpy(),
    # Absolute error between predicted and actual totals
    'Difference in Total Score': (predicted_totals.squeeze() - targets.squeeze()).abs().numpy()
})

# Sort by actual rank (lowest rank number first)
results_df_sorted = results_df.sort_values('Actual Rank')
# Export Table 1: All sports results
results_df_sorted.to_csv(output_dir / 'results_table.csv', index=False)

# TABLE 2: Feature Importance (Weights)
weights = model.linear.weight.detach().squeeze().numpy()
input_features = train_data_csv.loc[:, 'Endurance':'Analytical Aptitude'].columns.tolist()

# Sort by absolute weight (descending)
sorted_weight_indices = np.argsort(np.abs(weights))[::-1]
ranked_weights = weights[sorted_weight_indices]

# NOTE(review): 'Percentage %' is simply weight * 100; the weights are in
# normalized-target units and are not rescaled to sum to 100 -- confirm this is
# the intended meaning of the column.
feature_importance_df = pd.DataFrame({
    'Characteristic': [input_features[i] for i in sorted_weight_indices],
    'Weight': ranked_weights,
    'Percentage %': np.round(ranked_weights * 100, 2)
})

# Export Table 2: Feature importance
feature_importance_df.to_csv(output_dir / 'feature_importance.csv', index=False)

In [10]:
# Quick console preview of both tables (optional convenience)

# Table 1: the 15 best-ranked sports by actual rank, framed by a 100-char rule
print("=" * 100, "TABLE 1: Top 15 Sports - Actual vs Predicted", "=" * 100, sep="\n")
top_fifteen = results_df_sorted.head(15)
print(top_fifteen.to_string(index=False))
print("\n")

# Table 2: the feature weights, framed by an 80-char rule
print("=" * 80, "TABLE 2: Feature Importance (Sorted by Absolute Weight)", "=" * 80, sep="\n")
print(feature_importance_df.to_string(index=False))

TABLE 1: Top 15 Sports - Actual vs Predicted
                 Sport  Actual Rank  Actual Total  Predicted Rank  Predicted Total  Difference in Total Score
                Boxing            1        72.375               1        72.271805                   0.103195
            Ice Hockey            2        71.750               2        71.660225                   0.089775
              Football            3        68.375               3        68.254364                   0.120636
            Basketball            4        67.875               4        67.817802                   0.057198
             Wrestling            5        63.500               5        63.232037                   0.267963
          Martial Arts            6        63.375               6        63.231930                   0.143070
                Tennis            7        62.750               7        62.681068                   0.068932
            Gymnastics            8        62.500               8        62