<a href="https://colab.research.google.com/github/Krishnak57/Climasub-gpt-proto/blob/main/notebooks/prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.exceptions import ConvergenceWarning
import warnings

# For the simple Neural Networks (MLP and Stamina)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# --- Load Data ---
df = pd.read_csv('data/sim_match.csv')
print("Successfully loaded data/sim_match.csv")

# --- 1. Model 1: MLP for EVPM (Feedforward Network) ---
print("\n--- Training Model 1: EVPM (MLP) ---")

# Define features (X) and target (y)
evpm_features = ['minute', 'sprints', 'pressures', 'xG_flow', 'opponent_strength', 'rest_days', 'WBGT', 'altitude']
evpm_target = 'EVPM_ground_truth'

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df[evpm_features], df[evpm_target], test_size=0.2, random_state=42
)

# Scale features
scaler_evpm = StandardScaler()
X_train_scaled = scaler_evpm.fit_transform(X_train)
X_test_scaled = scaler_evpm.transform(X_test)

# Define a simple MLP using PyTorch
class EVPM_MLP(nn.Module):
    def __init__(self, input_size):
        super(EVPM_MLP, self).__init__()
        # FIXED: Use capital nn.Linear
        self.layer_1 = nn.Linear(input_size, 32)
        self.layer_2 = nn.Linear(32, 16)
        self.output_layer = nn.Linear(16, 1) # Output 1 value (EVPM)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.layer_1(x))
        x = self.relu(self.layer_2(x))
        x = self.output_layer(x)
        return x

# Convert data to PyTorch Tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Initialize model, loss, and optimizer
evpm_model = EVPM_MLP(input_size=len(evpm_features))
criterion_evpm = nn.MSELoss() # Mean Squared Error for regression
optimizer_evpm = optim.Adam(evpm_model.parameters(), lr=0.001)

# --- Quick Training Loop ---
num_epochs = 3 # Keep this VERY small for the test
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer_evpm.zero_grad()
        outputs = evpm_model(inputs)
        loss = criterion_evpm(outputs, targets)
        loss.backward()
        optimizer_evpm.step()
    print(f'EVPM Model - Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print("EVPM (MLP) Model trained.")


# --- 2. Model 2: Stamina (Simplified Model) ---
print("\n--- Training Model 2: Stamina (Simple) ---")
# For Step 4, we just test the pipe. We'll make this a real LSTM in Step 6.
stamina_features = ['minute', 'sprints', 'pressures', 'WBGT', 'altitude']
stamina_target = 'stamina_index'

X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    df[stamina_features], df[stamina_target], test_size=0.2, random_state=42
)

scaler_stamina = StandardScaler()
X_train_s_scaled = scaler_stamina.fit_transform(X_train_s)
X_test_s_scaled = scaler_stamina.transform(X_test_s)

# We re-use the MLP class for this simple test
stamina_model = EVPM_MLP(input_size=len(stamina_features)) 
criterion_stamina = nn.MSELoss()
optimizer_stamina = optim.Adam(stamina_model.parameters(), lr=0.001)

# Convert data
X_train_s_tensor = torch.tensor(X_train_s_scaled, dtype=torch.float32)
y_train_s_tensor = torch.tensor(y_train_s.values, dtype=torch.float32).view(-1, 1)
train_dataset_s = TensorDataset(X_train_s_tensor, y_train_s_tensor)
train_loader_s = DataLoader(train_dataset_s, batch_size=64, shuffle=True)

# --- Quick Training Loop ---
for epoch in range(num_epochs):
    for inputs, targets in train_loader_s:
        optimizer_stamina.zero_grad()
        outputs = stamina_model(inputs)
        loss = criterion_stamina(outputs, targets)
        loss.backward()
        optimizer_stamina.step()
    print(f'Stamina Model - Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

print("Stamina (Simple) Model trained.")


# --- 3. Model 3: Logistic Regression for Injury ---
print("\n--- Training Model 3: Injury (Logistic Regression) ---")
# This is a classifier (0 = no injury, 1 = injury)

injury_features = ['stamina_index', 'sprints', 'pressures', 'minute']
injury_target = 'injury_event'

X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    df[injury_features], df[injury_target], test_size=0.2, random_state=42, stratify=df[injury_target]
)

scaler_injury = StandardScaler()
X_train_i_scaled = scaler_injury.fit_transform(X_train_i)
X_test_i_scaled = scaler_injury.transform(X_test_i)

# Use class_weight='balanced' to handle the imbalanced data
injury_model = LogisticRegression(class_weight='balanced')

# Suppress warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    injury_model.fit(X_train_i_scaled, y_train_i)

# Test it
pred_probs = injury_model.predict_proba(X_test_i_scaled)[:, 1] # Get prob of class '1'

print(f"Injury (Logistic) Model trained.")
print(f"Test AUC: {roc_auc_score(y_test_i, pred_probs):.4f}")

Successfully loaded data/sim_match.csv

--- Training Model 1: EVPM (MLP) ---


KeyError: "['xG_flow', 'rest_days', 'WBGT'] not in index"

In [13]:
import sys
import os

# Get the path to the parent directory (your project root)
# This assumes your 'src' folder is at the root, 
# and this notebook is in the 'notebooks' folder.
project_root = os.path.abspath(os.path.join('..'))

# Add the project root to the list of paths Python searches
if project_root not in sys.path:
    sys.path.append(project_root)

print(f"Project root added to path: {project_root}")

Project root added to path: /workspaces/Climasub-gpt-proto


In [25]:
from src.data import generate_synthetic_match_data
import pandas as pd

# This runs your function and creates 'data/sim_match.csv'
data = generate_synthetic_match_data()

# Now, we load that CSV back into a DataFrame called 'df'
try:
    df = pd.read_csv('data/sim_match.csv')
    print("\nSuccessfully loaded 'data/sim_match.csv'")
    print(f"Data shape: {df.shape}")
    print(df.head())
except FileNotFoundError:
    print("\n--- ERROR ---")
    print("Could not find 'data/sim_match.csv'.")
    print("Make sure your 'generate_synthetic_match_data' function is saving the file to the 'data' folder.")

Generating a synthetic dataset for a 90-minute match for 22 players
Generated 1980 rows of data.
   minute  player_id  opponent_strength   hot_days     altitude  sprints  \
0       1          1           0.754578  31.423692  1833.536437        0   
1       1          2           0.904416  34.346159  1833.536437        0   
2       1          3           0.729592  32.571363  1833.536437        1   
3       1          4           0.557566  31.966036  1833.536437        1   
4       1          5           0.848898  28.919521  1833.536437        0   

   pressures  xg_flow        EVM  cumulative_load  stamina_index  injury_prob  \
0          0        0  -0.013466                0          0.995     0.001005   
1          0        1  50.008524                0          0.995     0.001005   
2          1        2  99.995501                2          0.995     0.001005   
3          1        1  50.012471                2          0.995     0.001005   
4          2        2  99.999901         

In [4]:
%pip install torch transformers pandas numpy scikit-learn lifelines streamlit scipy


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
