In [9]:
# -------------------- Part 1: Data Engineering --------------------

import pandas as pd

# Load data
fighters_df = pd.read_csv('Fighters_info_updated2.csv')
fights_df = pd.read_csv('df_all_fights (1).csv')

# Merge fight results with fighter stats
winner_merged = fights_df.merge(
    fighters_df, left_on="winner_id", right_on="Id", suffixes=('', '_fighter1')
)
full_merged = winner_merged.merge(
    fighters_df, left_on="loser_id", right_on="Id", suffixes=('_fighter1', '_fighter2')
)

# Select numeric features
winner_features = [
    col for col in full_merged.columns
    if col.endswith('_fighter1') and full_merged[col].dtype in ['float64', 'int64']
]
loser_features = [
    col for col in full_merged.columns
    if col.endswith('_fighter2') and full_merged[col].dtype in ['float64', 'int64']
]

# Create balanced dataset (both perspectives)
X_winner = full_merged[winner_features + loser_features].fillna(0)
y_winner = [1] * len(X_winner)

X_loser = full_merged[loser_features + winner_features].fillna(0)
y_loser = [0] * len(X_loser)

# Combine both into full dataset
X = pd.concat([X_winner, X_loser], axis=0)
y = y_winner + y_loser


In [10]:
# Save processed features and labels
X.to_csv('X_processed.csv', index=False)
pd.Series(y).to_csv('y_labels.csv', index=False)

In [13]:

# -------------------- Part 2: Modeling + Saving + Visuals (Regression) --------------------

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
Expand
message.txt
3 KB

# -------------------- Part 2: Modeling + Saving + Visuals (Regression) --------------------

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# Save scaler for Streamlit app
joblib.dump(scaler, 'scaler.pkl')

# Define regression models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
    'KNN Regressor': KNeighborsRegressor(n_neighbors=5)
}

# Train, evaluate, save models
for name, model in models.items():
    print(f"\n{name}")
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    
    # Regression metrics
    mse = mean_squared_error(y_test, preds)
    mae = mean_absolute_error(y_test, preds)
    r2  = r2_score(y_test, preds)
    print(f"  MSE:  {mse:.4f}")
    print(f"  MAE:  {mae:.4f}")
    print(f"  R2:   {r2:.4f}")
    
    # Save model
    fname = name.lower().replace(' ', '_').replace('-', '') + '_model.pkl'
    joblib.dump(model, fname)
    
    # Scatter plot: Actual vs Predicted
    plt.figure(figsize=(5, 4))
    plt.scatter(y_test, preds, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()],
             [y_test.min(), y_test.max()],
             'r--', lw=2)
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.title(f'Actual vs Predicted ({name})')
    plt.tight_layout()
    plt.show()
    
    # Residual plot
    residuals = y_test - preds
    plt.figure(figsize=(5, 4))
    sns.histplot(residuals, kde=True)
    plt.xlabel('Residual')
    plt.title(f'Residual Distribution ({name})')
    plt.tight_layout()
    plt.show()

    # Feature Importance for Random Forest
    if name == 'Random Forest Regressor':
        importances = model.feature_importances_
        feature_names = X.columns
        sorted_idx = importances.argsort()[-10:][::-1]  # top 10
        plt.figure(figsize=(8, 4))
        plt.barh(range(10), importances[sorted_idx], align='center')
        plt.yticks(range(10), feature_names[sorted_idx])
        plt.xlabel("Feature Importance")
        plt.title("Top 10 Important Features (RF Regressor)")
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show() 

SyntaxError: invalid syntax (3769314279.py, line 9)