In [1]:
import pandas as pd

df = pd.read_csv("Final_Merge.csv")

print(df.columns.tolist())


['player_id', 'player_name', 'team', 'position', 'matches_played', 'minutes_played', 'passes', 'shots', 'goals', 'assists', 'tackles', 'interceptions', 'yellow_cards', 'assists_per_match', 'goals_per_match', 'minutes_per_match', 'defensive_index', 'discipline_index', 'ball_possession_index', 'performance_score', 'injury_risk_score', 'market_value_millions', 'market_value_', 'estimated_resale_value', 'market_value_last_updated', 'contract_end_year', 'years_remaining_contract', 'twitter_mentions_count', 'twitter_engagement_rate', 'social_sentiment_score', 'social_engagement_score', 'public_perception_index', 'mention_volatility_score', 'trending_mentions_7d', 'sentiment_classification', 'total_injuries_history', 'days_injured_last_season', 'current_injury_status', 'injury_risk_assessment', 'avg_recovery_time_days', 'injury_prone_body_parts', 'historical_injury_count', 'days_since_last_injury']


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load dataset
df = pd.read_csv("Final_Merge.csv")

# Target
TARGET = "market_value_millions"

# Drop non-feature columns
DROP_COLS = [
    "player_name", "team", "sentiment_classification",
    "market_value_last_updated"
]

X = df.drop(columns=[TARGET] + DROP_COLS, errors="ignore")
y = df[TARGET]

# One-hot encode categorical columns
X = pd.get_dummies(X, drop_first=True)

# Fill missing values
X = X.fillna(X.median())

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Training samples:", X_train.shape)
print("Testing samples:", X_test.shape)



Training samples: (360, 46)
Testing samples: (90, 46)


In [3]:
# ================================
# MODEL 1: RANDOM FOREST REGRESSOR
# Target: market_value_millions
# ================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ------------------------------------------------
# 1. Load Dataset
# ------------------------------------------------
df = pd.read_csv("/content/Final_Merge.csv")
print("Dataset shape:", df.shape)

# ------------------------------------------------
# 2. Define Target and Features
# ------------------------------------------------
target = "market_value_millions"

features = [
    "matches_played",
    "minutes_played",
    "passes",
    "shots",
    "goals",
    "assists",
    "tackles",
    "interceptions",
    "yellow_cards",
    "assists_per_match",
    "goals_per_match",
    "minutes_per_match",
    "defensive_index",
    "discipline_index",
    "ball_possession_index",
    "performance_score",
    "injury_risk_score",
    "total_injuries_history",
    "days_injured_last_season",
    "avg_recovery_time_days",
    "days_since_last_injury",
    "twitter_mentions_count",
    "social_sentiment_score",
    "public_perception_index"
]

X = df[features]
y = df[target]

# ------------------------------------------------
# 3. Handle Missing Values
# ------------------------------------------------
X = X.fillna(X.median())
y = y.fillna(y.median())

# ------------------------------------------------
# 4. Train-Test Split (80-20)
# ------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ------------------------------------------------
# 5. Train Random Forest Model
# ------------------------------------------------
rf_model = RandomForestRegressor(
    n_estimators=300,
    max_depth=12,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)

# ------------------------------------------------
# 6. Predictions
# ------------------------------------------------
y_pred = rf_model.predict(X_test)

# ------------------------------------------------
# 7. Evaluation Metrics
# ------------------------------------------------
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n RANDOM FOREST RESULTS")
print("MAE :", mae)
print("RMSE:", rmse)
print("R²  :", r2)

# ------------------------------------------------
# 8. Feature Importance (Top 10)
# ------------------------------------------------
feature_importance = pd.DataFrame({
    "feature": features,
    "importance": rf_model.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\n Top 10 Important Features")
print(feature_importance.head(10))


Dataset shape: (450, 43)

 RANDOM FOREST RESULTS
MAE : 1.5675487433579254
RMSE: 1.9974347052325725
R²  : 0.9677455514469709

 Top 10 Important Features
                     feature  importance
15         performance_score    0.952342
21    twitter_mentions_count    0.003387
20    days_since_last_injury    0.003311
10           goals_per_match    0.003282
19    avg_recovery_time_days    0.002766
1             minutes_played    0.002603
18  days_injured_last_season    0.002439
9          assists_per_match    0.002363
4                      goals    0.002193
11         minutes_per_match    0.002171


In [4]:
# =========================
# XGBOOST REGRESSION MODEL
# =========================

# Install xgboost if not present
!pip install xgboost

import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# -------------------------
# 1. Load Dataset
# -------------------------
df = pd.read_csv("Final_Merge.csv")

# -------------------------
# 2. Define Target & Features
# -------------------------
TARGET = "market_value_millions"

DROP_COLS = [
    "player_name",
    "team",
    "sentiment_classification",
    "market_value_last_updated"
]

X = df.drop(columns=[TARGET] + DROP_COLS, errors="ignore")
y = df[TARGET]

# -------------------------
# 3. Encoding & Cleaning
# -------------------------
X = pd.get_dummies(X, drop_first=True)
X = X.fillna(X.median())

# -------------------------
# 4. Train-Test Split
# -------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# -------------------------
# 5. Scaling (Numerical Only)
# -------------------------
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# -------------------------
# 6. Train XGBoost Model
# -------------------------
xgb_model = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

xgb_model.fit(X_train, y_train)

# -------------------------
# 7. Predictions
# -------------------------
y_pred = xgb_model.predict(X_test)

# -------------------------
# 8. Evaluation Metrics
# -------------------------
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(" XGBOOST RESULTS")
print("MAE :", mae)
print("RMSE:", rmse)
print("R²  :", r2)

# -------------------------
# 9. Feature Importance
# -------------------------
feature_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": xgb_model.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\n Top 10 Important Features (XGBoost)")
print(feature_importance.head(10))


 XGBOOST RESULTS
MAE : 1.4628336541325673
RMSE: 1.8823368827203817
R²  : 0.9713556387465332

 Top 10 Important Features (XGBoost)
                     feature  importance
16         performance_score    0.684802
18    estimated_resale_value    0.216302
37      market_value__Rising    0.013702
11           goals_per_match    0.008829
13           defensive_index    0.007724
19         contract_end_year    0.006075
20  years_remaining_contract    0.004676
21    twitter_mentions_count    0.002916
1             matches_played    0.002681
7                    tackles    0.002678


In [5]:
# =========================
# LIGHTGBM REGRESSION MODEL
# =========================

# Install LightGBM
!pip install lightgbm

import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


# 1. Load Dataset

df = pd.read_csv("Final_Merge.csv")

# 2. Define Target & Features

TARGET = "market_value_millions"

DROP_COLS = [
    "player_name",
    "team",
    "sentiment_classification",
    "market_value_last_updated"
]

X = df.drop(columns=[TARGET] + DROP_COLS, errors="ignore")
y = df[TARGET]

# 3. Encoding & Cleaning

X = pd.get_dummies(X, drop_first=True)
X = X.fillna(X.median())

# 4. Train-Test Split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# 5. Scaling (Numerical Only)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# 6. Train LightGBM Model

lgbm_model = LGBMRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

lgbm_model.fit(X_train, y_train)

# -------------------------
# 7. Predictions
# -------------------------
y_pred = lgbm_model.predict(X_test)

# -------------------------
# 8. Evaluation Metrics
# -------------------------
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# MAE (Mean Absolute Error) - Average absolute prediction error
# RMSE (Root Mean Squared Error) - Penalizes large errors
# R² Score -  Measures variance explained by model

print("LIGHTGBM RESULTS")
print("MAE :", mae)
print("RMSE:", rmse)
print("R²  :", r2)

# -------------------------
# 9. Feature Importance
# -------------------------
feature_importance = pd.DataFrame({
    "feature": X.columns,
    "importance": lgbm_model.feature_importances_
}).sort_values(by="importance", ascending=False)

print("\n Top 10 Important Features (LightGBM)")
print(feature_importance.head(10))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2486
[LightGBM] [Info] Number of data points in the train set: 360, number of used features: 45
[LightGBM] [Info] Start training from score 16.596303
LIGHTGBM RESULTS
MAE : 1.7713036913575546
RMSE: 2.4964996991174266
R²  : 0.9496142793513447

 Top 10 Important Features (LightGBM)
                     feature  importance
16         performance_score         356
18    estimated_resale_value         330
11           goals_per_match         291
24   social_engagement_score         200
22   twitter_engagement_rate         193
13           defensive_index         190
2             minutes_played         180
4                      shots         153
29  days_injured_last_season         141
26  mention_volatility_score         132


