In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb


In [28]:
data = pd.read_csv("fifa_players.csv")


In [32]:
features = [
    "age", "height_cm", "weight_kgs", "overall_rating", "potential",
    "wage_euro", "crossing", "finishing", "dribbling", "stamina", "strength", "vision"
]
target = "value_euro"

In [34]:
data_clean = data[features + [target]].dropna()
X = pd.get_dummies(data_clean[features]) 
y = data_clean[target]

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [39]:
y_pred = model.predict(X_test)
print("RMSE:", mean_squared_error(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

RMSE: 2408220238194.4443
MAE: 823443.6868686868
R²: 0.9692901733416693


In [40]:
importance = pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_})
print(importance.sort_values("Importance", ascending=False))

           Feature  Importance
3   overall_rating    0.827486
4        potential    0.092640
0              age    0.027352
7        finishing    0.014222
8        dribbling    0.010699
6         crossing    0.008769
11          vision    0.004618
9          stamina    0.004205
5        wage_euro    0.004011
2       weight_kgs    0.002564
1        height_cm    0.001724
10        strength    0.001709
