In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

In [3]:
try:
    df_fifa = pd.read_csv('fifa20_data.csv')
    if 'Value' in df_fifa.columns and df_fifa['Value'].dtype == 'object':
        df_fifa['Value_Numeric'] = df_fifa['Value'].replace('[\€,M,K]', '', regex=True)
        df_fifa['Value_Numeric'] = pd.to_numeric(df_fifa['Value_Numeric'], errors='coerce')
        df_fifa.loc[df_fifa['Value'].str.contains('M', na=False), 'Value_Numeric'] *= 1000000
        df_fifa.loc[df_fifa['Value'].str.contains('K', na=False), 'Value_Numeric'] *= 1000
    else:
        df_fifa['Value_Numeric'] = df_fifa['Value']
    features_q10 = ['Overall', 'Potential', 'Age', 'Total Stats']
    df_q10 = df_fifa[features_q10 + ['Value_Numeric']].dropna()
    df_q10 = df_q10[df_q10['Value_Numeric'] > 0]
    X_q10_with = df_q10[features_q10]
    X_q10_without = df_q10[['Overall', 'Age', 'Total Stats']]
    y_q10 = df_q10['Value_Numeric']
    model_with = LinearRegression()
    model_with.fit(X_q10_with, y_q10)
    model_without = LinearRegression()
    model_without.fit(X_q10_without, y_q10)
    print("WITH ALL FEATURES (including Potential):")
    print(f"  Intercept: €{model_with.intercept_:,.2f}")
    for feat, coef in zip(features_q10, model_with.coef_):
        print(f"  {feat}: €{coef:,.2f}")
    print(f"  R²: {model_with.score(X_q10_with, y_q10):.4f}")
    print("\nWITHOUT Potential:")
    print(f"  R²: {model_without.score(X_q10_without, y_q10):.4f}")
    normalized_impact = np.abs(model_with.coef_ * X_q10_with.std().values)
    max_impact_feature = features_q10[np.argmax(normalized_impact)]
    print(f"\nAttribute that increases value most: {max_impact_feature}")
    print("\nCOMPARISON:")
    r2_diff = model_with.score(X_q10_with, y_q10) - model_without.score(X_q10_without, y_q10)
    print(f"  R² improvement with Potential: {r2_diff:.4f}")
    print(f"  Potential {'significantly improves' if r2_diff > 0.05 else 'slightly improves'} prediction")
except FileNotFoundError:
    print("FIFA dataset (data.csv) not found. Please download from Kaggle.")
except Exception as e:
    print(f"Error: {e}")

  df_fifa = pd.read_csv('fifa20_data.csv')


WITH ALL FEATURES (including Potential):
  Intercept: €-30,711,613.41
  Overall: €319,744.64
  Potential: €319,744.64
  Age: €-325,537.85
  Total Stats: €-578.12
  R²: 0.4784

WITHOUT Potential:
  R²: 0.4784

Attribute that increases value most: Overall

COMPARISON:
  R² improvement with Potential: 0.0000
  Potential slightly improves prediction
