# In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset
df = pd.read_csv("fifa_players.csv")

# Columns that are not used as features (names, dates, positions and other
# descriptive fields).
irrelevant_columns = [
    'name', 'full_name', 'birth_date', 'positions', 'nationality',
    'preferred_foot', 'body_type', 'national_team', 'national_team_position',
    'national_jersey_number'
]
df_clean = df.drop(columns=irrelevant_columns)

# The list above is hard-coded, so enforce the "numeric only" intent
# explicitly: a leftover text column would make every regressor's fit() fail
# when it tries to convert strings to floats. When all remaining columns are
# already numeric this is a no-op.
df_clean = df_clean.select_dtypes(include=['number', 'bool'])

# Drop rows with a missing value in any remaining column.
# NOTE(review): if one of the kept columns is mostly empty this discards most
# of the dataset - worth checking df_clean.shape after this step.
df_clean = df_clean.dropna()

# Define features and target
X = df_clean.drop(columns=['overall_rating'])
y = df_clean['overall_rating']

# Hold out 20% of the rows for testing; random_state is fixed so the split
# is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Candidate regressors, keyed by display name. Insertion order is the order
# in which the models are iterated later on. The two ensemble models are
# seeded with random_state=42.
models = {}
models['Linear Regression'] = LinearRegression()
models['Random Forest'] = RandomForestRegressor(random_state=42)
models['Gradient Boosting'] = GradientBoostingRegressor(random_state=42)
models['KNN'] = KNeighborsRegressor()

# Train each model, score it on the test split, and save one
# actual-vs-predicted scatter plot per model.
results = {}

# The reference diagonal spans the range of the true ratings. That range does
# not depend on the model, so it is computed once instead of on every pass.
rating_min, rating_max = y_test.min(), y_test.max()

for name, model in models.items():
    # Fit on the training split, then predict the test rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Store evaluation metrics; MSE is computed once and reused for RMSE
    mse = mean_squared_error(y_test, y_pred)
    results[name] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mse,
        'RMSE': np.sqrt(mse),
        'R2': r2_score(y_test, y_pred)
    }

    # Scatter of actual vs predicted ratings; points on the dashed red
    # diagonal are predictions that match the actual rating exactly.
    plt.figure(figsize=(8, 6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6, s=50)
    plt.plot([rating_min, rating_max], [rating_min, rating_max], 'r--', label='Perfect Prediction')
    plt.xlabel("Actual Overall Rating")
    plt.ylabel("Predicted Overall Rating")
    plt.title(f"Actual vs Predicted Ratings for {name} Model")
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    # File name is derived from the model name, e.g. "Random Forest" ->
    # actual_vs_predicted_random_forest.png
    plt.savefig(f"actual_vs_predicted_{name.lower().replace(' ', '_')}.png", format="png")
    # Close the figure so figures do not accumulate across iterations
    plt.close()

# Convert results to a DataFrame (one row per model, one column per metric),
# sorted so the highest R2 comes first, and display it
results_df = pd.DataFrame(results).T.sort_values(by='R2', ascending=False)
print("Model Performance Comparison:\n")
print(results_df)

# Save model performance table as PNG
fig, ax = plt.subplots(figsize=(8, 4))
ax.axis('off')  # show only the table, not the axes frame and ticks
table = ax.table(
    cellText=np.round(results_df.values, 4),
    colLabels=results_df.columns,
    rowLabels=results_df.index,
    cellLoc='center',
    loc='center'
)
# Fix the font size manually instead of letting matplotlib shrink it to fit
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.5)
ax.set_title("Model Performance Comparison", fontsize=14, weight='bold')
# The row labels are drawn to the left of the table grid, and the grid itself
# is widened by scale(1.2, ...), so they can fall outside the default canvas.
# bbox_inches='tight' makes savefig size the output around everything drawn.
fig.savefig("model_performance_comparison.png", format="png", bbox_inches='tight')
plt.close(fig)

# Grouped bar chart: one group per model, one bar per metric, plotted on the
# metrics' original scale.
models_list = results_df.index
metrics = ['RMSE', 'MSE', 'R2']
bar_width = 0.25  # Width of each bar
x = np.arange(len(models_list))  # X positions for each model

# Horizontal shift and colour of each metric's bar within a model's group
bar_shift = {'RMSE': -bar_width, 'MSE': 0, 'R2': bar_width}
bar_color = {'RMSE': '#1f77b4', 'MSE': '#ff7f0e', 'R2': '#2ca02c'}

plt.figure(figsize=(10, 6))

# Draw all bars first, keeping each metric's bar container for labelling
bar_groups = {}
for metric in metrics:
    bar_groups[metric] = plt.bar(
        x + bar_shift[metric],
        results_df[metric],
        bar_width,
        label=metric,
        color=bar_color[metric],
    )

# Write the metric value just above each bar, centred horizontally
for metric, bars in bar_groups.items():
    for bar, value in zip(bars, results_df[metric]):
        label_x = bar.get_x() + bar.get_width() / 2
        plt.text(label_x, bar.get_height(), f'{value:.4f}',
                 ha='center', va='bottom', fontsize=9)

# Axis labels, title, ticks and legend
plt.xlabel("Model")
plt.ylabel("Metric Value")
plt.title("Metrics Comparison Across All Models (Original Scale)")
plt.xticks(x, models_list, rotation=0)  # horizontal model names
plt.legend(loc='upper left')
plt.tight_layout()
plt.savefig("metrics_comparison_original_scale.png", format="png")
plt.close()

# Build a side-by-side table of the actual test-set ratings and each model's
# predictions. Columns are collected first and the DataFrame is built once.
prediction_columns = {'Actual Rating': y_test}

for name, model in models.items():
    prediction_columns[f'Predicted ({name})'] = model.predict(X_test)

# Rows keep the index of y_test; the prediction arrays line up with it by position
predictions_df = pd.DataFrame(prediction_columns, index=y_test.index)

# Show the first 20 rows
print("\nPredictions on Test Set (including actual ratings):\n")
print(predictions_df.head(20))