In [1]:
import os
import pandas as pd

file_path = os.path.join('data', 'processed', 'preprocessed_data.csv')
data = pd.read_csv(file_path)

In [2]:
from sklearn.model_selection import train_test_split

# Define Features (X) and Target (y)
X = data.drop(columns=['Market value']) 
y = data['Market value']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

feature_columns = X.columns 

X_train_df = pd.DataFrame(X_train, columns=feature_columns)
X_test_df = pd.DataFrame(X_test, columns=feature_columns)
y_train_series = pd.Series(y_train, name='target')  
y_test_series = pd.Series(y_test, name='target')


import os

# Define the directory path
processed_dir = os.path.join(os.getcwd(), 'data', 'processed')

# Create the directory if it doesn't exist
os.makedirs(processed_dir, exist_ok=True)

# Save the datasets as CSV files in the processed directory
X_train_df.to_csv(os.path.join(processed_dir, 'X_train.csv'), index=False)
X_test_df.to_csv(os.path.join(processed_dir, 'X_test.csv'), index=False)
y_train_series.to_csv(os.path.join(processed_dir, 'y_train.csv'), index=False)
y_test_series.to_csv(os.path.join(processed_dir, 'y_test.csv'), index=False)

print(f"Files have been saved to: {processed_dir}")

Files have been saved to: c:\Users\mehra\Clones\LHL-Final-Project\notebooks\data\processed


In [3]:
X_train.fillna(0, inplace=True)  # Replace NaN with 0
X_test.fillna(0, inplace=True)  # Same for test data

# Drop irrelevant columns
X_train.drop(columns=['Player'], inplace=True)
X_test.drop(columns=['Player'], inplace=True)

# Process Date of birth/Age
X_train['Age'] = X_train['Date of birth/Age'].str.extract(r'\((\d+)\)').astype(float)
X_test['Age'] = X_test['Date of birth/Age'].str.extract(r'\((\d+)\)').astype(float)
X_train.drop(columns=['Date of birth/Age'], inplace=True)
X_test.drop(columns=['Date of birth/Age'], inplace=True)

# Process Height
X_train['Height'] = X_train['Height'].str.replace(',', '.', regex=False).str.replace('m', '').astype(float)
X_test['Height'] = X_test['Height'].str.replace(',', '.', regex=False).str.replace('m', '').astype(float)

# Encode Position column
X_train = pd.get_dummies(X_train, columns=['Position'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['Position'], drop_first=True)

# One-hot encode the object columns
categorical_columns = ['Foot', 'Contract', 'Nat.', 'Age_Group']
X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
X_test = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)

# Align test set with train set
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Verify the processed features
print(X_train.dtypes)
print("Missing values in X_train:", X_train.isnull().sum().sum())


Matches_Played                 int64
Starts                         int64
Minutes_Played               float64
90s_Played                   float64
Goals                        float64
                              ...   
Nat._['France', 'Congo']        bool
Nat._['Morocco', 'Spain']       bool
Age_Group_18-23                 bool
Age_Group_24-28                 bool
Age_Group_29-33                 bool
Length: 65, dtype: object
Missing values in X_train: 0


In [4]:
# Verify data types
print("Feature data types (X_train):")
print(X_train.dtypes)

# Check for missing values again
print("Missing values in processed X_train:", X_train.isnull().sum().sum())


Feature data types (X_train):
Matches_Played                 int64
Starts                         int64
Minutes_Played               float64
90s_Played                   float64
Goals                        float64
                              ...   
Nat._['France', 'Congo']        bool
Nat._['Morocco', 'Spain']       bool
Age_Group_18-23                 bool
Age_Group_24-28                 bool
Age_Group_29-33                 bool
Length: 65, dtype: object
Missing values in processed X_train: 0


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Clean column names for compatibility
X_train.columns = X_train.columns.astype(str).str.replace(r'[\[\]<>,]', '', regex=True)
X_test.columns = X_test.columns.astype(str).str.replace(r'[\[\]<>,]', '', regex=True)

models = {
    'Linear Regression': LinearRegression(),
    'SVM': SVR(kernel='rbf'),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42)
}

# Dictionary to store model performance
model_performance = []

# Train and evaluate each model
for name, model in models.items():
    if name == 'SVM':  # SVM requires scaled data
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
    else:  # Other models use unscaled data
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    model_performance.append((name, mae, mse, r2))

    print(f"{name}:")
    print(f"  MAE: {mae:.2f}")
    print(f"  MSE: {mse:.2f}")
    print(f"  R² Score: {r2:.2f}")
    print()


Linear Regression:
  MAE: 37.64
  MSE: 2236.47
  R² Score: -0.07

SVM:
  MAE: 41.48
  MSE: 2689.33
  R² Score: -0.29

Random Forest:
  MAE: 14.37
  MSE: 343.53
  R² Score: 0.83

XGBoost:
  MAE: 13.78
  MSE: 336.78
  R² Score: 0.84



In [6]:
import pandas as pd

# Create a DataFrame for performance comparison
performance_df = pd.DataFrame(model_performance, columns=['Model', 'MAE', 'MSE', 'R2'])
performance_df.sort_values(by='MAE', ascending=True, inplace=True)

print(performance_df)


               Model        MAE          MSE        R2
3            XGBoost  13.777180   336.776154  0.838166
2      Random Forest  14.365000   343.526637  0.834922
0  Linear Regression  37.638514  2236.467792 -0.074708
1                SVM  41.480349  2689.331012 -0.292326
