# Final Report - Machine Learning Project


## Step 1: Loading the Data
We start by importing the necessary libraries and loading the dataset.


In [None]:
# Ignore warnings for cleaner output
import warnings
warnings.filterwarnings("ignore")

# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Read the project dataset from disk into the DataFrame used by every later step.
data_path = "final_data.csv"  # Relative to the notebook's working directory
print("Loading dataset from file...")
data = pd.read_csv(data_path)
print("Data Loaded Successfully!")

# Show the schema (dtypes, non-null counts) and then a preview of the first rows.
print("\nDataset Info:")
data.info()
print("\nFirst 5 Rows:", data.head(), sep="\n")

## Step 2: Exploratory Data Analysis (EDA)
Before we train our models, we analyze the dataset to check for:
- **Missing values**
- **Duplicates**
- **Outliers**
- **Feature correlations**

In [None]:
# Checking for missing values (count of nulls per column)
print("\nMissing Values:")
print(data.isnull().sum())

# Checking for duplicates (number of fully repeated rows)
print("\nDuplicate Rows:", data.duplicated().sum())

# Checking summary statistics
print("\nSummary Statistics:")
print(data.describe())

# Checking correlation between features.
# Categorical columns are only encoded later (Step 3), so at this point the
# frame may still hold string columns. Since pandas 2.0 `DataFrame.corr()`
# raises on non-numeric data instead of silently skipping it, so restrict the
# correlation to numeric/boolean columns explicitly.
numeric_data = data.select_dtypes(include=["number", "bool"])
plt.figure(figsize=(12, 6))
sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()


## Step 3: Data Preprocessing
### Removing duplicate rows and one-hot encoding categorical variables. Missing values are only inspected in Step 2; this step does not impute or drop them.


In [None]:
# Drop duplicates if any.
# Count once so the message reports what was actually done: the removal
# message belongs to the branch that removes rows, and the "nothing to do"
# message to the branch that leaves the data untouched.
duplicate_count = int(data.duplicated().sum())
if duplicate_count > 0:
    data.drop_duplicates(inplace=True)
    print(f"\nRemoved {duplicate_count} duplicate rows.")
else:
    print("\nNo duplicates to remove!")

# Encoding categorical features.
# get_dummies one-hot encodes every object/category column; drop_first=True
# omits the first level of each so the dummy columns are not perfectly collinear.
# NOTE: missing values are not imputed or dropped in this step.
print("\nEncoding categorical variables...")
data = pd.get_dummies(data, drop_first=True)

# Save cleaned dataset (index omitted so the CSV contains only data columns)
cleaned_data_path = "cleaned_final_data.csv"
data.to_csv(cleaned_data_path, index=False)
print(f"Cleaned data saved at: {cleaned_data_path}")


## Step 4: Regression Model Training



In [None]:
# Import ML libraries
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define features and target variable
target = "market_value"  # Modify based on dataset

# "market_value_category" is the label predicted in Step 5 and is dropped
# together with `market_value` as a target in Step 6. Leaving it among the
# regression features would leak the target into the inputs (presumably it is
# a binned form of market_value -- confirm against the dataset), so exclude it.
# errors="ignore" keeps this working when the column is absent.
leakage_columns = ["market_value_category"]
X = data.drop(columns=[target]).drop(columns=leakage_columns, errors="ignore")
y = data[target]

# Train-test split: 80% train / 20% test, seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features. The scaler is fit on the training split only and then
# applied to the test split, so no test-set statistics reach the model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Linear Regression model
lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

# Predictions on the held-out split
y_pred = lin_reg.predict(X_test_scaled)

# Model Evaluation (RMSE is the square root of the mean squared error)
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R² Score:", r2_score(y_test, y_pred))


## Step 5: Classification Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define Classification Target Variable
target_classification = "market_value_category"

# `market_value` is the Step 4 regression target and is dropped alongside the
# category label in Step 6. Keeping it as a classifier input would leak the
# label (presumably the category is derived from it -- confirm against the
# dataset), so remove it from the features. errors="ignore" tolerates its absence.
X_class = data.drop(columns=[target_classification]).drop(
    columns=["market_value"], errors="ignore"
)
y_class = data[target_classification]

# Train-test split: 80% train / 20% test, seeded for reproducibility
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_class, y_class, test_size=0.2, random_state=42)

# Standardizing features with a dedicated scaler. Refitting the shared `scaler`
# here would overwrite the statistics learned for the regression features in
# Step 4, which are needed to preprocess inputs for the regression model.
clf_scaler = StandardScaler()
X_train_c_scaled = clf_scaler.fit_transform(X_train_c)
X_test_c_scaled = clf_scaler.transform(X_test_c)

# Train Random Forest Classifier (100 trees, depth capped at 10, fixed seed)
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_clf.fit(X_train_c_scaled, y_train_c)

# Predictions on the held-out split
y_pred_rf = rf_clf.predict(X_test_c_scaled)

In [None]:
# Report per-class metrics first, then the overall accuracy, for the
# random-forest predictions on the held-out classification split.
print(classification_report(y_true=y_test_c, y_pred=y_pred_rf))
print("Accuracy:", accuracy_score(y_true=y_test_c, y_pred=y_pred_rf))

## Step 6: K-Means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Clustering is unsupervised: remove both target columns (if present) so that
# only descriptive features drive the grouping.
target_columns = ["market_value", "market_value_category"]
X_cluster = data.drop(columns=target_columns, errors='ignore')

# Put every feature on the same scale before computing distances
X_cluster_scaled = scaler.fit_transform(X_cluster)

# Elbow Method: fit one model per candidate k and record its inertia
# (Within-Cluster Sum of Squares). `kmeans` is left bound to the last model fit.
k_values = range(1, 15)
wcss = []
for k in k_values:
    kmeans = KMeans(
        n_clusters=k,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=42,
    ).fit(X_cluster_scaled)
    wcss.append(kmeans.inertia_)

# Draw inertia against k; the "elbow" marks where adding clusters stops paying off
plt.figure(figsize=(8, 4))
plt.plot(k_values, wcss, marker='o', linestyle='--', color='b')
plt.title("Elbow Method to Find Optimal k")
plt.xlabel("Number of Clusters (k)")
plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
plt.show()

## Step 7: Best Model Selection & Deployment


In [None]:
# Persist the trained estimators to disk with joblib so they can be reloaded later.
# NOTE(review): both models were fit on standardized features; the fitted
# scaler is not written out by this cell.
for fitted_model, output_file in (
    (lin_reg, "linear_regression.pkl"),
    (rf_clf, "random_forest.pkl"),
):
    joblib.dump(fitted_model, output_file)
print("✅ Models saved successfully!")

In [None]:
# Save clustered dataset.
# The `kmeans` object left behind by the Step 6 elbow loop is simply the last
# model tried (k=14), so reusing it would always write 14 clusters regardless
# of the data. Choose k explicitly instead: fit each candidate and keep the
# one with the highest silhouette score.
n_rows = X_cluster_scaled.shape[0]
best_score = None
# silhouette_score needs between 2 and n_rows - 1 distinct labels.
for candidate_k in range(2, min(15, n_rows)):
    candidate_model = KMeans(
        n_clusters=candidate_k, init="k-means++", max_iter=300, n_init=10, random_state=42
    )
    candidate_labels = candidate_model.fit_predict(X_cluster_scaled)
    if len(set(candidate_labels)) < 2:
        continue  # degenerate fit; silhouette is undefined for a single cluster
    # Score on a capped random sample: the silhouette is quadratic in sample count.
    candidate_score = silhouette_score(
        X_cluster_scaled,
        candidate_labels,
        sample_size=min(n_rows, 5000),
        random_state=42,
    )
    if best_score is None or candidate_score > best_score:
        best_score = candidate_score
        kmeans = candidate_model
        cluster_labels = candidate_labels

if best_score is None:
    raise ValueError("Not enough distinct samples to form at least two clusters.")

print(f"Selected k={kmeans.n_clusters} (silhouette score: {best_score:.3f})")
data["Cluster"] = cluster_labels
data.to_csv("clustered_players.csv", index=False)
print("✅ Clustered data saved successfully!")