#  Classification Models - Machine Learning

## Import Libraries necessary for classification

In [None]:
# Importing necessary libraries
import warnings
warnings.filterwarnings("ignore")

# Data manipulation & visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Machine Learning models
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\manar\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\manar\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\manar\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\manar\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\manar\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\manar\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\manar\AppData\Roaming\Python\Python39\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\manar\AppData\Roaming\Python\Python39\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C

AttributeError: _ARRAY_API not found

## Load & prepare the Dataset
We load the cleaned dataset and display its structure.


In [None]:
# Location of the cleaned dataset, relative to the current working directory
data_path = "cleaned_final_data.csv"
# Read the CSV into the DataFrame that the following cells inspect and model
data = pd.read_csv(filepath_or_buffer=data_path)

FileNotFoundError: [Errno 2] No such file or directory: 'cleaned_final_data.csv'

In [None]:
# Check dataset shape: prints the (number of rows, number of columns) tuple
print(data.shape)

In [None]:
# Display dataset info: pandas prints a per-column summary (dtype and non-null count)
data.info()

In [None]:
# Display the first few rows (DataFrame.head() returns 5 rows by default)
print(data.head())

In [None]:
# Display the last few rows (DataFrame.tail() returns 5 rows by default)
print(data.tail())

In [None]:
# Define target and features
target = "market_value_category"

# Feature matrix: every column except the target
X = data.drop(columns=[target])

# Keep the fitted encoder instead of discarding it, so the integer class ids
# in `y` (and in every model's predictions) can be mapped back to the
# original category names via label_encoder.classes_ or
# label_encoder.inverse_transform(...).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[target])

NameError: name 'data' is not defined

In [None]:
# Identify categorical & numerical features
numerical_features = X.select_dtypes(include=['number']).columns.tolist()

# Treat every non-numeric column as categorical. Selecting only 'object'
# columns would leave out other non-numeric dtypes (e.g. bool or category);
# a column that appears in neither list is discarded by ColumnTransformer's
# default remainder='drop', i.e. it would silently vanish from the model input.
categorical_features = X.select_dtypes(exclude=['number']).columns.tolist()

In [None]:
# Show which columns will be one-hot encoded and which will be scaled
print(categorical_features)
print(numerical_features)

## Feature Engineering (Encoding & Scaling)

In [None]:
# Preprocessing: standardize the numerical columns and one-hot encode the
# categorical ones. Categories unseen during fit are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Transform the data
# The preprocessor is fitted on the training split only and then reused,
# already fitted, on the test split, so no test-set statistics influence
# the scaling or the encoded categories.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

##  Step 1: Training Logistic Regression Model


In [None]:
# Train Logistic Regression Model
# max_iter is raised above scikit-learn's default of 100 iterations: warnings
# are silenced globally at the top of this notebook, so a solver that stops
# before converging would otherwise go unnoticed and yield a worse model.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_transformed, y_train)

# Predictions on the held-out test split
y_pred_logistic = log_reg.predict(X_test_transformed)

# Model Evaluation: per-class precision/recall/F1 plus overall accuracy
print(classification_report(y_test, y_pred_logistic))
print("Accuracy:", accuracy_score(y_test, y_pred_logistic))

In [None]:
# Logistic Regression is used for classification problems. It predicts a probability for each class
# and assigns the class with the highest probability. Here, we evaluate it with a classification report
# (precision, recall, F1-score) and overall accuracy. It works best when the classes are linearly separable.

##  Step 2: Training Decision Tree 


In [None]:
# Train Decision Tree Model
# max_depth=5 limits tree depth to curb overfitting. random_state is fixed
# (same seed as the train/test split) because scikit-learn permutes the
# features at each split, so ties between equally good splits would otherwise
# be broken differently from run to run.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_train_transformed, y_train)

# Predictions on the held-out test split
y_pred_dt = dt.predict(X_test_transformed)

In [None]:
# Model Evaluation: per-class precision, recall and F1 of the decision tree on the test split
print(classification_report(y_test, y_pred_dt))

In [None]:
# Decision Trees classify data points by repeatedly splitting the data on feature conditions.
# We limit the tree to a max depth of 5 to reduce overfitting and improve generalization.

##  Step 3: Training The Random Forest with Hyperparameter Tuning


In [None]:
# Define parameter grid
rf_param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees
    'max_depth': [5, 10, 15]  # Limit tree depth to prevent overfitting
}

# Perform grid search (5-fold cross-validation, selecting by accuracy).
# The forest is seeded with the same value as the train/test split: without a
# fixed random_state the bootstrap samples and feature subsets differ on every
# run, so the selected hyperparameters and the saved model are not reproducible.
rf_grid = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
rf_grid.fit(X_train_transformed, y_train)

# Get the best model (GridSearchCV refits it on the whole training split by default)
best_rf = rf_grid.best_estimator_

# Predictions on the held-out test split
y_pred_rf = best_rf.predict(X_test_transformed)

NameError: name 'GridSearchCV' is not defined

In [None]:
# Model Evaluation: per-class precision, recall and F1 of the tuned random forest on the test split
print(classification_report(y_test, y_pred_rf))

In [None]:
# Save the best Random Forest model to "random_forest.pkl" in the working directory
joblib.dump(best_rf, "random_forest.pkl")

In [None]:
# Random Forest is an ensemble model that combines multiple Decision Trees.
# It reduces overfitting by aggregating the predictions of the individual trees.
# We used GridSearchCV to tune the number of trees (n_estimators) and the tree depth (max_depth) to find the best model.

## Step 4: Train a Basic SVM Model & Optimize 
We tune an **SVM with GridSearchCV** over `C` and the kernel, then evaluate the best model on the test set.


In [None]:
# Candidate hyperparameters: regularization strength C and the kernel type
svm_param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
)

# Exhaustive search over the grid with 5-fold cross-validation, ranked by accuracy
svm_grid = GridSearchCV(
    estimator=SVC(),
    param_grid=svm_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
svm_grid.fit(X_train_transformed, y_train)

# Best estimator found by the search, then its predictions on the test split
best_svm = svm_grid.best_estimator_
y_pred_best_svm = best_svm.predict(X_test_transformed)

In [None]:
# Model Evaluation: per-class precision, recall and F1 of the best SVM on the test split
print(classification_report(y_test, y_pred_best_svm))

In [None]:
# The RBF kernel often performs best when relationships are non-linear.

### Experiment with Different SVM Kernels
We test **linear, RBF, and polynomial kernels** to see which performs best.


In [None]:
# SVM kernels to compare, each trained with otherwise default SVC settings
kernels = ['linear', 'rbf', 'poly']

In [None]:
# Train one SVM per kernel and report its performance on the held-out test split
for kernel in kernels:
    print(f"\n Testing SVM with {kernel} kernel ")
    svm_model = SVC(kernel=kernel)
    svm_model.fit(X_train_transformed, y_train)

    # Predict on the TEST features: the report below compares against y_test,
    # so predicting on the training features would yield a different number of
    # samples than y_test (and would not measure generalization anyway).
    y_pred = svm_model.predict(X_test_transformed)

    print(classification_report(y_test, y_pred))

### Optimize SVM Hyperparameters using GridSearchCV
We use GridSearchCV to **find the best C and kernel**.

In [None]:
# Candidate values for the regularization strength C and the kernel type
param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly'],
)

# 5-fold cross-validated grid search over all C/kernel combinations, ranked by accuracy
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
)
grid_search.fit(X_train_transformed, y_train)

In [None]:
# Best parameters: the C/kernel combination with the highest mean cross-validation accuracy
print(f"Best SVM Parameters: {grid_search.best_params_}")

In [None]:
# Retrieve the best model. No extra training happens here: with the default
# refit=True, GridSearchCV has already refitted best_estimator_ on the data
# passed to fit(). This rebinds best_svm, which was assigned earlier from svm_grid.
best_svm = grid_search.best_estimator_

# Predict on the test split using the best model
y_pred_best_svm = best_svm.predict(X_test_transformed)

In [None]:
# Evaluate best SVM model: per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred_best_svm))

In [None]:
# SVM performed well, especially with the RBF kernel after tuning.
# It was computationally expensive but handled complex patterns better than Logistic Regression.

## Step 5: Train a Basic KNN Model & Optimize k
We start with **default k=5** and evaluate.


In [None]:
# Baseline KNN classifier using k=5 neighbours
knn_model = KNeighborsClassifier(n_neighbors=5)

# Fit the baseline on the preprocessed training split
knn_model.fit(X=X_train_transformed, y=y_train)

In [None]:
# Predict the test-split labels with the baseline (k=5) KNN model
y_pred_knn = knn_model.predict(X_test_transformed)

In [None]:
# Evaluate: per-class precision, recall and F1 of the baseline KNN on the test split
print(classification_report(y_test, y_pred_knn))

### Find Optimal k for KNN using Cross-Validation
We test different **values of k** to find the best one.


In [None]:
# Candidate neighbour counts: k = 1 .. 20
k_range = range(1, 21)

# Mean 5-fold cross-validation accuracy on the training split, one entry per k
# (scores[i] belongs to k_range[i])
scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k),
        X_train_transformed,
        y_train,
        cv=5,
        scoring='accuracy',
    ).mean()
    for k in k_range
]

In [None]:
# Line chart of mean cross-validation accuracy against the number of neighbours
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_range, scores, marker='o')
ax.set_xlabel('Number of Neighbors (k)')
ax.set_ylabel('Cross-Validation Accuracy')
ax.set_title('Finding the Best k for KNN')
plt.show()

In [None]:
# Position of the highest mean accuracy (the first one if several k tie) ...
best_index = np.argmax(scores)
# ... mapped back to the corresponding number of neighbours
best_k = k_range[best_index]
print(best_k)

In [None]:
# Final KNN classifier using the k selected by cross-validation,
# fitted on the preprocessed training split
best_knn = KNeighborsClassifier(n_neighbors=best_k)
best_knn.fit(X=X_train_transformed, y=y_train)

In [None]:
# Predict the test-split labels with the tuned KNN model
y_pred_best_knn = best_knn.predict(X_test_transformed)

# Model Evaluation: per-class precision, recall and F1 on the test split
print(classification_report(y_test, y_pred_best_knn))

In [None]:
# KNN accuracy depends on choosing the right k.
# Larger k values generalize better but may reduce sensitivity to patterns.

## Step 6: Compare All Classification Models
We compare **SVM, KNN, Logistic Regression, Decision Trees, and Random Forest**.


In [None]:
# Store model results: maps a display name to that model's test-split
# predictions (the values are prediction arrays, not the fitted estimators)
models = {
    "Logistic Regression": y_pred_logistic,  # Logistic Regression Predictions
    "Decision Trees": y_pred_dt,  # Decision Tree Predictions
    "Random Forest": y_pred_rf,  # Random Forest Predictions
    "SVM (Best)": y_pred_best_svm,  # Optimized SVM Predictions
    "KNN (Best)": y_pred_best_knn  # Optimized KNN Predictions
}

In [None]:
# Print one classification report per entry of `models`, scored against the test labels
for name, predictions in models.items():
    print(f"\n {name} Performance ")
    print(classification_report(y_test, predictions))

### Final comparison:

In [None]:
# SVM (optimized) and Random Forest performed best.
# KNN improved with tuning but remained less stable on larger datasets.

## Step 7: Save the Best Models


In [None]:
# Save trained models for deployment (files are written to the working directory)
# "random_forest.pkl" was already written right after the random forest search;
# this call overwrites it with the same best_rf object.
joblib.dump(best_rf, "random_forest.pkl")
# best_svm is the estimator selected by the last grid search that assigned it
joblib.dump(best_svm, "svm_model.pkl")
joblib.dump(best_knn, "knn_model.pkl")
# The fitted preprocessor is saved too: the models were trained on its output,
# so new data must go through the same transformation before prediction
joblib.dump(preprocessor, "preprocessor.pkl")