<a href="https://colab.research.google.com/github/MdFoysalBhuiyan/ML/blob/main/Ml%20paper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA
import xgboost as xgb

# Fatigue-status classification: polynomial expansion -> standardization -> PCA -> XGBoost.
# The scaler and PCA are data-dependent, so they are fitted on the training rows only and
# merely applied to the held-out rows. Fitting them on the full dataset (before the split)
# would leak test-set statistics into the features and inflate the reported scores.

# Load the dataset
file_path = '/content/fatigue_intervention_data.xlsx'
data = pd.read_excel(file_path)

# Step 1: Prepare the features and target variable
X = data.drop(columns=['fatigue_status'])  # Features (excluding the target variable)
y = data['fatigue_status']  # Target variable (0: pre-fatigue, 1: post-fatigue)

# Step 2: Apply Polynomial Features (degree 2) to capture non-linearities.
# The expansion is computed row by row (no statistics are learned from the data), so it is
# safe to apply it before splitting.
poly = PolynomialFeatures(degree=2, include_bias=False)  # include_bias=False to avoid a constant column
X_poly = poly.fit_transform(X)

# Keep the expanded features in a DataFrame so the generated column names stay inspectable
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X.columns))

# Step 3: Split into training and testing sets (80% / 20%) BEFORE fitting any
# data-dependent preprocessing.
X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly_df, y, test_size=0.2, random_state=42
)

# Step 4: Standardize using the mean / variance of the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

# Step 5: Dimensionality reduction with PCA, again fitted on the training rows only.
# random_state only matters when scikit-learn selects a randomized SVD solver; it is set so
# the projection is reproducible either way.
pca = PCA(n_components=6, random_state=42)  # Adjust n_components after inspecting the line printed below
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)
print(f"PCA explained variance retained by {pca.n_components_} components: "
      f"{pca.explained_variance_ratio_.sum():.4f}")

# Full-dataset views, derived from the train-fitted transformers. They are not used for
# training or evaluation below; they are kept so the names X_scaled / X_pca remain available.
X_scaled = scaler.transform(X_poly_df)
X_pca = pca.transform(X_scaled)

# Step 6: Model 1 - XGBoost Classifier
# `use_label_encoder` was dropped: current XGBoost ignores it and only emits a warning.
xgb_model = xgb.XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)

# Step 7: Evaluate the XGBoost model
print("XGBoost Classification Report (with Polynomial Features):")
print(classification_report(y_test, xgb_predictions))

# Step 8: Accuracy Score
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f"XGBoost Accuracy (with Polynomial Features): {xgb_accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Classification Report (with Polynomial Features):
              precision    recall  f1-score   support

           0       0.60      0.55      0.57       124
           1       0.56      0.60      0.58       116

    accuracy                           0.57       240
   macro avg       0.58      0.58      0.57       240
weighted avg       0.58      0.57      0.57       240

XGBoost Accuracy (with Polynomial Features): 0.5750


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Randomized hyper-parameter search for a Random Forest on the training split.

# Search space: each key is a RandomForestClassifier argument, each value the candidates to sample from
param_dist = dict(
    n_estimators=list(range(100, 501, 100)),   # number of trees: 100, 200, ..., 500
    max_features=['sqrt', 'log2'],             # features considered at each split
    max_depth=[*range(10, 51, 10), None],      # tree depth cap: 10, 20, ..., 50, or unlimited (None)
    min_samples_split=[2, 5, 10],              # minimum samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],                # minimum samples required in a leaf
    bootstrap=[True, False],                   # whether each tree is trained on a bootstrap sample
)

# Base estimator whose hyper-parameters are tuned
rf = RandomForestClassifier(random_state=42)

# 50 sampled configurations, each scored by 5-fold cross-validated accuracy.
# n_jobs=-1 runs the fits in parallel on all available cores; verbose=2 logs progress.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=50,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the training data only
random_search.fit(X_train, y_train)

# Keep the refitted best model for evaluation, then report the winning configuration
best_rf_model = random_search.best_estimator_
print("Best parameters found: ", random_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found:  {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}


In [None]:
# Score the tuned Random Forest on the held-out test split.
rf_predictions = best_rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)

# Per-class precision / recall / F1 first ...
print("Random Forest Classification Report (Tuned):")
print(classification_report(y_test, rf_predictions))

# ... followed by the overall accuracy
print(f"Random Forest Accuracy (Tuned): {rf_accuracy:.4f}")

Random Forest Classification Report (Tuned):
              precision    recall  f1-score   support

           0       0.59      0.58      0.59       124
           1       0.56      0.57      0.56       116

    accuracy                           0.57       240
   macro avg       0.57      0.57      0.57       240
weighted avg       0.58      0.57      0.58       240

Random Forest Accuracy (Tuned): 0.5750
