<a href="https://colab.research.google.com/github/Muthon1/DataScience/blob/main/Phase_3_Hyper_Parameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install numpy pandas scikit-learn matplotlib

# Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score



In [None]:
# Load the 20 Newsgroup dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [5]:
# Hyperparameter Tuning
# Vectorize text using TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Split data into training and test sets
# Use the vectorized data 'X' instead of 'newsgroups.data'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV  # Import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
import numpy as np  # Import numpy for random distributions

# Define the parameter distribution for RandomizedSearchCV
param_dist = {
    'criterion': ['entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'auto', 'sqrt', 'log2'],
}

# Create the Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Set up RandomizedSearchCV
random_search = RandomizedSearchCV(dt_classifier, param_dist, n_iter=20, cv=5, n_jobs=-1, verbose=2, random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Output the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

# Evaluate the best model on the test set
best_dt_classifier = random_search.best_estimator_
dt_predictions = best_dt_classifier.predict(X_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


25 fits failed out of a total of 100.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
skle

Best parameters found:  {'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': None, 'max_depth': 20, 'criterion': 'entropy'}
Best cross-validation score:  0.295966564172946


In [6]:
# Naive Bayes Hyperparameter Tuning
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the parameter grid for tuning Naive Bayes
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],   # Smoothing parameter (Laplace smoothing)
    'fit_prior': [True, False],            # Whether to learn class prior probabilities or not
}

# Create the Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# Set up GridSearchCV to find the best parameters using cross-validation
grid_search = GridSearchCV(nb_classifier, param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Output the best parameters and best cross-validation score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: ", grid_search.best_score_)

# Evaluate the best model on the test set
best_nb_classifier = grid_search.best_estimator_
nb_predictions = best_nb_classifier.predict(X_test)

print("Naive Bayes (Tuned) Performance:")
print(f"Accuracy: {accuracy_score(y_test, nb_predictions)}")
print(classification_report(y_test, nb_predictions))

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found:  {'alpha': 0.1, 'fit_prior': False}
Best cross-validation score:  0.7565004112945439
Naive Bayes (Tuned) Performance:
Accuracy: 0.7546419098143236
              precision    recall  f1-score   support

           0       0.35      0.52      0.42       151
           1       0.73      0.75      0.74       202
           2       0.71      0.65      0.68       195
           3       0.61      0.77      0.68       183
           4       0.83      0.73      0.78       205
           5       0.89      0.82      0.86       215
           6       0.84      0.70      0.76       193
           7       0.83      0.80      0.81       196
           8       0.84      0.76      0.80       168
           9       0.93      0.85      0.89       211
          10       0.93      0.90      0.92       198
          11       0.79      0.82      0.80       201
          12       0.81      0.70      0.75       202
          13

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_selection import SelectKBest, chi2
import numpy as np

#Feature Selection (if not already done)
selector = SelectKBest(chi2, k=1000)  # Choose desired k value
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Define a Reduced Parameter Distribution
param_dist = {
    'n_estimators': [100, 150],  # Reduced range
    'criterion': ['entropy'],
    'max_depth': [None, 10, 20],  # Fewer options
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],  # Reduced options
}

# Create the Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Set up RandomizedSearchCV with Reduced Iterations and Folds
random_search = RandomizedSearchCV(
    rf_classifier,
    param_dist,
    n_iter=10,  # Reduced iterations
    cv=3,       # Reduced folds (if necessary)
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit the model
# Using a subset:
# X_subset = X_train_selected[:5000]
# y_subset = y_train[:5000]
# random_search.fit(X_subset, y_subset)

# Fit on the full (selected) data:
random_search.fit(X_train_selected, y_train)

# Output the best parameters and score
print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

# Evaluate the best model on the test set
best_rf_classifier = random_search.best_estimator_
rf_predictions = best_rf_classifier.predict(X_test_selected)

print("Random Forest (Tuned) Performance:")
print(f"Accuracy: {accuracy_score(y_test, rf_predictions)}")
print(classification_report(y_test, rf_predictions))

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters found:  {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': None, 'criterion': 'entropy'}
Best cross-validation score:  0.5848370298659243
Random Forest (Tuned) Performance:
Accuracy: 0.5822281167108754
              precision    recall  f1-score   support

           0       0.54      0.32      0.40       151
           1       0.58      0.48      0.53       202
           2       0.57      0.64      0.60       195
           3       0.44      0.58      0.50       183
           4       0.69      0.54      0.61       205
           5       0.76      0.64      0.69       215
           6       0.64      0.64      0.64       193
           7       0.69      0.62      0.65       196
           8       0.24      0.72      0.36       168
           9       0.74      0.57      0.64       211
          10       0.70      0.83      0.76       198
          11     

# Significance of Hyperparameter tuning
Hyperparameter tuning is crucial for building high-performance machine learning models. Selection of the right hyperparameters significantly improves accuracy, reduces overfitting and speeds up training making the models more efficient.