In [1]:
import importlib

def install_if_not_installed(package):
    try:
        importlib.import_module(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"{package} is not installed. Installing...")
        !pip install {package}
        print(f"{package} has been successfully installed.")

# Check and install ucimlrepo if not installed
install_if_not_installed("ucimlrepo")

# Import and use ucimlrepo
import ucimlrepo

ucimlrepo is already installed.


In [2]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

import warnings     # filter warning messages
warnings.simplefilter(action="ignore")

In [3]:
# Load data into DataFrame

spambase = fetch_ucirepo(id=94)

X = pd.DataFrame(spambase.data.features)
y = pd.DataFrame(spambase.data.targets)

In [4]:
X.head()

Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,word_freq_conference,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.0,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191


In [5]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Train the logistic regression model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = model.predict(X_test)

In [6]:
# Model Evaluation

from sklearn.metrics import (confusion_matrix, accuracy_score,
                             precision_score, recall_score,
                             f1_score, classification_report)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print()

conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)
print()

tn, fp, fn, tp = conf_matrix.ravel()
print("True Positives:", tp)
print("False Positives:", fp)
print("True Negatives:", tn)
print("False Negatives:", fn)
print()

accuracy = accuracy_score(y_test, y_pred)
error = 1 - accuracy
print(f"Accuracy: {accuracy:.3f}")
print(f"Error: {error:.3f}")
print()

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

print()

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       676
           1       0.92      0.90      0.91       475

    accuracy                           0.93      1151
   macro avg       0.93      0.92      0.92      1151
weighted avg       0.93      0.93      0.93      1151


Confusion Matrix:
[[641  35]
 [ 49 426]]

True Positives: 426
False Positives: 35
True Negatives: 641
False Negatives: 49

Accuracy: 0.927
Error: 0.073

Precision: 0.924
Recall: 0.897
F1 Score: 0.910



In [7]:
# Print the coefficients of the features
coefficients = model.coef_

# Store coefficients and feature names in a list of dictionaries
coefficients_data = [{"Feature": f"Feature {i+1}",
                      "Coefficient": coef}
                     for i, coef in enumerate(coefficients[0])]

# Create DataFrame
coefficients_df = pd.DataFrame(coefficients_data)

# Print DataFrame
print("DataFrame of coefficients:")
print(coefficients_df.head())

DataFrame of coefficients:
     Feature  Coefficient
0  Feature 1    -0.046781
1  Feature 2    -0.137094
2  Feature 3     0.165656
3  Feature 4     0.243974
4  Feature 5     0.885636


In [8]:
# Make predictions on the testing data
y_probs = model.predict_proba(X_test)[:, 1]  # Probabilities of positive class
thresholds = [0.25, 0.50, 0.75, 0.90]

print("Metrics for different thresholds:")
for threshold in thresholds:
    # Classify instances based on the threshold
    y_pred_thresholded = (y_probs >= threshold).astype(int)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_thresholded)
    precision = precision_score(y_test, y_pred_thresholded)
    recall = recall_score(y_test, y_pred_thresholded)

    # Print the metrics
    print(f"\nThreshold = {threshold}")
    print(f"Accuracy: {accuracy:.3f}, \nPrecision: {precision:.3f}, \nRecall: {recall:.3f}.")

Metrics for different thresholds:

Threshold = 0.25
Accuracy: 0.886, 
Precision: 0.801, 
Recall: 0.964.

Threshold = 0.5
Accuracy: 0.927, 
Precision: 0.924, 
Recall: 0.897.

Threshold = 0.75
Accuracy: 0.877, 
Precision: 0.959, 
Recall: 0.735.

Threshold = 0.9
Accuracy: 0.795, 
Precision: 0.969, 
Recall: 0.520.
