In [1]:
#Authors
#Ganesh Sarla
#Sai Akhil Gourvarapu

In [None]:
#Libraries required
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import time
from tabulate import tabulate



In [2]:
# Notebook-wide configuration shared by the cells below.

# Column titles for the per-fold results table: fold number, then one column
# per (metric, classifier) pair.
headers = (
    ["Fold"]
    + ["Accuracy (Logistic)", "Accuracy (SVM)", "Accuracy (Random Forest)"]
    + ["F1 Score (Logistic)", "F1 Score (SVM)", "F1 Score (Random Forest)"]
    + ["Time (Logistic)", "Time (SVM)", "Time (Random Forest)"]
)

# Column titles for the table of mean ranks per classifier.
final_rank_headers = ["Classifier", "Accuracy Rank", "F1 Score Rank", "Time Rank"]

# Display names, in the same column order as the metric arrays.
Classifiers = ["Logistic Regression", "SVM", "Random Forest"]

# Cross-validation setup.
K_fold = 10      # number of stratified folds
n = K_fold       # number of folds, as used in the Friedman/Nemenyi formulas
k = 3            # number of classifiers compared (indexes into Classifiers)
iteration = 0    # fold counter advanced by the cross-validation loop

# Running totals for the two sums of squared rank differences, per metric.
Sum_of_squared_differences_for_accuracy_1 = Sum_of_squared_differences_for_accuracy_2 = 0
Sum_of_squared_differences_for_f1_1 = Sum_of_squared_differences_for_f1_2 = 0
Sum_of_squared_differences_for_Time_1 = Sum_of_squared_differences_for_Time_2 = 0

# Threshold the Friedman statistic is compared against, and the multiplier
# used to compute the critical difference for the pairwise comparison.
# NOTE(review): presumably table values for k=3, n=10 at alpha=0.05 -- confirm
# before changing K_fold or the set of classifiers.
critical_value = 7.8
q = 2.343

In [3]:
# Column labels applied to the raw CSV, grouped by feature family.
column_names = (
    # Word-frequency features
    [
        "word_freq_make", "word_freq_address", "word_freq_all", "word_freq_3d",
        "word_freq_our", "word_freq_over", "word_freq_remove", "word_freq_internet",
        "word_freq_order", "word_freq_mail", "word_freq_receive", "word_freq_will",
        "word_freq_people", "word_freq_report", "word_freq_addresses", "word_freq_free",
        "word_freq_business", "word_freq_email", "word_freq_you", "word_freq_credit",
        "word_freq_your", "word_freq_font", "word_freq_000", "word_freq_money",
        "word_freq_hp", "word_freq_hpl", "word_freq_george", "word_freq_650",
        "word_freq_lab", "word_freq_labs", "word_freq_telnet", "word_freq_857",
        "word_freq_data", "word_freq_415", "word_freq_85", "word_freq_technology",
        "word_freq_1999", "word_freq_parts", "word_freq_pm", "word_freq_direct",
        "word_freq_cs", "word_freq_meeting", "word_freq_original", "word_freq_project",
        "word_freq_re", "word_freq_edu", "word_freq_table", "word_freq_conference",
    ]
    # Character-frequency features
    + ["char_freq_;", "char_freq_(", "char_freq_[", "char_freq_!", "char_freq_$", "char_freq_#"]
    # Capital-run-length features
    + ["capital_run_length_average", "capital_run_length_longest", "capital_run_length_total"]
    # Target label
    + ["spam_class"]
)
data = pd.read_csv('spambase.data', names=column_names)

# Split the frame into the feature matrix and the target column.
Y = data["spam_class"]
X = data.drop(columns=["spam_class"])

# Fold generator: shuffled, stratified, fixed seed.
cv = StratifiedKFold(n_splits=K_fold, shuffle=True, random_state=1)

# The three classifiers under comparison. Only the SVM is wrapped in a
# pipeline that standardises the features first.
logic_classifier = LogisticRegression(max_iter=5000)
svm_classifier = make_pipeline(StandardScaler(), SVC(gamma="auto"))
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Per-fold metric values: one row per fold, one column per classifier.
Data_of_accuracy = np.zeros((K_fold, 3))
Data_of_f1 = np.zeros_like(Data_of_accuracy)
Data_of_time = np.zeros_like(Data_of_accuracy)

# Per-fold ranks of the classifiers on each metric, same layout as above.
accuracy_ranks, f1_ranks, time_ranks = (np.zeros((K_fold, 3)) for _ in range(3))

In [4]:
# Run the stratified K-fold cross-validation: for every fold, fit each
# classifier once, record its training time, accuracy and F1 score on the
# held-out fold, and rank the classifiers on each metric.

# Per-fold rows for the results table and for the ranks table.
fold_data_list = []
rank_data_list = []

# Column order of the metric arrays: 0 = logistic, 1 = SVM, 2 = random forest.
models = [logic_classifier, svm_classifier, rf_classifier]

# `iteration` is the 1-based fold number. Driving it from enumerate (instead of
# relying on a counter initialised in another cell) keeps this cell re-runnable.
for iteration, (train_index, test_index) in enumerate(cv.split(X, Y), start=1):
    row = iteration - 1
    X_train_cv, X_test_cv = X.iloc[train_index], X.iloc[test_index]
    Y_train_cv, Y_test_cv = Y.iloc[train_index], Y.iloc[test_index]

    for col, model in enumerate(models):
        # Fit exactly once and time that fit; perf_counter is a monotonic,
        # high-resolution clock intended for measuring intervals.
        start_time = time.perf_counter()
        model.fit(X_train_cv, Y_train_cv)
        Data_of_time[row, col] = time.perf_counter() - start_time

        # Predict once and derive both quality metrics from the same predictions.
        y_pred = model.predict(X_test_cv)
        Data_of_accuracy[row, col] = accuracy_score(Y_test_cv, y_pred)
        Data_of_f1[row, col] = f1_score(Y_test_cv, y_pred)

    # Rank the classifiers within this fold (1 = best). Accuracy and F1 are
    # negated so that the highest score gets rank 1; time is ranked ascending.
    # Tied values share the average of the ranks they span, which is what the
    # Friedman test expects (argsort would break ties arbitrarily).
    accuracy_ranks[row, :] = pd.Series(-Data_of_accuracy[row, :]).rank(method="average").to_numpy()
    f1_ranks[row, :] = pd.Series(-Data_of_f1[row, :]).rank(method="average").to_numpy()
    time_ranks[row, :] = pd.Series(Data_of_time[row, :]).rank(method="average").to_numpy()

    fold_data_list.append(
        [iteration, *Data_of_accuracy[row, :], *Data_of_f1[row, :], *Data_of_time[row, :]]
    )
    rank_data_list.append(
        [iteration, *accuracy_ranks[row, :], *f1_ranks[row, :], *time_ranks[row, :]]
    )

# Create DataFrames from the lists.
# rank_df reuses the metric headers; its cells hold ranks, not metric values.
fold_df = pd.DataFrame(fold_data_list, columns=headers)
rank_df = pd.DataFrame(rank_data_list, columns=headers)

# Mean and (population) standard deviation of each metric per classifier.
average_accuracy = np.mean(Data_of_accuracy, axis=0)
std_accuracy = np.std(Data_of_accuracy, axis=0)
average_f1 = np.mean(Data_of_f1, axis=0)
std_f1 = np.std(Data_of_f1, axis=0)
average_time = np.mean(Data_of_time, axis=0)
std_time = np.std(Data_of_time, axis=0)

print("\nAverage Accuracy:", average_accuracy)
print("Standard Deviation Accuracy:", std_accuracy)
print("\nAverage F1 Score:", average_f1)
print("Standard Deviation F1 Score:", std_f1)
print("\nAverage Time:", average_time)
print("Standard Deviation Time:", std_time)

# Mean rank of every classifier on every metric, across all folds.
final_ranks_data = [
    ["Logistic", np.mean(accuracy_ranks[:, 0]), np.mean(f1_ranks[:, 0]), np.mean(time_ranks[:, 0])],
    ["SVM", np.mean(accuracy_ranks[:, 1]), np.mean(f1_ranks[:, 1]), np.mean(time_ranks[:, 1])],
    ["Random Forest", np.mean(accuracy_ranks[:, 2]), np.mean(f1_ranks[:, 2]), np.mean(time_ranks[:, 2])]
]
final_ranks_df = pd.DataFrame(final_ranks_data, columns=final_rank_headers)

# Display the DataFrames
print("\nFold Data:")
print(fold_df)
print("\nRank Data:")
print(rank_df)
print("\nFinal Ranks Data:")
print(final_ranks_df)



Average Accuracy: [0.92849807 0.93349382 0.95414175]
Standard Deviation Accuracy: [0.0112383  0.00842124 0.00722971]

Average F1 Score: [0.90771262 0.91392753 0.94090806]
Standard Deviation F1 Score: [0.01451257 0.01109659 0.00925153]

Average Time: [1.46348414 0.30664206 1.28122416]
Standard Deviation Time: [0.35243552 0.0936425  0.1651338 ]

Fold Data:
   Fold  Accuracy (Logistic)  Accuracy (SVM)  Accuracy (Random Forest)  \
0     1             0.908894        0.928416                  0.947939   
1     2             0.939130        0.941304                  0.950000   
2     3             0.926087        0.934783                  0.947826   
3     4             0.930435        0.932609                  0.950000   
4     5             0.939130        0.934783                  0.960870   
5     6             0.917391        0.921739                  0.958696   
6     7             0.947826        0.952174                  0.963043   
7     8             0.932609        0.936957      

In [7]:
# Friedman test, followed by the Nemenyi post-hoc test, on the accuracy ranks.

# Average of the ranks 1..k; both sums of squares are taken around this value.
overall_mean_rank = (k + 1) / 2
# Mean rank of each classifier across the folds (one entry per column).
mean_accuracy_ranks = np.mean(accuracy_ranks, axis=0)

# Both terms are assigned outright rather than added to a running total
# initialised in another cell, so re-running this cell gives the same result.
# Term 1: n times the squared deviations of the per-classifier mean ranks.
Sum_of_squared_differences_for_accuracy_1 = n * np.sum((mean_accuracy_ranks - overall_mean_rank) ** 2)
# Term 2: squared deviations of every individual rank, scaled by 1 / (n * (k - 1)).
Sum_of_squared_differences_for_accuracy_2 = np.sum((accuracy_ranks - overall_mean_rank) ** 2) / (n * (k - 1))

friedman_statistic_accuracy = Sum_of_squared_differences_for_accuracy_1 / Sum_of_squared_differences_for_accuracy_2
print(f"friedman test statistic of accuracy metric is: {friedman_statistic_accuracy}")

if friedman_statistic_accuracy > critical_value:
    print("null hypothesis is rejected")
    # Nemenyi post-hoc test: two classifiers differ significantly when their
    # mean ranks are further apart than the critical difference.
    critical_difference = q * np.sqrt((k * (k + 1)) / (6 * n))
    print(f"critical_difference: {critical_difference}")
    for i in range(k):
        for j in range(i + 1, k):
            difference = np.abs(mean_accuracy_ranks[i] - mean_accuracy_ranks[j])
            print(f"Difference between {Classifiers[i]} and {Classifiers[j]}: {difference}")

            if difference > critical_difference:
                print(f"Classifier {Classifiers[i]} and {Classifiers[j]} have a significant difference.")
            else:
                print(f"Classifier {Classifiers[i]} and {Classifiers[j]} do not have a significant difference.")
else:
    print("null hypothesis cannot be rejected")

friedman test statistic of accuracy metric is: 18.2
null hypothesis is rejected
critical_difference: 1.0478214542564015
Difference between Logistic Regression and SVM: 0.7999999999999998
Classifier Logistic Regression and SVM do not have a significant difference.
Difference between Logistic Regression and Random Forest: 1.9
Classifier Logistic Regression and Random Forest have a significant difference.
Difference between SVM and Random Forest: 1.1
Classifier SVM and Random Forest have a significant difference.


In [5]:
# Friedman test, followed by the Nemenyi post-hoc test, on the F1-score ranks.

# Average of the ranks 1..k; both sums of squares are taken around this value.
overall_mean_rank = (k + 1) / 2
# Mean rank of each classifier across the folds (one entry per column).
mean_f1_ranks = np.mean(f1_ranks, axis=0)

# Both terms are assigned outright rather than added to a running total
# initialised in another cell, so re-running this cell gives the same result.
# Term 1: n times the squared deviations of the per-classifier mean ranks.
Sum_of_squared_differences_for_f1_1 = n * np.sum((mean_f1_ranks - overall_mean_rank) ** 2)
# Term 2: squared deviations of every individual rank, scaled by 1 / (n * (k - 1)).
Sum_of_squared_differences_for_f1_2 = np.sum((f1_ranks - overall_mean_rank) ** 2) / (n * (k - 1))

friedman_statistic_f1 = Sum_of_squared_differences_for_f1_1 / Sum_of_squared_differences_for_f1_2
print(f"friedman test statistic of F1-Score metric is: {friedman_statistic_f1}")

if friedman_statistic_f1 > critical_value:
    print("null hypothesis is rejected")
    # Nemenyi post-hoc test: two classifiers differ significantly when their
    # mean ranks are further apart than the critical difference.
    critical_difference = q * np.sqrt((k * (k + 1)) / (6 * n))
    print(f"critical_difference: {critical_difference}")
    for i in range(k):
        for j in range(i + 1, k):
            difference = np.abs(mean_f1_ranks[i] - mean_f1_ranks[j])
            print(f"Difference between {Classifiers[i]} and {Classifiers[j]}: {difference}")

            if difference > critical_difference:
                print(f"Classifier {Classifiers[i]} and {Classifiers[j]} have a significant difference.")
            else:
                print(f"Classifier {Classifiers[i]} and {Classifiers[j]} do not have a significant difference.")
else:
    print("null hypothesis cannot be rejected")

friedman test statistic of F1-Score metric is: 18.2
null hypothesis is rejected
critical_difference: 1.0478214542564015
Difference between Logistic Regression and SVM: 0.7999999999999998
Classifier Logistic Regression and SVM do not have a significant difference.
Difference between Logistic Regression and Random Forest: 1.9
Classifier Logistic Regression and Random Forest have a significant difference.
Difference between SVM and Random Forest: 1.1
Classifier SVM and Random Forest have a significant difference.


In [6]:
# Friedman test, followed by the Nemenyi post-hoc test, on the training-time ranks.

# Average of the ranks 1..k; both sums of squares are taken around this value.
overall_mean_rank = (k + 1) / 2
# Mean rank of each classifier across the folds (one entry per column).
mean_time_ranks = np.mean(time_ranks, axis=0)

# Both terms are assigned outright rather than added to a running total
# initialised in another cell, so re-running this cell gives the same result.
# Term 1: n times the squared deviations of the per-classifier mean ranks.
Sum_of_squared_differences_for_Time_1 = n * np.sum((mean_time_ranks - overall_mean_rank) ** 2)
# Term 2: squared deviations of every individual rank, scaled by 1 / (n * (k - 1)).
Sum_of_squared_differences_for_Time_2 = np.sum((time_ranks - overall_mean_rank) ** 2) / (n * (k - 1))

friedman_statistic_time = Sum_of_squared_differences_for_Time_1 / Sum_of_squared_differences_for_Time_2
print(f"friedman test statistic of time metric is: {friedman_statistic_time}")

if friedman_statistic_time > critical_value:
    print("null hypothesis is rejected")
    # Nemenyi post-hoc test: two classifiers differ significantly when their
    # mean ranks are further apart than the critical difference.
    critical_difference = q * np.sqrt((k * (k + 1)) / (6 * n))
    print(f"critical_difference: {critical_difference}")
    for i in range(k):
        for j in range(i + 1, k):
            difference = np.abs(mean_time_ranks[i] - mean_time_ranks[j])
            print(f"Difference between {Classifiers[i]} and {Classifiers[j]}: {difference}")

            if difference > critical_difference:
                print(f"Classifier {Classifiers[i]} and {Classifiers[j]} have a significant difference.")
            else:
                print(f"Classifier {Classifiers[i]} and {Classifiers[j]} do not have a significant difference.")
else:
    print("null hypothesis cannot be rejected")

friedman test statistic of time metric is: 15.8
null hypothesis is rejected
critical_difference: 1.0478214542564015
Difference between Logistic Regression and SVM: 1.7000000000000002
Classifier Logistic Regression and SVM have a significant difference.
Difference between Logistic Regression and Random Forest: 0.40000000000000036
Classifier Logistic Regression and Random Forest do not have a significant difference.
Difference between SVM and Random Forest: 1.2999999999999998
Classifier SVM and Random Forest have a significant difference.
