#As the first step, Data preprocessing is done, which includes loading the dataset,cleaning the dataset and then splitting it into training and testing sets with a percentage of 80% to 20%. Finally the data is scaled to a common scale using minmax scaler which ranges the data from 0 to 1

In [8]:

import pandas as pd

#load the dataset and include headers
headers = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 
           'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail',
           'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses',
           'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
           'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp',
           'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs',
           'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
           'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
           'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re',
           'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(',
           'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#', 'capital_run_length_average',
           'capital_run_length_longest', 'capital_run_length_total', 'spam']

dataset=pd.read_csv('dataset/spambase.data',header=None, names=headers)


In [9]:
#1. EMPTY VALUES

#checking if there are empty values
print(dataset.isnull().sum())

#'There are no missing values'


word_freq_make                0
word_freq_address             0
word_freq_all                 0
word_freq_3d                  0
word_freq_our                 0
word_freq_over                0
word_freq_remove              0
word_freq_internet            0
word_freq_order               0
word_freq_mail                0
word_freq_receive             0
word_freq_will                0
word_freq_people              0
word_freq_report              0
word_freq_addresses           0
word_freq_free                0
word_freq_business            0
word_freq_email               0
word_freq_you                 0
word_freq_credit              0
word_freq_your                0
word_freq_font                0
word_freq_000                 0
word_freq_money               0
word_freq_hp                  0
word_freq_hpl                 0
word_freq_george              0
word_freq_650                 0
word_freq_lab                 0
word_freq_labs                0
word_freq_telnet              0
word_fre

In [10]:
#2.DUPLICATES

#checking for duplicate values
duplicate=dataset.duplicated()
num_duplicates = duplicate.sum()
print('number of duplicates :', num_duplicates)
#removing duplicate values
dataset=dataset.drop_duplicates()


duplicate=dataset.duplicated()
num_duplicates = duplicate.sum()
print('number of duplicates (after):',num_duplicates)

number of duplicates : 391
number of duplicates (after): 0


In [13]:
#SPLITTING THE DATASET INTO TRAINING AND TESTING
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(dataset.iloc[:, :-1], dataset.iloc[:, -1], test_size=0.2, random_state=42)

In [14]:
#scaling the data to improve performance
from sklearn.preprocessing import MinMaxScaler, StandardScaler


scaler = MinMaxScaler()
X_train_= scaler.fit_transform(X_train)
X_test_=scaler.transform(X_test)



#KNN model is trained. first the best hyperparameters are taken using gridsearchCv and then those selected hyperparameters are instantiated and the model is trained.

In [15]:
##getting the best hyperparameters

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Create a KNN classifier
knn = KNeighborsClassifier()

# Define the hyperparameter grid to get the best hyperparameters
param_grid = {'n_neighbors': np.arange(1, 20),'weights': ['uniform', 'distance'],'p': [1, 2]}

# Create a grid search object
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the training data
grid_search.fit(X_train_, y_train)

# Print the best hyperparameters and accuracy score
print("Best hyperparameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)


Best hyperparameters: {'n_neighbors': 6, 'p': 1, 'weights': 'distance'}
Best accuracy: 0.9094426391418027


In [26]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Instantiate the KNN model with the selected hyperparameters
knn_model = KNeighborsClassifier(n_neighbors=6, weights='distance', p=1)

# Train the KNN model on the training set
knn_model.fit(X_train_, y_train)


y_pred = knn_model.predict(X_test_)

# Evaluate the performance of the KNN model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.9121140142517815
Precision: 0.9411764705882353
Recall: 0.8467966573816156
F1 score: 0.8914956011730205


In [27]:
#generating the classification report

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.93       483
           1       0.94      0.85      0.89       359

    accuracy                           0.91       842
   macro avg       0.92      0.90      0.91       842
weighted avg       0.91      0.91      0.91       842



In [28]:
#confusion matrix

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))


[[464  19]
 [ 55 304]]


#Decision tree model is trained. First the best hyperparameters are taken using gridsearchCv and then those selected hyperparameters are instantiated and the model is trained.

In [22]:
##getting the best hyperparameters

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

# instantiate the decision tree model
dt = DecisionTreeClassifier(random_state=42)

# define the hyperparameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# perform a grid search to find the best hyperparameters
grid_search = GridSearchCV(dt, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_, y_train)

# print the best hyperparameters
print(grid_search.best_params_)

{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [31]:
dt = grid_search.best_estimator_
dt.fit(X_train, y_train)

# predict the labels of the testing set
predictions_test = dt.predict(X_test_)

# evaluate the performance of the model
accuracy = accuracy_score(y_test, predictions_test)
precision = precision_score(y_test, predictions_test)
recall = recall_score(y_test, predictions_test)
f1 = f1_score(y_test, predictions_test)

# print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

train_accuracy = dt.score(X_train, y_train)
print("Training accuracy:", train_accuracy)


Accuracy: 0.6389548693586699
Precision: 0.9365079365079365
Recall: 0.16434540389972144
F1 score: 0.2796208530805687
Training accuracy: 0.9673396674584323


  "X does not have valid feature names, but"


In [32]:
#generating the classification report

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.93       483
           1       0.94      0.85      0.89       359

    accuracy                           0.91       842
   macro avg       0.92      0.90      0.91       842
weighted avg       0.91      0.91      0.91       842



In [35]:
#confusion matrix

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))

[[464  19]
 [ 55 304]]
