# **Loading the dataset**

In [1]:
#importing pandas library
import pandas as pd

#loading the dataset
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data')

# **Data preprocessing**

In [2]:
#spliting the dataset features as x and labels as y
x = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [3]:
#importing the train test split function
from sklearn.model_selection import train_test_split

#spliting the dataset to training and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

# **KNN Model implementation**

In [4]:
#importing PCA and Standard scaler from scikit learn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

#Standardizing the data using standard scaler
scaler = StandardScaler()
X = scaler.fit_transform(x)

In [5]:
#importing RFE and Logistic regression from scikit learn
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

#Applying RFE with logistic regression
lr = LogisticRegression()
rfe = RFE(lr, n_features_to_select=35)
x_train_rfe = rfe.fit_transform(x_train, y_train)
x_test_rfe = rfe.transform(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

The top 35 features of the dataset is selected for the KNN model using recursive feature elimination (RFE) with logistic regression as the estimator. 

In [6]:
#importing cross validation score and K neighbors classifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

KNN = KNeighborsClassifier()

In [7]:
#getting the optimal value for k 
K_range = range(1,51)
K_scores = []

for K in K_range:
  knn = KNeighborsClassifier(n_neighbors=K)
  scores = cross_val_score(knn, x_train_rfe, y_train, cv=10, scoring='accuracy')
  K_scores.append(scores.mean())

optimal_k_val = K_range[K_scores.index(max(K_scores))]

optimal_k_val

5

Cross-validation is done on the KNN model to find the optimal k value.

In [8]:
#Fitting the KNN model
KNN = KNeighborsClassifier(n_neighbors=optimal_k_val)
KNN.fit(x_train_rfe, y_train)

In [9]:
y_predict = KNN.predict(x_test_rfe)

***KNN model evaluation***

In [10]:
#importing classification report and accuracy score from scikit learn
from sklearn.metrics import classification_report, accuracy_score

knn_accuracy = accuracy_score(y_test, y_predict)
knn_cl_report = classification_report(y_test, y_predict)

#printing the reports
print("Accuracy of KNN- ",knn_accuracy,"\n")
print("_"*55)
print(knn_cl_report)

Accuracy of KNN-  0.9101449275362319 

_______________________________________________________
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       803
           1       0.90      0.88      0.89       577

    accuracy                           0.91      1380
   macro avg       0.91      0.91      0.91      1380
weighted avg       0.91      0.91      0.91      1380



# **Decision trees model implementation**

In [11]:
#importing decision tree classifier from scikit learn
from sklearn.tree import DecisionTreeClassifier

DT = DecisionTreeClassifier()
DT.fit(x_train_rfe, y_train)

Y_predict = DT.predict(x_test_rfe)

***Decision tree model evaluation***

In [12]:
dt_accuracy = accuracy_score(y_test, Y_predict)
dt_cl_report = classification_report(y_test, Y_predict)

#printing the reports
print("Accuracy of Decision tree- ",dt_accuracy,"\n")
print("_"*55)
print(dt_cl_report)

Accuracy of Decision tree-  0.9079710144927536 

_______________________________________________________
              precision    recall  f1-score   support

           0       0.91      0.93      0.92       803
           1       0.90      0.87      0.89       577

    accuracy                           0.91      1380
   macro avg       0.91      0.90      0.90      1380
weighted avg       0.91      0.91      0.91      1380

