In [1]:
!pip install pandas scikit-learn matplotlib

Defaulting to user installation because normal site-packages is not writeable


In [15]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.decomposition import PCA



In [3]:
# Load the raw file: it has no header row, so supply the column names ourselves
# (feature_1 .. feature_57 followed by the target column 'label').
columns = [*(f'feature_{i}' for i in range(1, 58)), 'label']
spambase = pd.read_csv(
    'spambase.data',
    names=columns,
    header=None,
)
spambase

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,label
0,0.00,0.64,0.64,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.778,0.000,0.000,3.756,61,278,1
1,0.21,0.28,0.50,0.0,0.14,0.28,0.21,0.07,0.00,0.94,...,0.000,0.132,0.0,0.372,0.180,0.048,5.114,101,1028,1
2,0.06,0.00,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.010,0.143,0.0,0.276,0.184,0.010,9.821,485,2259,1
3,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.137,0.0,0.137,0.000,0.000,3.537,40,191,1
4,0.00,0.00,0.00,0.0,0.63,0.00,0.31,0.63,0.31,0.63,...,0.000,0.135,0.0,0.135,0.000,0.000,3.537,40,191,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4596,0.31,0.00,0.62,0.0,0.00,0.31,0.00,0.00,0.00,0.00,...,0.000,0.232,0.0,0.000,0.000,0.000,1.142,3,88,0
4597,0.00,0.00,0.00,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.000,0.000,0.0,0.353,0.000,0.000,1.555,4,14,0
4598,0.30,0.00,0.30,0.0,0.00,0.00,0.00,0.00,0.00,0.00,...,0.102,0.718,0.0,0.000,0.000,0.000,1.404,6,118,0
4599,0.96,0.00,0.00,0.0,0.32,0.00,0.00,0.00,0.00,0.00,...,0.000,0.057,0.0,0.000,0.000,0.000,1.147,5,78,0


In [5]:
# Check data types: print each column's dtype and non-null count
spambase.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   feature_1   4601 non-null   float64 
 1   feature_2   4601 non-null   float64 
 2   feature_3   4601 non-null   float64 
 3   feature_4   4601 non-null   float64 
 4   feature_5   4601 non-null   float64 
 5   feature_6   4601 non-null   float64 
 6   feature_7   4601 non-null   float64 
 7   feature_8   4601 non-null   float64 
 8   feature_9   4601 non-null   float64 
 9   feature_10  4601 non-null   float64 
 10  feature_11  4601 non-null   float64 
 11  feature_12  4601 non-null   float64 
 12  feature_13  4601 non-null   float64 
 13  feature_14  4601 non-null   float64 
 14  feature_15  4601 non-null   float64 
 15  feature_16  4601 non-null   float64 
 16  feature_17  4601 non-null   float64 
 17  feature_18  4601 non-null   float64 
 18  feature_19  4601 non-null   float64 
 19  featur

In [4]:
# Treat the target as a categorical variable rather than a number:
# build the converted column first, then write it back over 'label'.
label_categorical = spambase['label'].astype('category')
spambase['label'] = label_categorical

# Re-inspect the frame to confirm the dtype of 'label' changed
spambase.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   feature_1   4601 non-null   float64 
 1   feature_2   4601 non-null   float64 
 2   feature_3   4601 non-null   float64 
 3   feature_4   4601 non-null   float64 
 4   feature_5   4601 non-null   float64 
 5   feature_6   4601 non-null   float64 
 6   feature_7   4601 non-null   float64 
 7   feature_8   4601 non-null   float64 
 8   feature_9   4601 non-null   float64 
 9   feature_10  4601 non-null   float64 
 10  feature_11  4601 non-null   float64 
 11  feature_12  4601 non-null   float64 
 12  feature_13  4601 non-null   float64 
 13  feature_14  4601 non-null   float64 
 14  feature_15  4601 non-null   float64 
 15  feature_16  4601 non-null   float64 
 16  feature_17  4601 non-null   float64 
 17  feature_18  4601 non-null   float64 
 18  feature_19  4601 non-null   float64 
 19  featur

### Task 3: Train/Test Split and SVM Model Training

In [12]:
# Separate predictors from target: every column except the last one is a
# feature, and the 'label' column is what we want to predict.
X, y = spambase.iloc[:, :-1], spambase['label']

# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f'Training data shape: {X_train.shape}, Test data shape: {X_test.shape}')

Training data shape: (3680, 57), Test data shape: (921, 57)


In [17]:
# Standardize the features for the SVM.
# StandardScaler rescales each feature to mean 0 and standard deviation 1.
# The scaler is fitted on the training data only (fit_transform is evaluated
# first), and the same fitted scaler is then applied to the test data.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)

In [18]:
# Build an RBF-kernel SVM classifier
svm = SVC(random_state=42, kernel='rbf')

# Fit on the standardized training set, then predict labels for the
# standardized test set (fit returns the fitted estimator itself).
y_pred = svm.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [20]:
# Evaluate model
# The confusion matrix shows how well the model distinguishes between spam and
# non-spam (scikit-learn convention: rows are true classes, columns are
# predicted classes).
# Accuracy is a measure of the model's overall performance.
#
# NOTE: these notes are `#` comments on purpose. A bare triple-quoted string at
# the end of a cell is an expression, so the notebook echoes its repr as the
# cell's output instead of treating it as a comment.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print('Confusion matrix:\n', cm)
print('Accuracy: ', accuracy)

Confusion matrix:
 [[513  18]
 [ 42 348]]
Accuracy:  0.9348534201954397


"\nThe confusion matrix shows how well the model distinguishes between spam and non\x02spam.\nAccuracy is a measure of the model's overall performance.\n"

### Task 4: Dimensionality Reduction Using PCA

In [21]:
# Apply PCA to reduce the standardized features to 29 components.
# random_state is fixed (42, matching the split and the SVM) because, depending
# on the scikit-learn version, svd_solver='auto' may pick the randomized solver
# for data of this shape; without a seed the projection is then not guaranteed
# to be identical between runs.
# PCA is fitted on the training data only and the fitted projection is reused
# for the test data.
pca = PCA(n_components=29, random_state=42)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

print(f'Original feature shape: {X_train.shape}, Reduced feature shape: {X_train_pca.shape}')

Original feature shape: (3680, 57), Reduced feature shape: (3680, 29)


In [22]:
# Retrain the same RBF-kernel SVM, this time on the PCA-reduced training data
# (fit returns the fitted estimator, so construction and fitting are chained).
svm_pca = SVC(random_state=42, kernel='rbf').fit(X_train_pca, y_train)

# Predict labels for the PCA-reduced test data
y_pred_pca = svm_pca.predict(X_test_pca)

In [24]:
# Score the PCA-based model on the held-out test set
accuracy_pca = accuracy_score(y_true=y_test, y_pred=y_pred_pca)
cm_pca = confusion_matrix(y_true=y_test, y_pred=y_pred_pca)

# Report the confusion matrix first, then the overall accuracy
print('Confusion matrix after PCA:\n', cm_pca)
print('Accuracy after PCA: ', accuracy_pca)


Confusion matrix after PCA:
 [[512  19]
 [ 44 346]]
Accuracy after PCA:  0.9315960912052117
