## Import Dataset from UCI

In [1]:
from ucimlrepo import fetch_ucirepo
import numpy as np

# Fetch dataset id 94 from the UCI ML repository, then pull out the
# feature table and the target table in a single unpacking step.
spam_base = fetch_ucirepo(id=94)
X, y = spam_base.data.features, spam_base.data.targets

In [2]:
# Convert the feature table to a NumPy array.
# Derive it from `spam_base` rather than from `X` itself so the cell is
# idempotent: the original `X = X.values` raised AttributeError when the cell
# was run a second time, because `X` was already an ndarray by then.
X = spam_base.data.features.to_numpy()
X

array([[0.000e+00, 6.400e-01, 6.400e-01, ..., 3.756e+00, 6.100e+01,
        2.780e+02],
       [2.100e-01, 2.800e-01, 5.000e-01, ..., 5.114e+00, 1.010e+02,
        1.028e+03],
       [6.000e-02, 0.000e+00, 7.100e-01, ..., 9.821e+00, 4.850e+02,
        2.259e+03],
       ...,
       [3.000e-01, 0.000e+00, 3.000e-01, ..., 1.404e+00, 6.000e+00,
        1.180e+02],
       [9.600e-01, 0.000e+00, 0.000e+00, ..., 1.147e+00, 5.000e+00,
        7.800e+01],
       [0.000e+00, 0.000e+00, 6.500e-01, ..., 1.250e+00, 5.000e+00,
        4.000e+01]])

In [3]:
# Flatten the target table into a 1-D label array.
# Derive it from `spam_base` rather than from `y` itself so the cell can be
# re-run safely: the original `y = y.to_numpy().ravel()` raised AttributeError
# on a second run, because an ndarray has no `.to_numpy()` method.
y = spam_base.data.targets.to_numpy().ravel()
y

array([1, 1, 1, ..., 0, 0, 0])

## Flattening the target DataFrame into a 1-D array

## Split dataset into train and test

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. Stratifying on `y` keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

## Build a pipeline using sklearn and train the model

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Pipeline: standardise the features, project them onto 8 principal
# components, then classify with a 100-tree random forest.
# Both stochastic steps are seeded (same seed as the train/test split) so the
# reported accuracy is reproducible under Restart & Run All; without a seed the
# forest draws different bootstrap samples / feature subsets on every run.
pipe_rfc = make_pipeline(
    StandardScaler(),
    PCA(n_components=8, random_state=1),
    RandomForestClassifier(n_estimators=100, random_state=1),
)

# Fit on the training partition only; the scaler and PCA are learned from
# X_train and merely applied to X_test when predicting / scoring.
pipe_rfc.fit(X_train, y_train)
y_pred = pipe_rfc.predict(X_test)
print(f'Test Accuracy: {pipe_rfc.score(X_test, y_test):.3f}')

Test Accuracy: 0.931


In [6]:
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# 10-fold stratified cross-validation on the training partition.
# `split` yields (train_indices, test_indices) pairs into X_train / y_train.
kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)
scores = []

for k, (train, test) in enumerate(kfold):
    # Fit a fresh, unfitted copy of the pipeline for each fold. Refitting
    # `pipe_rfc` itself would leave it trained on only the last fold's subset,
    # silently replacing the model that was fitted on the full training set.
    fold_model = clone(pipe_rfc)
    fold_model.fit(X_train[train], y_train[train])
    score = fold_model.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount shows how many samples of each class the fold trained on.
    print(f'Fold: {k+1}, Class dist: {np.bincount(y_train[train])}, Acc: {score:.3f}')

Fold: 1, Class dist: [1755 1143], Acc: 0.932
Fold: 2, Class dist: [1756 1142], Acc: 0.919
Fold: 3, Class dist: [1756 1142], Acc: 0.904
Fold: 4, Class dist: [1756 1142], Acc: 0.938
Fold: 5, Class dist: [1756 1142], Acc: 0.901
Fold: 6, Class dist: [1756 1142], Acc: 0.916
Fold: 7, Class dist: [1756 1142], Acc: 0.904
Fold: 8, Class dist: [1756 1142], Acc: 0.904
Fold: 9, Class dist: [1756 1142], Acc: 0.901
Fold: 10, Class dist: [1756 1142], Acc: 0.885


In [7]:
# Summarise the per-fold accuracies as mean and (population) standard deviation.
mean_acc = np.mean(scores)
std_acc = np.std(scores)
print(f'The average accuracy: {mean_acc:.3f} +/- {std_acc:.3f}')

The average accuracy: 0.910 +/- 0.015
