In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.feature_selection import chi2, SelectKBest

In [3]:
def report(y_true, y_pred):
  report = classification_report(y_true, y_pred)
  print(report)

In [4]:
ads = pd.read_csv('ad.data', header=None)
ads.shape

(3279, 1559)

In [5]:
y = ads.iloc[:, -1].values
X = ads.iloc[:, :-1].values

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
first_model = GaussianNB()
first_model.fit(X_train, y_train)
first_mode_predictions = first_model.predict(X_test)
report(y_test, first_mode_predictions)

              precision    recall  f1-score   support

         ad.       0.34      0.86      0.49       123
      nonad.       0.97      0.76      0.86       861

    accuracy                           0.78       984
   macro avg       0.66      0.81      0.67       984
weighted avg       0.90      0.78      0.81       984



In [9]:
selection = SelectKBest(chi2, k=7)
# The attribute selection occurs based on the selected class (y) so changing the class would change the attributes selecteds
X_new = selection.fit_transform(X, y)
X_new

array([[125.    , 125.    ,   1.    , ...,   1.    ,   0.    ,   1.    ],
       [ 57.    , 468.    ,   8.2105, ...,   1.    ,   0.    ,   0.    ],
       [ 33.    , 230.    ,   6.9696, ...,   0.    ,   0.    ,   0.    ],
       ...,
       [ 23.    , 120.    ,   5.2173, ...,   0.    ,   0.    ,   0.    ],
       [  0.    ,   0.    ,   0.    , ...,   0.    ,   0.    ,   0.    ],
       [ 40.    ,  40.    ,   1.    , ...,   0.    ,   0.    ,   0.    ]])

In [11]:
# Checking the numer of columns (k=7) selected in the SelectKBest
X_new.shape

(3279, 7)

In [12]:
print(selection.get_support())

[ True  True  True ... False False False]


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
second_model = GaussianNB()
second_model.fit(X_train, y_train)
second_mode_predictions = second_model.predict(X_test)
report(y_test, second_mode_predictions)

              precision    recall  f1-score   support

         ad.       0.83      0.71      0.76       123
      nonad.       0.96      0.98      0.97       861

    accuracy                           0.95       984
   macro avg       0.89      0.84      0.87       984
weighted avg       0.94      0.95      0.94       984

