In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np

# Fetch both 20-newsgroups splits with identical options: shuffled, and with
# the 'headers' and 'footers' parts of every post removed.
train, test = (
    fetch_20newsgroups(
        subset=split_name,
        shuffle=True,
        remove=('headers', 'footers'),
    )
    for split_name in ('train', 'test')
)

# Human-readable class names, taken from the training split
categories = train.target_names

# Training split as numpy arrays: raw documents and their integer labels
train_raw_input = np.array(train.data)
train_output = np.array(train.target)
train_size = train_raw_input.shape[0]

# Test split as numpy arrays: raw documents and their integer labels
test_raw_input = np.array(test.data)
test_output = np.array(test.target)
test_size = test_raw_input.shape[0]

# Report how many documents each split contains
print(
    f'Dataset Train: {train_size} elements',
    f'Dataset Test: {test_size} elements',
    sep='\n',
)

Dataset Train: 11314 elements
Dataset Test: 7532 elements


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Turn the raw documents from the original dataset into TF-IDF features.
# The vocabulary and IDF weights are learned from the training documents only;
# the test documents are then projected with that same fitted vectorizer.
vectorizer = TfidfVectorizer(
    max_df=0.15,       # drop terms present in more than 15% of the documents
    min_df=1,          # keep terms that occur in at least one document
    smooth_idf=False,  # no add-one smoothing of the document frequencies
)
train_input = vectorizer.fit_transform(train_raw_input)
test_input = vectorizer.transform(test_raw_input)

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

# Fit a multinomial naive Bayes model on the TF-IDF training features.
# alpha is the additive smoothing strength passed to the estimator.
classifier = MultinomialNB(alpha=0.0075)
# Kept as the final bare expression so the cell displays the fitted estimator.
classifier.fit(X=train_input, y=train_output)

MultinomialNB(alpha=0.0075)

In [11]:
# Predict a class label for every document of the held-out test split
predictions = classifier.predict(X=test_input)

In [12]:
from sklearn import metrics

# Accuracy on the test split: fraction of predictions equal to the true label
score = metrics.accuracy_score(y_true=test_output, y_pred=predictions)
print(f'Score: {score}')

Score: 0.8011152416356877
