# Preprocesamiento de los datos
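Se preprocesan los conjuntos de datos de entrenamiento y de validación o prueba, vectorizándolos mediante TF-IDF (`TfidfVectorizer`): el vectorizador se ajusta únicamente con el conjunto de entrenamiento y luego se aplica, ya ajustado, al conjunto de prueba.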

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn import metrics
import numpy as np

# Load the predefined train/test splits of the 20 Newsgroups corpus.
# random_state is spelled out (it equals scikit-learn's default) so that the
# shuffled document order is visibly reproducible across runs.
train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

# Train dataset, cast to numpy arrays.
# dtype=object keeps every document as a regular Python str. Without it numpy
# infers a fixed-width unicode dtype sized to the longest document and pads
# every row to that width (4 bytes per character), which wastes a large amount
# of memory on a corpus with very uneven document lengths.
train_raw_input = np.array(train.data, dtype=object)
train_output = np.array(train.target)
train_size = len(train_raw_input)

# Test dataset, cast to numpy arrays (same dtype rationale as above).
test_raw_input = np.array(test.data, dtype=object)
test_output = np.array(test.target)
test_size = len(test_raw_input)

# Report the number of documents in each split.
print(f'Dataset Train: {train_size} elements')
print(f'Dataset Test: {test_size} elements')

Dataset Train: 11314 elements
Dataset Test: 7532 elements


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Turn the raw documents into TF-IDF feature matrices.
# NOTE(review): `encoding` presumably only matters when the vectorizer is fed
# bytes or file objects; the inputs here are already text — confirm.
vectorizer = TfidfVectorizer(encoding="latin1")

# The vocabulary and IDF weights are learned from the training split only ...
train_input = vectorizer.fit_transform(raw_documents=train_raw_input)
# ... and the already-fitted vectorizer is then applied to the test split.
test_input = vectorizer.transform(raw_documents=test_raw_input)

# Entrenamiento
Se entrena el modelo Naive Bayes multinomial con el subconjunto de entrenamiento.

In [3]:
from src.multinomial_naive_bayes import MultinomialNaiveBayes

# Value passed as `alpha` to the classifier, kept in one named place.
# NOTE(review): presumably the additive-smoothing strength — confirm against
# src.multinomial_naive_bayes.
NB_ALPHA = 0.01

# Build the multinomial naive Bayes model and fit it on the vectorized
# training documents and their labels.
classifier = MultinomialNaiveBayes(alpha=NB_ALPHA)
classifier.fit(train_input, train_output)

# Validación del modelo
Se pone a prueba el modelo utilizando el subconjunto de validación o prueba, y se analiza la métrica. En primera instancia, se observa la exactitud.

In [9]:
# Predict a label for every vectorized test document.
# NOTE(review): .todense() materialises the whole sparse TF-IDF matrix in
# memory and returns a numpy.matrix rather than an ndarray. fit() was given the
# sparse matrix directly — confirm whether predict() really needs dense input;
# if it does, .toarray() (plain ndarray) may be the safer choice.
predictions = classifier.predict(test_input.todense())

In [11]:
# Accuracy of the predictions against the true test labels.
accuracy = metrics.accuracy_score(y_true=test_output, y_pred=predictions)

In [12]:
# Report the accuracy computed from the test-set predictions.
print(f'Accuracy obtained: {accuracy}')

Accuracy obtained: 0.8352363250132767
