In [99]:
import os 
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

stemmer = nltk.stem.SnowballStemmer('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
path_20N = 'datasets/20news-18828/20news-18828/'

In [61]:
def preprocess_text(text):
    text = text.lower()
    ### Remover correos electrónicos
    text = re.sub(r'[\w\d]+@[\w\d]+\.[\w\d]+\.?[\w\d]*\.?[\w\d]*\.?[\w\d]*\.?[\w\d]*',' ',text)
    #### Reemplazar números con la etiqueta NUM
    text = re.sub(r'\d+', 'NUM', text)
    # Remover con un expresión regular carateres especiales (no palabras) excepto signos de puntuación.
    text = re.sub(r'[^\w\s\.,:;\'\?]', ' ', str(text))
    # remover __ 
    text = re.sub(r'_+',' ',str(text))
    # minúsculas
    text = text.lower()
    # stemming
    text = " ".join([stemmer.stem(word) for word in text.split()])

    return text

In [62]:
def pipline_text(path20N):
    categoria = []
    texto = [] 

    for root, dirs, files in os.walk(path_20N):
        for file in files:
            categoria.append(root[root.rfind('/')+1:])
            with open(os.path.join(root,file),'r',encoding='latin-1') as file: 
                text = file.read()
                texto.append(preprocess_text(text))
    return texto,categoria
        

textos,categorias = pipline_text(path20N=path_20N)

# tf (counts) representation

In [63]:
vectorizer = CountVectorizer(max_features=4000, stop_words=stopwords.words('english') )

# Ahora le solicitamos utilizando nuestro conjunto de datos que construya el vocabulario y tambien transforme nuestro texto
texto_features = vectorizer.fit_transform(textos).toarray()


18828

In [90]:
#Divide the dataset into training (60%), validation (10%) and test (30%).
x_temp, x_test, y_temp, y_test = train_test_split(texto_features,categorias,test_size=0.3)
x_train, x_val, y_train,y_val = train_test_split(x_temp,y_temp,test_size=0.1)


## tfidf representation


In [112]:
vectorizer_tfidf = TfidfVectorizer(max_features=4000, stop_words=stopwords.words('english'))
texto_features_tfidf = vectorizer_tfidf.fit_transform(textos)

In [113]:
x_temp_tfidf,x_test_tfidf,y_temp_tfidf,y_test_tfidf = train_test_split(texto_features_tfidf,categorias,test_size=0.3)
x_train_tfidf,x_val_tfidf,y_train_tfidf,y_val_tfidf = train_test_split(x_temp_tfidf,y_temp_tfidf)

# Naive Bayes

In [115]:
nb = MultinomialNB()
nb.fit(x_train,y_train)
predictions = nb.predict(x_val)
print(accuracy_score(y_val,predictions))

0.7511380880121397


In [116]:
nb_tfidf = MultinomialNB()
nb_tfidf.fit(x_train_tfidf,y_train_tfidf)
predictions_tfidf = nb_tfidf.predict(x_val_tfidf)
print(accuracy_score(y_val_tfidf,predictions_tfidf))


0.7987860394537177
