In [1]:
import os 
import re
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
stemmer = nltk.stem.SnowballStemmer('english')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## I. For the 20N dataset compare two classifiers NB and LR to identify the 20 different newsgroups.

In [2]:
path_20N = 'datasets/20news-18828/20news-18828/'

#### - Create your own processing pipeline for the task and justify it.

In [3]:
def preprocess_text(text):
    text = text.lower()
    ### Remover correos electrónicos
    text = re.sub(r'[\w\d]+@[\w\d]+\.[\w\d]+\.?[\w\d]*\.?[\w\d]*\.?[\w\d]*\.?[\w\d]*',' ',text)
    #### Reemplazar números con la etiqueta NUM
    text = re.sub(r'\d+', 'NUM', text)
    # Remover con un expresión regular carateres especiales (no palabras) excepto signos de puntuación.
    text = re.sub(r'[^\w\s\.,:;\'\?]', ' ', str(text))
    # remover __ 
    text = re.sub(r'_+',' ',str(text))
    # minúsculas
    text = text.lower()
    # stemming
    text = " ".join([stemmer.stem(word) for word in text.split()])

    return text

In [4]:
def pipline_text(path20N):
    categoria = []
    texto = [] 

    for root, dirs, files in os.walk(path_20N):
        for file in files:
            categoria.append(root[root.rfind('/')+1:])
            with open(os.path.join(root,file),'r',encoding='latin-1') as file: 
                text = file.read()
                texto.append(preprocess_text(text))
    return texto,categoria
        

textos,categorias = pipline_text(path20N=path_20N)

#### - Divide the dataset into training (60%), validation (10%) and test (30%).
#### - Train NB and LR using the following vector representations:

##### -- tf (counts) representation (sklearn: CountVectorizer).

In [5]:
vectorizer_tf = CountVectorizer(max_features=4000, stop_words=stopwords.words('english') )
texto_features_tf = vectorizer_tf.fit_transform(textos).toarray()
#Divide the dataset into training (60%), validation (10%) and test (30%).
x_temp_tf, x_test_tf, y_temp_tf, y_test_tf = train_test_split(texto_features_tf,categorias,test_size=0.3)
x_train_tf, x_val_tf, y_train_tf,y_val_tf = train_test_split(x_temp_tf,y_temp_tf,test_size=0.1)

#### - tfidf representation (sklearn: TfidfVectorizer)


In [6]:
vectorizer_tfidf = TfidfVectorizer(max_features=4000, stop_words=stopwords.words('english'))
texto_features_tfidf = vectorizer_tfidf.fit_transform(textos)
#Divide the dataset into training (60%), validation (10%) and test (30%).
x_temp_tfidf,x_test_tfidf,y_temp_tfidf,y_test_tfidf = train_test_split(texto_features_tfidf,categorias,test_size=0.3)
x_train_tfidf,x_val_tfidf,y_train_tfidf,y_val_tfidf = train_test_split(x_temp_tfidf,y_temp_tfidf)

#### ▪ Train NB and LR using the following vector representations:
▪ tf (counts) representation (sklearn: CountVectorizer).

▪ tfidf representation (sklearn: TfidfVectorizer).

In [7]:
### Hacemos un entrenamiento tradicional para ver los resultados, haciendo un split de 70-30
### y tomar este modelo como linea base de comparación
nb_tf = MultinomialNB()
nb_tf.fit(x_temp_tf,y_temp_tf)  ## x_temp_tf,y_temp_tf el 70% de los datos
predictions_tf = nb_tf.predict(x_test_tf)
print(f" tf Naive Bayes accuracy score : {accuracy_score(y_test_tf,predictions_tf)}, params:{nb_tf.get_params()}")

nb_tfidf = MultinomialNB()
nb_tfidf.fit(x_temp_tfidf,y_temp_tfidf) ## x_temp_tfidf,y_temp_tfidf el 70% de los datos
predictions_tfidf = nb_tfidf.predict(x_test_tfidf)
print(f" tfidf Naive Bayes accuracy_score: {accuracy_score(y_test_tfidf,predictions_tfidf)}, params: {nb_tfidf.get_params()}")

 tf Naive Bayes accuracy score : 0.7905824039653035, params:{'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}
 tfidf Naive Bayes accuracy_score: 0.8350150469109577, params: {'alpha': 1.0, 'class_prior': None, 'fit_prior': True, 'force_alpha': True}


In [8]:
### Hacemos un entrenamiento tradicional para ver los resultados, haciendo un split de 70-30
### y tomar este modelo como linea base de comparación
logistic_model_SGD_tf = SGDClassifier(loss='log_loss', learning_rate='constant', eta0=0.0001)
logistic_model_SGD_tf.fit(x_temp_tf,y_temp_tf)
predictions_SGD_tf = logistic_model_SGD_tf.predict(x_test_tf)
print(f'tf accuracy_score: {accuracy_score(y_test_tf,predictions_SGD_tf)}')

logistic_model_SGD_tfidf = SGDClassifier(loss='log_loss', learning_rate='constant', eta0=0.0001)
logistic_model_SGD_tfidf.fit(x_temp_tfidf,y_temp_tfidf)
predictions_SGD_tfidf = logistic_model_SGD_tfidf.predict(x_test_tfidf)
print(f'tfidf accuracy_score: {accuracy_score(y_test_tfidf,predictions_SGD_tfidf)}')

tf accuracy_score: 0.7643830766507347
tfidf accuracy_score: 0.682421667551779


In [9]:
## para Naive Bayes buscamos el alpha más adecuado para suavizar con laplace, 
## usualmente alpha=1 y usamos el método de crosvalidation.
grid = {'alpha':[0.001,0.01,0.1,1,10]}  # buscamos sobre estos alphas para ver cuál es mejor
nb_tf_crossval = MultinomialNB()
grid_search_tf_crossval = GridSearchCV(nb_tf_crossval,grid, cv=10)   # 10 folders
grid_search_tf_crossval.fit(x_temp_tf,y_temp_tf)
print(f"mejores alpha: {grid_search_tf_crossval.best_params_}, mejor score: {grid_search_tf_crossval.best_score_}")
predictions_nb_tf_crossval = grid_search_tf_crossval.predict(x_test_tf)
print(f"tf cross validations accuracy_score: {accuracy_score(y_test_tf,predictions_nb_tf_crossval)}")


mejores alpha: {'alpha': 0.1}, mejor score: 0.7996810703500276
tf cross validations accuracy_score: 0.7976633032395114


#

In [10]:
## para Naive Bayes buscamos el alpha más adecuado para suavizar con laplace, 
## usualmente alpha=1 y usamos el método de crosvalidation.
grid = {'alpha':[0.001,0.01,0.1,1,10]}  # buscamos sobre estos alphas para ver cuál es mejor
nb_tfid_crossval = MultinomialNB()
grid_search_tfid_crossval = GridSearchCV(nb_tfid_crossval,grid, cv=10,scoring='accuracy')   # 10 folders
grid_search_tfid_crossval.fit(x_temp_tfidf,y_temp_tfidf)
print(f"mejores alpha: {grid_search_tfid_crossval.best_params_}, mejor score: {grid_search_tfid_crossval.best_score_}")
predictions_nb_tfid_crossval = grid_search_tfid_crossval.predict(x_test_tfidf)
print(f"tfid cross validations accuracy_score: {accuracy_score(y_test_tfidf,predictions_nb_tfid_crossval)}")



mejores alpha: {'alpha': 0.1}, mejor score: 0.8524920987714065
tfid cross validations accuracy_score: 0.8534253850238981


In [11]:
# param_grid_SGVclassifier = {
#     'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
#     'eta0': [0.001, 0.01, 0.1, 1],  # Only relevant for 'constant', 'invscaling', and 'adaptive'
#     'alpha': [0.0001, 0.001, 0.01, 0.1]  # Regularization term
# }

# sgvclas_tf_crv = SGDClassifier(loss='log_loss',max_iter=1000,tol=0.001)
# grid_search_sgvclas_tf_crv = GridSearchCV(sgvclas_tf_crv,param_grid_SGVclassifier,cv=10,scoring='accuracy')
# grid_search_sgvclas_tf_crv.fit(x_temp_tf,y_temp_tf) # 70% data
# print(f'mejores params: {grid_search_sgvclas_tf_crv.best_params_}, mejor score: {grid_search_sgvclas_tf_crv.best_score_}')
# predictions_sgvclass_tf_crv = grid_search_sgvclas_tf_crv.predict(x_test_tf)
# print(f'tf score sgv crossv: {accuracy_score(y_test_tf,predictions_sgvclass_tf_crv)}')

In [None]:
param_grid_SGVclassifier = {
    'learning_rate': ['constant'],
    'eta0': [0.001, 0.01, 0.1, 1],  
}

sgvclas_tf_crv = SGDClassifier(loss='log_loss',max_iter=1000,tol=0.001)
grid_search_sgvclas_tf_crv = GridSearchCV(sgvclas_tf_crv,param_grid_SGVclassifier,cv=10,scoring='accuracy')
grid_search_sgvclas_tf_crv.fit(x_temp_tf,y_temp_tf) # 70% data
print(f'mejores params: {grid_search_sgvclas_tf_crv.best_params_}, mejor score: {grid_search_sgvclas_tf_crv.best_score_}')
predictions_sgvclass_tf_crv = grid_search_sgvclas_tf_crv.predict(x_test_tf)
print(f'tf score sgv crossv: {accuracy_score(y_test_tf,predictions_sgvclass_tf_crv)}')

In [None]:
# hacemos una función para entrenamiento por épocas para 
# el SGDClassifier evitando sobreajuste

def SGD_epochs(xtrain,ytrain,xval,yval,num_epochs,tolerance=0.1,stop=5):
    best_accuracy = 0
    num_noimprovements = 0
    best_model = None
    clases = np.unique(ytrain)
    model = SGDClassifier(loss='log_loss',learning_rate='constant',eta0=0.0001) 
    for i in range(num_epochs):
        # print(f'época: {i}') 
        model.fit(xtrain,ytrain)  # entrenar con pequeños lotes de datos 
        predictions = model.predict(xval)

        accuracy = accuracy_score(yval,predictions)

        if accuracy > best_accuracy*(1+tolerance):
            best_accuracy = accuracy
            best_model = model 
            num_noimprovements = 0 
        else: 
            num_noimprovements += 1

        print(f'best accuracy: {best_accuracy},epoch: {i}')
        if num_noimprovements>(stop-2):
            print(f'existe posible sobreajuste, época: {i}')
            return  best_model
        
    return best_model

model_tf = SGD_epochs(x_train_tf,y_train_tf,x_val_tf,y_val_tf,100)
predictions_tf2 = model_tf.predict(x_test_tf)
print(f'tf accuracy: {accuracy_score(y_test_tf,predictions_tf2)}')
model_tf.get_params()

In [30]:
model_tf.get_params()

{'alpha': 0.0001,
 'average': False,
 'class_weight': None,
 'early_stopping': False,
 'epsilon': 0.1,
 'eta0': 0.0001,
 'fit_intercept': True,
 'l1_ratio': 0.15,
 'learning_rate': 'constant',
 'loss': 'log_loss',
 'max_iter': 1000,
 'n_iter_no_change': 5,
 'n_jobs': None,
 'penalty': 'l2',
 'power_t': 0.5,
 'random_state': None,
 'shuffle': True,
 'tol': 0.001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}