In [None]:
from sklearn.datasets import fetch_20newsgroups # dataset de texto para classificação contendo 20 classes
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer # carrega Vectorizer e TFIDF
from sklearn.naive_bayes import MultinomialNB # algoritmo do Naive Bayes
from sklearn.pipeline import Pipeline # Cria pipeline contendo todas as transformações e modelo
from nltk.stem.snowball import SnowballStemmer # Função que retorna a palavra a sua raiz
import numpy as np
from sklearn.linear_model import SGDClassifier # Algoritmo Gradient Descendente Stocastico
from sklearn.model_selection import GridSearchCV
import nltk 
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import warnings
import matplotlib.pyplot as plt 

# Ignore every warning raised from here on, so nothing is printed for them in later cells.
warnings.simplefilter('ignore')
# Commented-out NLTK download call, left disabled.
#nltk.download()

In [None]:
newsgroups = fetch_20newsgroups(subset='train') # Load the training split of the 20 Newsgroups dataset

In [None]:
list(newsgroups.target_names) # list the class names available in the full dataset

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# The ten newsgroups (out of the twenty available) used in this experiment.
categories = [
    'sci.space',
    'sci.electronics',
    'talk.religion.misc',
    'comp.sys.mac.hardware',
    'sci.med',
    'talk.politics.misc',
    'talk.politics.guns',
    'alt.atheism',
    'talk.politics.mideast',
    'rec.motorcycles',
]
# Both splits are shuffled with the same explicit seed, so the document order
# does not depend on the library's default value for random_state.
df_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
df_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)

In [None]:
# Class names present in the selected training subset.
df_train.target_names

['alt.atheism',
 'comp.sys.mac.hardware',
 'rec.motorcycles',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Preview the first 30 lines of the second training document.
print(*df_train.data[1].split("\n")[:30], sep="\n")

From: ingles@engin.umich.edu (Ray Ingles)
Subject: Re: Concerning God's Morality (was: Americans and Evolution)
Organization: University of Michigan Engineering, Ann Arbor
Lines: 110
Distribution: world
NNTP-Posting-Host: syndicoot.engin.umich.edu

In article <1993Apr2.155057.808@batman.bmd.trw.com> jbrown@batman.bmd.trw.com writes:
[why do babies get diseases, etc.]
>What God did create was life according to a protein code which is
>mutable and can evolve.  Without delving into a deep discussion of
>creationism vs evolutionism,

 Here's the (main) problem. The scenario you outline is reasonably 
consistent, but all the evidence that I am familiar with not only does
not support it, but indicates something far different. The Earth, by
latest estimates, is about 4.6 billion years old, and has had life for
about 3.5 billion of those years. Humans have only been around for (at
most) about 200,000 years. But, the fossil evidence inidcates that life
has been changing and evolving, and, in fa

# Feature Engineering

In [None]:
# Learn the vocabulary from the training documents and build the
# document-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=df_train.data)
# (n_documents, n_vocabulary_terms)
X_train_counts.shape

(5386, 64650)

In [None]:
# Re-weight the raw term counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)

# Abordagem I
**Treinamento do modelo**

In [None]:
# Standalone Naive Bayes model fitted directly on the TF-IDF matrix built above.
clf = MultinomialNB()
clf.fit(X=X_train_tfidf, y=df_train.target)

In [None]:
# Same three stages as above, chained so raw text goes in and predictions come out.
clf_1 = Pipeline(
    steps=[
        ('vect', CountVectorizer()),     # step 1: token counts
        ('tfidf', TfidfTransformer()),   # step 2: TF-IDF weighting of the counts
        ('clf', MultinomialNB()),        # step 3: Naive Bayes classifier
    ]
)

In [None]:
# Fit the whole pipeline on the raw training documents.
clf_1.fit(df_train.data, df_train.target)
# fit() returns the pipeline itself, so this is a second name for the fitted clf_1.
clf_trained = clf_1

In [None]:
# Predict the class of every test document with the fitted Naive Bayes pipeline.
pred = clf_trained.predict(df_test.data)

In [None]:
# Accuracy = fraction of test documents whose predicted label matches the true one.
hits = pred == df_test.target
acc = hits.mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8262688232013385


In [None]:
# Per-class precision / recall / F1 on the test split.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                       precision    recall  f1-score   support

          alt.atheism       0.73      0.84      0.78       319
comp.sys.mac.hardware       0.96      0.92      0.94       385
      rec.motorcycles       0.93      0.98      0.95       398
      sci.electronics       0.92      0.78      0.85       393
              sci.med       0.90      0.87      0.88       396
            sci.space       0.87      0.94      0.90       394
   talk.politics.guns       0.55      0.97      0.71       364
talk.politics.mideast       0.86      0.97      0.91       376
   talk.politics.misc       0.93      0.47      0.62       310
   talk.religion.misc       0.96      0.25      0.40       251

             accuracy                           0.83      3586
            macro avg       0.86      0.80      0.80      3586
         weighted avg       0.86      0.83      0.82      3586



# Tuning de parâmetros
Modelo usando Naive Bayes com Grid Search

In [None]:
# Search space for the Naive Bayes pipelines; keys follow the "<step>__<param>" convention.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without IDF re-weighting
    'clf__alpha': (1e-2, 1e-3),              # Naive Bayes smoothing strength
}

In [None]:
# Configure the grid search over `parameters` on all available cores and run it
# on the training split; fit() returns the search object itself.
gs_clf = GridSearchCV(clf_trained, parameters, n_jobs=-1).fit(
    df_train.data, df_train.target
)

In [None]:
# Show the best mean cross-validation score and the parameters that produced it
print(gs_clf.best_score_)
gs_clf.best_params_

0.9576680912932405


{'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [None]:
pred = gs_clf.predict(df_test.data) # Predict on the test split with the grid-searched model

In [None]:
# Test accuracy of the tuned Naive Bayes pipeline.
hits = pred == df_test.target
acc = hits.mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8747908533184606


In [None]:
# Per-class metrics for the tuned Naive Bayes pipeline.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                       precision    recall  f1-score   support

          alt.atheism       0.82      0.87      0.85       319
comp.sys.mac.hardware       0.89      0.94      0.91       385
      rec.motorcycles       0.95      0.97      0.96       398
      sci.electronics       0.90      0.85      0.87       393
              sci.med       0.90      0.86      0.88       396
            sci.space       0.91      0.93      0.92       394
   talk.politics.guns       0.77      0.93      0.84       364
talk.politics.mideast       0.96      0.95      0.95       376
   talk.politics.misc       0.81      0.67      0.73       310
   talk.religion.misc       0.80      0.66      0.72       251

             accuracy                           0.87      3586
            macro avg       0.87      0.86      0.86      3586
         weighted avg       0.88      0.87      0.87      3586



# Abordagem II

In [None]:
# Text-classification pipeline with a linear classifier trained by SGD.
clf_2 = Pipeline(
    steps=[
        ('vect', CountVectorizer()),     # step 1: token counts
        ('tfidf', TfidfTransformer()),   # step 2: TF-IDF weighting of the counts
        # step 3: SGD-trained linear model with hinge loss and L2 penalty (not Naive Bayes)
        ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=25, random_state=62)),
    ]
)

In [None]:
# Train the SGD pipeline on the raw training documents.
clf_2.fit(df_train.data, df_train.target)
# fit() returns the pipeline itself, so this is a second name for the fitted clf_2.
svm_trained = clf_2

In [None]:
pred = svm_trained.predict(df_test.data) # Predict on the test split

In [None]:
# Test accuracy of the untuned SGD pipeline.
hits = pred == df_test.target
acc = hits.mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8580591187953152


# Tuning de parâmetros

**Modelo usando SGD com Grid Search**

In [None]:
# Search space for the SGD pipelines; the classifier step is named 'clf-svm'.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without IDF re-weighting
    'clf-svm__alpha': (1e-2, 1e-3),          # regularisation strength of the SGD model
}

In [None]:
# Grid search over `parameters_svm` for the SGD pipeline, using all available cores.
gs_clf_svm = GridSearchCV(svm_trained, parameters_svm, n_jobs=-1).fit(
    df_train.data, df_train.target
)

In [None]:
# Best cross-validation score and best parameters
print(gs_clf_svm.best_score_)
gs_clf_svm.best_params_

0.9391029848252291


{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [None]:
# Best cross-validation score and best parameters (this cell repeats the previous one)
print(gs_clf_svm.best_score_)
gs_clf_svm.best_params_

0.9391029848252291


{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [None]:
# Evaluate the grid-searched SGD pipeline. `pred` must be refreshed from the tuned
# model first; otherwise it still holds the untuned pipeline's predictions and the
# accuracy and report below would describe the wrong model.
pred = gs_clf_svm.predict(df_test.data)
acc = np.mean(pred == df_test.target)
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8580591187953152


In [None]:
# Per-class metrics for the predictions currently held in `pred`.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                       precision    recall  f1-score   support

          alt.atheism       0.76      0.82      0.78       319
comp.sys.mac.hardware       0.87      0.95      0.91       385
      rec.motorcycles       0.92      0.99      0.95       398
      sci.electronics       0.91      0.81      0.86       393
              sci.med       0.89      0.87      0.88       396
            sci.space       0.91      0.97      0.94       394
   talk.politics.guns       0.71      0.95      0.81       364
talk.politics.mideast       0.94      0.92      0.93       376
   talk.politics.misc       0.86      0.57      0.69       310
   talk.religion.misc       0.82      0.57      0.67       251

             accuracy                           0.86      3586
            macro avg       0.86      0.84      0.84      3586
         weighted avg       0.86      0.86      0.85      3586



# Abordagem III

**Remover os stopwords**

In [None]:
# Naive Bayes pipeline whose vectorizer discards English stop words.
nb_clf_stp = Pipeline(
    steps=[
        ('vect', CountVectorizer(stop_words='english')),   # step 1: token counts without stop words
        ('tfidf', TfidfTransformer()),                      # step 2: TF-IDF weighting of the counts
        ('clf', MultinomialNB()),                           # step 3: Naive Bayes classifier
    ]
)

In [None]:
# Train the stop-word-filtered Naive Bayes pipeline.
nb_clf_stp.fit(df_train.data, df_train.target)
# fit() returns the pipeline itself, so this is a second name for the fitted nb_clf_stp.
nb_clf_stp_trained = nb_clf_stp

In [None]:
pred = nb_clf_stp_trained.predict(df_test.data) # Predict on the test split

In [None]:
# Overall accuracy followed by the per-class report for the stop-word Naive Bayes model.
hits = pred == df_test.target
acc = hits.mean()
print('Accuracy = ', acc)

creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

Accuracy =  0.8538761851645287
                       precision    recall  f1-score   support

          alt.atheism       0.77      0.87      0.82       319
comp.sys.mac.hardware       0.92      0.94      0.93       385
      rec.motorcycles       0.93      0.99      0.96       398
      sci.electronics       0.91      0.83      0.87       393
              sci.med       0.91      0.85      0.88       396
            sci.space       0.87      0.97      0.91       394
   talk.politics.guns       0.65      0.96      0.77       364
talk.politics.mideast       0.91      0.97      0.94       376
   talk.politics.misc       0.89      0.54      0.67       310
   talk.religion.misc       0.95      0.41      0.57       251

             accuracy                           0.85      3586
            macro avg       0.87      0.83      0.83      3586
         weighted avg       0.87      0.85      0.85      3586



In [None]:
# Grid search over `parameters` for the stop-word Naive Bayes pipeline, using all available cores.
nb_clf_tuned = GridSearchCV(nb_clf_stp_trained, parameters, n_jobs=-1).fit(
    df_train.data, df_train.target
)

In [None]:
# Best cross-validation score and best parameters
print(nb_clf_tuned.best_score_)
nb_clf_tuned.best_params_

0.9550698273738465


{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [None]:
pred = nb_clf_tuned.predict(df_test.data) # Predict on the test split with the grid-searched model

In [None]:
# Test accuracy of the tuned stop-word Naive Bayes pipeline.
hits = pred == df_test.target
acc = hits.mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8742331288343558


In [None]:
# Per-class metrics for the tuned stop-word Naive Bayes pipeline.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                       precision    recall  f1-score   support

          alt.atheism       0.84      0.86      0.85       319
comp.sys.mac.hardware       0.88      0.93      0.91       385
      rec.motorcycles       0.94      0.97      0.96       398
      sci.electronics       0.89      0.84      0.86       393
              sci.med       0.89      0.85      0.87       396
            sci.space       0.90      0.93      0.92       394
   talk.politics.guns       0.82      0.92      0.87       364
talk.politics.mideast       0.98      0.93      0.96       376
   talk.politics.misc       0.79      0.69      0.73       310
   talk.religion.misc       0.74      0.71      0.72       251

             accuracy                           0.87      3586
            macro avg       0.87      0.86      0.86      3586
         weighted avg       0.87      0.87      0.87      3586



# Abordagem IV
**Stopword com SVM**

In [None]:
# Pipeline de machine learning 
svm_stp_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')), # Passo 1, aplicar o count vectorizer nos textos 
    ('tfidf', TfidfTransformer()), # Passo 2, aplicar o TFIDF nos textos
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=25, random_state=62))]) # Passo 3, aplicar o algoritmo Naive Bayes

In [None]:
# Train the stop-word-filtered SGD pipeline.
svm_stp_clf.fit(df_train.data, df_train.target)
# fit() returns the pipeline itself, so this is a second name for the fitted svm_stp_clf.
svm_stp_trained = svm_stp_clf

In [None]:
pred = svm_stp_trained.predict(df_test.data) # Predict on the test split

In [None]:
# Test accuracy of the untuned stop-word SGD pipeline.
hits = pred == df_test.target
acc = hits.mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.8622420524261015


In [None]:
# Grid search over `parameters_svm` for the stop-word SGD pipeline, using all available cores.
svm_stp_tun = GridSearchCV(svm_stp_trained, parameters_svm, n_jobs=-1).fit(
    df_train.data, df_train.target
)

In [None]:
# Best cross-validation score and best parameters
print(svm_stp_tun.best_score_)
svm_stp_tun.best_params_

0.9398461334394481


{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [None]:
pred = svm_stp_tun.predict(df_test.data) # Predict on the test split with the grid-searched model

In [None]:
# Test accuracy of the tuned stop-word SGD pipeline.
hits = pred == df_test.target
acc = hits.mean()
print('>>>> Acurácia: ', acc)

>>>> Acurácia:  0.85861684327942


In [None]:
# Per-class metrics for the tuned stop-word SGD pipeline.
creport = classification_report(
    y_true=df_test.target,
    y_pred=pred,
    target_names=df_test.target_names,
)
print(creport)

                       precision    recall  f1-score   support

          alt.atheism       0.75      0.83      0.79       319
comp.sys.mac.hardware       0.89      0.94      0.91       385
      rec.motorcycles       0.92      0.99      0.95       398
      sci.electronics       0.91      0.79      0.85       393
              sci.med       0.89      0.83      0.86       396
            sci.space       0.89      0.97      0.93       394
   talk.politics.guns       0.73      0.94      0.82       364
talk.politics.mideast       0.94      0.92      0.93       376
   talk.politics.misc       0.83      0.62      0.71       310
   talk.religion.misc       0.83      0.61      0.70       251

             accuracy                           0.86      3586
            macro avg       0.86      0.84      0.85      3586
         weighted avg       0.86      0.86      0.86      3586

