# MBA Machine Learning
## Análise de Sentimentos e Processamento de Texto
## Discente: Daniel Gaias Malagurti

Importação de bibliotecas e frameworks

In [25]:
import numpy as np

In [26]:
from sklearn.datasets import fetch_20newsgroups

In [27]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfTransformer

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
from sklearn.pipeline import Pipeline

In [31]:
from sklearn.linear_model import SGDClassifier

In [32]:
from sklearn import metrics

In [33]:
from sklearn.model_selection import GridSearchCV

Especificando categorias

In [34]:
# The four newsgroup categories this notebook works with.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

Carregando Dados

In [36]:
# Fetch the training split restricted to `categories`; shuffling uses a fixed
# seed so the document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [37]:
# Show the category names stored on the loaded training set.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [40]:
# Number of documents in the training split.
len(twenty_train.data)

2257

In [41]:
# Number of file names in the training split (compare with len(twenty_train.data)).
len(twenty_train.filenames)

2257

In [44]:
# Print the first three lines of the first training document, one per line.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [45]:
# Category name of the first training document: its integer target is used
# as an index into target_names.
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [46]:
# Peek at the first ten integer targets (indices into target_names).
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int32)

In [47]:
# Translate the first ten integer targets into their category names.
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


Tokenizing Process

In [48]:
# Token-count vectorizer with default settings.
count_vect = CountVectorizer()

In [50]:
# Learn the vocabulary from the training documents and build their count matrix.
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [51]:
# Shape of the count matrix.
X_train_counts.shape

(2257, 35788)

In [52]:
# Look up the entry stored for the word 'algorithm' in the fitted vocabulary
# (None would be returned if the word were absent).
count_vect.vocabulary_.get('algorithm')

4690

Downscaling

In [53]:
# Transformer with IDF weighting switched off, fitted on the training counts.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer = tf_transformer.fit(X_train_counts)

In [54]:
# Apply the fitted transformer to the training counts.
X_train_tf = tf_transformer.transform(X_train_counts)

In [55]:
# Shape of the transformed matrix (compare with X_train_counts.shape).
X_train_tf.shape

(2257, 35788)

In [56]:
# TF-IDF transformer with default settings.
tfidf_transformer = TfidfTransformer()

In [57]:
# Fit the transformer on the training counts and transform them in one step.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [58]:
# Shape of the TF-IDF matrix (compare with X_train_counts.shape).
X_train_tfidf.shape

(2257, 35788)

Treinamento

In [59]:
# Multinomial Naive Bayes classifier trained on the TF-IDF features and the
# integer training targets.
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, twenty_train.target)

In [60]:
# Two short, unseen documents used to sanity-check the classifier.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

In [61]:
# Reuse the already-fitted vectorizer: transform only, no refit.
X_new_counts = count_vect.transform(docs_new)

In [62]:
# Reuse the already-fitted TF-IDF transformer: transform only, no refit.
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [63]:
# Predict one integer class per new document.
predicted = clf.predict(X_new_tfidf)

In [64]:
# Pair each new document with its predicted class and print the class name.
for doc, category in zip(docs_new, predicted):
    print('%r=> %s' % (doc, twenty_train.target_names[category]))

'God is love'=> soc.religion.christian
'OpenGL on the GPU is fast'=> comp.graphics


Criando Pipeline

In [65]:
# Chain vectorizer -> TF-IDF transformer -> Naive Bayes classifier in a
# single estimator that works directly on raw text.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [66]:
# Fit the whole pipeline on the raw training documents and their targets.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

Avaliação do desempenho

In [67]:
# Fetch the test split for the same categories, shuffled with the same
# fixed seed as the training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [68]:
# Raw test documents, kept under a shorter name for the prediction cells.
docs_test = twenty_test.data

In [69]:
# Predict the test documents with the currently fitted pipeline
# (this reassigns `predicted`).
predicted = text_clf.predict(docs_test)

In [70]:
# Accuracy: fraction of test documents whose prediction equals the true target.
(predicted == twenty_test.target).mean()

0.8348868175765646

Testando SVM

In [71]:
# Same preprocessing as before, but the final step is an SGDClassifier with
# hinge loss and L2 penalty. `text_clf` is rebound, replacing the previous
# pipeline. tol=None together with max_iter=5 disables the tolerance-based
# stopping criterion, so training runs for the full five epochs.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

In [72]:
# Fit the current pipeline on the raw training documents and their targets.
text_clf.fit(twenty_train.data,twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...dom_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False))])

In [73]:
# Predict the test documents with the currently fitted pipeline; this
# overwrites whatever `predicted` held before.
predicted = text_clf.predict(docs_test)

In [74]:
# Accuracy: fraction of test documents whose prediction equals the true target.
(predicted == twenty_test.target).mean()

0.9127829560585885

Demonstrativos de comparação de desempenho

In [75]:
# Per-class precision / recall / F1 for the current predictions, with rows
# labelled by category name.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

             micro avg       0.91      0.91      0.91      1502
             macro avg       0.92      0.91      0.91      1502
          weighted avg       0.92      0.91      0.91      1502



In [76]:
# Confusion matrix of true test targets (first argument) against the current
# predictions; in scikit-learn, rows are true classes and columns predicted.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]], dtype=int64)

Ajuste de Parâmetros com busca em grade (grid search)

In [77]:
# Grid of candidate values, keyed as '<pipeline step>__<parameter>':
# n-gram range for the vectorizer, IDF on/off for the TF-IDF step and the
# regularization strength of the classifier.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [78]:
# Exhaustive search over `parameters` with 5-fold cross-validation, using all
# available CPU cores (n_jobs=-1).
#
# The `iid` argument is deliberately not passed: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where supplying it raises a
# TypeError. On current releases the behaviour is that of the former
# iid=False. NOTE(review): on releases older than 0.22 omitting it falls back
# to that version's default score averaging and may emit a deprecation warning.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [79]:
# Run the search on the first 400 training documents only.
# NOTE(review): presumably a subset is used to keep the search fast — confirm
# before relying on the selected parameters.
gs_clf = gs_clf.fit(
    twenty_train.data[:400],
    twenty_train.target[:400],
)

In [80]:
# Classify a single sentence with the fitted search object and map the
# predicted integer class back to its category name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'