# Naive-Bayes-Classification

## News Classifier

Dataset: 20 Newsgroups (http://qwone.com/~jason/20Newsgroups/)

In [1]:
# Imports
import numpy as np 
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
# Newsgroup labels the classifier will be trained to distinguish
categories = [
    'rec.motorcycles',
    'sci.electronics',
    'misc.forsale',
    'talk.religion.misc',
]

In [3]:
# Training split, restricted to the selected categories.
# random_state=42 is the documented default of fetch_20newsgroups; it is
# spelled out so the shuffle order is visibly pinned, as in the test-split call.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [23]:
# Label names in index order; values in twenty_train.target index into this list
twenty_train.target_names

['misc.forsale', 'rec.motorcycles', 'sci.electronics', 'talk.religion.misc']

In [10]:
# Number of training documents loaded
len(twenty_train.data)

2151

In [21]:
# Visualizing one sample: the raw text of a single document, then its label
sample_idx = 10
print(twenty_train.data[sample_idx])
print(twenty_train.target_names[twenty_train.target[sample_idx]])

From: moffatt@bnr.ca (John Thomson)
Subject: Re: What is Zero dB????
Nntp-Posting-Host: bcarhdd
Organization: Bell-Northern Research, Ottawa, Canada
X-Newsreader: TIN [version 1.1 PL6]
Lines: 47

Joseph Chiu (josephc@cco.caltech.edu) wrote:
: sehari@iastate.edu (Babak Sehari) writes:
: 
: >Similarly, people usually use dB for dBm. Another common mistake is spelling
: >``db'' instead of ``dB'' as you did in your article. See the ``B'' is for 
: >``Bell'' company, the mother of AT&T and should be capitalized.
: 
: Thus, a deciBell (deci-, l., tenth of + Bell) is a fractional part of the 
: original Bell.  For example, SouthWestern Bell is a deciBell.

Out of what hat did you pull this one?  dB is a ratio not an RBOC!        

: And the measure of current, Amp, is actually named after both the AMP company
: and the Amphenol company.  Both companies revolutionized electronics by
: simulatenously realizing that the performance of connectors and sockets 
: were affected by the amount of curr

In [24]:
# Class names of the first 10 training records
first_label_ids = twenty_train.target[:10]
for label_id in first_label_ids:
    print(twenty_train.target_names[label_id])

rec.motorcycles
talk.religion.misc
talk.religion.misc
misc.forsale
misc.forsale
rec.motorcycles
misc.forsale
talk.religion.misc
talk.religion.misc
misc.forsale


## Bag of Words

In [25]:
# Tokenizing: fit the vectorizer on the training text and build the
# document-term count matrix in one step
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# (number of documents, number of distinct terms)
X_train_counts.shape

(2151, 31564)

In [28]:
# Sparse view of the counts: each line is (document index, term index) <tab> count
print(X_train_counts)

  (0, 13628)	1
  (0, 26891)	3
  (0, 5894)	4
  (0, 9652)	4
  (0, 13910)	5
  (0, 8774)	6
  (0, 24813)	3
  (0, 26892)	3
  (0, 27417)	1
  (0, 23822)	2
  (0, 17210)	1
  (0, 31479)	4
  (0, 30541)	2
  (0, 30276)	1
  (0, 30378)	1
  (0, 14304)	2
  (0, 2743)	2
  (0, 5504)	2
  (0, 9653)	1
  (0, 7411)	1
  (0, 2884)	1
  (0, 24326)	1
  (0, 28664)	3
  (0, 21346)	1
  (0, 9452)	1
  :	:
  (2150, 24382)	1
  (2150, 11129)	1
  (2150, 13844)	1
  (2150, 4396)	1
  (2150, 10096)	1
  (2150, 25204)	1
  (2150, 8466)	1
  (2150, 27369)	1
  (2150, 23325)	1
  (2150, 26195)	3
  (2150, 28352)	1
  (2150, 10265)	1
  (2150, 7461)	1
  (2150, 5353)	1
  (2150, 8806)	1
  (2150, 5686)	2
  (2150, 26219)	1
  (2150, 4481)	1
  (2150, 25760)	1
  (2150, 570)	2
  (2150, 3408)	2
  (2150, 31313)	1
  (2150, 28844)	1
  (2150, 16832)	1
  (2150, 13818)	1


In [32]:
# Term Frequency times Inverse Document Frequency (TF-IDF):
# re-weight the raw counts; the matrix keeps the same shape as X_train_counts
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2151, 31564)

In [33]:
# Sparse view of the weights: each line is (document index, term index) <tab> TF-IDF value
print(X_train_tfidf)

  (0, 31479)	0.27986206976760586
  (0, 31223)	0.07112247020370334
  (0, 31159)	0.040768161697355525
  (0, 31109)	0.02390955848250578
  (0, 30974)	0.03542038246709194
  (0, 30833)	0.034021050880748735
  (0, 30594)	0.03208156637720816
  (0, 30541)	0.044965341372976204
  (0, 30470)	0.0550207767858142
  (0, 30378)	0.056209058798599686
  (0, 30276)	0.05754170962902774
  (0, 29874)	0.07241583335350622
  (0, 29798)	0.029568681093915857
  (0, 28716)	0.06620756508072162
  (0, 28664)	0.038535080151361395
  (0, 28592)	0.030403804110316544
  (0, 28322)	0.07364172215144714
  (0, 28315)	0.03399829273236834
  (0, 27417)	0.010980895021046152
  (0, 26892)	0.24955864924319732
  (0, 26891)	0.24955864924319732
  (0, 26602)	0.03493123157161606
  (0, 26416)	0.0443564901543458
  (0, 26258)	0.06291738729899189
  (0, 26003)	0.048441922386978274
  :	:
  (2150, 6338)	0.15259710801405602
  (2150, 6291)	0.029615804755250184
  (2150, 5854)	0.0552847817847905
  (2150, 5785)	0.05455024086348416
  (2150, 5686)	0.15574

## Creating Model

In [34]:
# Creating Model: fit a multinomial Naive Bayes classifier on the TF-IDF
# features, using the integer newsgroup labels as targets
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [44]:
# Predictions on two unseen documents.
# The new text must go through the already-fitted vectorizer and transformer
# (transform, not fit_transform): refitting on docs_new would build a fresh,
# tiny vocabulary and the classifier would reject the input with a
# feature-count mismatch.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Stored under its own name so this demo does not overwrite `predicted`,
# which the evaluation cells use for the test-set predictions.
predicted_new = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted_new):
    print('{!r} => {}'.format(doc, twenty_train.target_names[category]))

ValueError: Input has n_features=8 while the model has been trained with n_features=31564

In [54]:
# Creating a Pipeline that chains the three stages:
# bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes classifier
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

In [55]:
# Fit: train the whole pipeline directly on the raw training text and labels
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [56]:
# Model accuracy on the held-out test split
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
# Fraction of test documents whose predicted label equals the true label
(predicted == twenty_test.target).mean()

0.9036312849162011

In [57]:
# Metrics: per-class precision / recall / F1 on the test split
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                    precision    recall  f1-score   support

      misc.forsale       0.97      0.86      0.91       390
   rec.motorcycles       0.83      0.99      0.90       398
   sci.electronics       0.89      0.92      0.91       393
talk.religion.misc       1.00      0.80      0.89       251

          accuracy                           0.90      1432
         macro avg       0.92      0.89      0.90      1432
      weighted avg       0.91      0.90      0.90      1432



In [45]:
# Confusion Matrix: true labels are passed first, so each row is a true class
# and each column a predicted class, in target_names order
metrics.confusion_matrix(twenty_test.target, predicted)

array([[336,  23,  31,   0],
       [  3, 395,   0,   0],
       [  4,  26, 363,   0],
       [  3,  33,  15, 200]])

In [58]:
# Search space for GridSearchCV.
# Keys follow the "<pipeline step>__<parameter>" naming, matching the
# 'vect', 'tfidf' and 'clf' step names of the pipeline.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [47]:
# GridSearchCV: exhaustive search over `parameters` wrapped around the text
# pipeline; n_jobs = -1 asks for all available processors
gs_clf = GridSearchCV(text_clf, parameters, n_jobs = -1)

In [48]:
# Fit: the search only sees the first 400 training documents
# (presumably to keep the search quick -- the reported score reflects this subset)
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [59]:
# Testing: classify one ad-hoc sentence with the fitted grid search
sample_text = 'We need sale this until tomorrow'
predicted_idx = gs_clf.predict([sample_text])[0]
twenty_train.target_names[predicted_idx]

'rec.motorcycles'

In [60]:
# Score: mean cross-validated score of the best parameter combination
gs_clf.best_score_        

0.915

In [61]:
# Params used: the value the grid search selected for each tuned parameter,
# listed in alphabetical order of parameter name
for param_name in sorted(parameters):
    print("{}: {!r}".format(param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: False
vect__ngram_range: (1, 2)
