In [54]:
# Mount Google Drive into the Colab filesystem at /gdrive.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [201]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV


## **Loading the 20 newsgroups dataset**

---



In [101]:
# Fetch the default (training) split with every category, then list the
# newsgroup names that are available to choose from.
data = fetch_20newsgroups(subset="train")
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [67]:
# The four newsgroups this notebook trains and evaluates on.
categories = [
    'talk.politics.guns',
    'sci.crypt',
    'talk.religion.misc',
    'rec.autos',
]

## **Displaying the file content**

In [223]:
# Load the training split restricted to the selected categories, shuffled
# with a fixed seed.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
# Print one raw post to see what a single document looks like.
print(twenty_train.data[7])

From: CPKJP@vm.cc.latech.edu (Kevin Parker)
Subject: Insurance Rates on Performance Cars SUMMARY
Organization: Louisiana Tech University
Lines: 244
NNTP-Posting-Host: vm.cc.latech.edu
X-Newsreader: NNR/VM S_1.3.2

     I recently posted an article asking what kind of rates single, male
drivers under 25 yrs old were paying on performance cars. Here's a summary of
the replies I received.
 
 
 
 
-------------------------------------------------------------------------------
 
I'm not under 25 anymore (but is 27 close enough).
 
1992 Dodge Stealth RT/Twin Turbo (300hp model).
No tickets, no accidents, own a house, have taken defensive driving 1,
airbag, abs, security alarm, single.
 
$1500/year  $500 decut. State Farm Insurance (this includes the additional $100
for the $1,000,000 umbrella policy over my car and house)  The base
policy is the standard $100,000 - $100,000 - $300,000 policy required in DE.
 
After 2nd defensive driving course it will be 5% less.
 
I bought the car in Septem

In [69]:
# Category names of the loaded subset; an integer label indexes into this list.
twenty_train.target_names

['rec.autos', 'sci.crypt', 'talk.politics.guns', 'talk.religion.misc']

In [70]:
# Number of training documents.
len(twenty_train.data)

2112

In [71]:
# Number of source filenames, for comparison with the document count.
len(twenty_train.filenames)

2112

In [74]:
# Show only the first three lines of the document at index 3.
print(*twenty_train.data[3].split("\n")[:3], sep="\n")

From: karn@servo.qualcomm.com (Phil Karn)
Subject: Re: Keeping Your Mouth Shut (was: Hard drive security)
Nntp-Posting-Host: servo.qualcomm.com


In [75]:
# Map the first document's integer label back to its category name.
print(twenty_train.target_names[twenty_train.target[0]])

talk.politics.guns


In [76]:
# Integer labels of the first ten training documents.
twenty_train.target[:10]

array([2, 2, 1, 1, 2, 0, 3, 0, 2, 1])

In [81]:
# Translate the first ten integer labels into their category names.
for label in twenty_train.target[:10]:
    print(twenty_train.target_names[label])

talk.politics.guns
talk.politics.guns
sci.crypt
sci.crypt
talk.politics.guns
rec.autos
talk.religion.misc
rec.autos
talk.politics.guns
sci.crypt


## **Tokenizing text with TfidfVectorizer**

In [152]:
# Fit the TF-IDF vectorizer on the training documents and transform them in
# one step; the fitted vectorizer is reused later for new text.
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(twenty_train.data)
# Shape of the resulting matrix: (documents, features).
X_train_tfidf.shape



(2112, 36712)

In [171]:
# Look up the feature index the fitted vectorizer assigned to "algorithm".
tfidf_vect.vocabulary_.get('algorithm')


5917

## **Training a Multinomial Naive Bayes classifier**

In [172]:
# Fit a multinomial Naive Bayes model on the TF-IDF features and integer labels.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [175]:
# Classify two ad-hoc sentences. They go through transform() only, so they are
# encoded with the vectorizer that was already fitted on the training data.
docs_new = ['Hi, my name is Navya', 'Pursuing masters in UNCC']
X_new_tfidf = tfidf_vect.transform(docs_new)
predicted = clf.predict(X_new_tfidf)

# Pair each sentence with the name of its predicted category.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'Hi, my name is Navya' => rec.autos
'Pursuing masters in UNCC' => sci.crypt


## **Building a pipeline that vectorizes the data and trains a classifier**

In [190]:

# Chain vectorization and classification so raw text can be passed straight in.
text_clf = Pipeline(
    steps=[
        ('vect', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
)

In [191]:
# Fit the whole pipeline on the raw training text and its labels.
text_clf.fit(twenty_train.data, twenty_train.target)


Pipeline(steps=[('vect', TfidfVectorizer()), ('clf', MultinomialNB())])

## **Evaluation of the performance on the test set**


In [192]:
# Load the test split for the same categories, predict with the fitted
# pipeline, and report accuracy as the fraction of matching labels.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.8798862828713575

In [193]:
# Rebind text_clf to a second pipeline: same TF-IDF step, but the classifier
# is an SGD-trained linear model (hinge loss, L2 penalty) instead of Naive Bayes.
# NOTE(review): max_iter=5 with tol=None presumably forces exactly five passes
# with no early stopping — confirm against the SGDClassifier documentation.
text_clf = Pipeline([('vect', TfidfVectorizer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                      alpha=1e-3, random_state=42,
                      max_iter=5, tol=None)),
                     ])

In [194]:
# Fit the pipeline currently bound to text_clf on the raw training text.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', TfidfVectorizer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [195]:
# Predict on the same test documents and report the fraction of correct labels.
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.9474058280028429

In [196]:
# Per-category precision, recall and F1 for the most recent predictions.
print(
    metrics.classification_report(
        twenty_test.target,
        predicted,
        target_names=twenty_test.target_names,
    )
)

                    precision    recall  f1-score   support

         rec.autos       0.90      1.00      0.95       396
         sci.crypt       0.98      0.95      0.97       396
talk.politics.guns       0.96      0.93      0.94       364
talk.religion.misc       0.96      0.88      0.92       251

          accuracy                           0.95      1407
         macro avg       0.95      0.94      0.94      1407
      weighted avg       0.95      0.95      0.95      1407



In [205]:
# Hyper-parameter grid; each key is "<pipeline step name>__<parameter name>".
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'clf__alpha': (1e-2, 1e-3),
}

In [206]:
# Grid search over `parameters` for the text_clf pipeline, scored with
# 5-fold cross-validation; n_jobs=-1 requests all available processors.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [207]:
# Run the search on the first 400 training documents only.
# NOTE(review): presumably a speed trade-off; the scores below describe this
# subset, not the full training set.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [208]:
# Predict with the fitted search object and map the label to its category name.
twenty_train.target_names[gs_clf.predict(['My name is Navya'])[0]]

'rec.autos'

In [209]:
# Mean cross-validated score of the best parameter combination.
gs_clf.best_score_

0.925

In [210]:
# Report the winning value of every searched parameter, in name order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
vect__ngram_range: (1, 1)


## **Detailed summary**

In [225]:
# Full grid-search results: timings and test scores for every candidate and split.
gs_clf.cv_results_

{'mean_fit_time': array([0.20294218, 0.7544692 , 0.19458795, 0.73895764]),
 'mean_score_time': array([0.03600707, 0.07819118, 0.03518462, 0.07582355]),
 'mean_test_score': array([0.91 , 0.92 , 0.925, 0.92 ]),
 'param_clf__alpha': masked_array(data=[0.01, 0.01, 0.001, 0.001],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_vect__ngram_range': masked_array(data=[(1, 1), (1, 2), (1, 1), (1, 2)],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'clf__alpha': 0.01, 'vect__ngram_range': (1, 1)},
  {'clf__alpha': 0.01, 'vect__ngram_range': (1, 2)},
  {'clf__alpha': 0.001, 'vect__ngram_range': (1, 1)},
  {'clf__alpha': 0.001, 'vect__ngram_range': (1, 2)}],
 'rank_test_score': array([4, 2, 1, 2], dtype=int32),
 'split0_test_score': array([0.85  , 0.85  , 0.8875, 0.8875]),
 'split1_test_score': array([0.925 , 0.925 , 0.925 , 0.9125]),
 'split2_test_score': array([0.9125, 

In [227]:
%%shell
# Export the notebook to HTML. Drive is mounted at /gdrive by the first cell,
# so the file is reached through /gdrive/MyDrive rather than /content/drive/MyDrive.
jupyter nbconvert --to html /gdrive/MyDrive/BigData/InfoFeatures.ipynb
