# NewGroups Dataset with scikit-learn

In [46]:
categories = ['alt.atheism', 'soc.religion.christian','comp.graphics', 'sci.med']

# load the list of files matching those categories
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)

# The returned dataset is a scikit-learn “bunch”: a simple holder object with fields 
# that can be both accessed as python dict keys or object attributes for convenience, 
# for instance the target_names holds the list of the requested category names
print(twenty_train.target_names)

print(len(twenty_train.data))

print(len(twenty_train.filenames))

# first lines of the first file
print("\n".join(twenty_train.data[0].split("\n")[:3]))

# the label of the file is the folder which hold it. target_names(label) -> target(news)
print(twenty_train.target_names[twenty_train.target[0]])

# scikit-leanr holds file as array of integer that corresponds to the index
# of the category name. 
print(twenty_train.target[:10])

# get back category name
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
2257
2257
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
comp.graphics
[1 1 3 3 3 3 3 2 2 2]
comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


#### Tokenizing text with scikit-learn

In [124]:
from sklearn.feature_extraction.text import CountVectorizer

# build a dictionary of features and trasnforms documents to feature vectors
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape)

print(count_vect.vocabulary_.get(u'algorithm')) 
#the result of this print is the frequency of this word in the whole corpus

(2257, 35788)
4690


### TermFrequency (TF) and InverseDocumentFrequency (IDF)

Ora trasformiamo la matrice di X[i,j], dove i è l'indice della parola e j è il count della parola,
in una matrice Y[i,j] dove i sarà la Term Frequency e j la Inverse Document Frequency

In [48]:
# TF: count of a word in doc / total number of words

from sklearn.feature_extraction.text import TfidfTransformer

# fit method to fit our estimator to the tada
tf_transf = TfidfTransformer(use_idf=False).fit(X_train_counts)
# transform method to transofmr the count-matrix to a tf-idf representation
X_train_tf = tf_transf.transform(X_train_counts)
print(X_train_tf.shape)

# Si può usare questa per fare i due passaggi insieme:
tfid_trasnf = TfidfTransformer()
X_train_tfidf = tfid_trasnf.fit_transform(X_train_counts)
print(X_train_tfidf.shape)


(2257, 35788)
(2257, 35788)


## Training a Classifier

### Naive Bayes classifier

1. .fit => per fittare gli estimator ai data
2. .Transform => per trasformare la count-matrix to tf-idf representation

In [49]:
from sklearn.naive_bayes import MultinomialNB
# Carichiamo il modello di bayes
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# Queste sono le parole che deve predirre l'ouput. Le trasformiamo e basta perché sono già fittate
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_count = count_vect.transform(docs_new)
X_new_tfidf = tfid_trasnf.transform(X_new_count)

# Predict
predicted = clf.predict(X_new_tfidf)

for i in range(0,4):
    print(twenty_train.target_names[i])

for doc, category in zip(docs_new,predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))


alt.atheism
comp.graphics
sci.med
soc.religion.christian
'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


### Building a Pipeline

La pipeline è costituita da vectorizer => transformer => classifier. 
Scikit-learn ha un oggettto pipeline incorporato che velocizza questo processo

In [69]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
        ('vect',CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB)
])

### Evaluation of the performance on the test set

In [116]:
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
        ('vect',CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB)
])

# text_clf.fit(twenty_train.data, twenty_train.target)
print(twenty_train.target.shape)

import numpy as np

twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

(2257,)


NotFittedError: Vocabulary not fitted or provided

### SGCD classifier

In [79]:
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer),
    ('clf', SGDClassifier(loss='hinge',penalty=12,
                        alpha=1e-3, random_state=42,
                        max_iter=5, tol=None))
])

text_clf.fit(twenty_train.data, twenty_train.target)

predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

AttributeError: 'csr_matrix' object has no attribute 'fit'