In [1]:
%matplotlib inline
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_20newsgroups

from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import string

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB



In [2]:
# Restrict the 20 newsgroups corpus to four topics.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the training split; the fixed random_state makes the shuffle repeatable.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [3]:
# Learn the vocabulary from the training documents and turn them into a
# document-term count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Displayed as (n_documents, n_vocabulary_terms).
X_train_counts.shape

(2257, 35788)

In [4]:
# Look up the feature (column) index the fitted vectorizer assigned to the
# token 'algorithm'.
count_vect.vocabulary_.get(u'algorithm')

4690

In [5]:
# Term-frequency-only weighting: use_idf=False switches off the inverse
# document frequency re-weighting.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# The transform re-weights entries only, so the shape matches X_train_counts.
X_train_tf.shape

(2257, 35788)

In [6]:
# Full tf-idf weighting (default settings), fitted and applied in one call.
# This fitted transformer is reused later to transform new documents.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [7]:
# Train a multinomial naive Bayes classifier on the tf-idf features, using the
# newsgroup indices in twenty_train.target as labels.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [8]:
# A few unseen snippets to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'Act now for instant money savings',
    'win a free ipad today',
]

# Reuse the already-fitted vectorizer and tf-idf transformer (transform only,
# no re-fit) so the columns match the features the classifier was trained on.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Predictions are integer class indices; map them back to newsgroup names.
for text, label_idx in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_idx]
    print('%r => %s' % (text, label_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'Act now for instant money savings' => soc.religion.christian
'win a free ipad today' => soc.religion.christian


In [9]:
# Chain vectorizer -> tf-idf -> classifier so the three steps can be fitted
# and applied as a single estimator working directly on raw text.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [10]:
# Fit every pipeline step on the raw training documents. The result is
# assigned back to text_clf, so the cell displays no output.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [11]:
# Load the test split for the same four categories.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label is correct.
(predicted == twenty_test.target).mean()

0.83488681757656458

In [12]:
from sklearn.linear_model import SGDClassifier

# Same text front end as before, but with a linear model trained by stochastic
# gradient descent on the hinge loss in place of naive Bayes.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          n_iter=5, random_state=42)),
])
_ = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

# Accuracy on the held-out test documents.
(predicted == twenty_test.target).mean()


0.9127829560585885

In [13]:
from sklearn import metrics

# Per-class precision / recall / F1 for the most recent predictions.
print(metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
))

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502



In [14]:
# Confusion matrix for the same predictions: true labels are passed first,
# predicted labels second.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[258,  11,  15,  35],
       [  4, 379,   3,   3],
       [  5,  33, 355,   3],
       [  5,  10,   4, 379]])

In [15]:
from sklearn.grid_search import GridSearchCV

# Search space; keys follow the '<pipeline step>__<parameter>' convention so
# that each value list is routed to the matching step of text_clf.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [16]:
# Exhaustive search over `parameters` for the text_clf pipeline;
# n_jobs=-1 asks for all available CPU cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)

In [17]:
# Run the search on only the first 400 training documents (and their labels).
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [18]:
# predict() returns an array even for a single document. Indexing a Python
# list with that array is deprecated (and an error on current NumPy), so take
# the single predicted class index explicitly before the name lookup.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

  if __name__ == '__main__':


'soc.religion.christian'

In [19]:
# Report the best hyper-parameter combination found by the grid search and its
# mean cross-validated score. best_params_ / best_score_ are the supported
# accessors; they replace the manual max() over grid_scores_ (an attribute
# that was later deprecated and removed) and avoid rebinding `_`, which
# IPython uses for the previous cell output.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

score

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)


0.90000000000000002

# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||




# Exercise 1

In [20]:
# """Build a language detector model

# The goal of this exercise is to train a linear classifier on text features
# that represent sequences of up to 3 consecutive characters so as to
# recognize natural languages by using the frequencies of short character
# sequences as 'fingerprints'.

# """
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer


In [21]:

# Folder holding the language paragraphs; the path is hard-coded here and is
# resolved relative to the notebook's working directory.
# languages_data_folder = '../scikit-learn/doc/tutorial/text_analytics/data/languages/'
languages_data_folder = 'language_data/paragraphs'
# Load the documents and their labels from that folder.
dataset = load_files(languages_data_folder)


In [22]:
# Hold out half of the documents as a test set. No random_state is given, so
# the split (and every score computed from it) varies between runs.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    test_size=0.5,
)


In [23]:

# TASK: Build a vectorizer that splits strings into sequences of 1 to 3
# characters instead of word tokens.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    use_idf=False,
)

In [24]:

# TASK: Build a vectorizer / classifier pipeline using the previous analyzer.
# The pipeline instance should be stored in a variable named clf.
clf = Pipeline([
    ('vec', vectorizer),
    ('clf', Perceptron()),
])



In [25]:
# TASK: Fit the pipeline on the training set.
# The call is the cell's last expression, so the fitted pipeline's repr is
# shown as the cell output.

clf.fit(docs_train, y_train)


Pipeline(steps=[('vec', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
   ...n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
      verbose=0, warm_start=False))])

In [26]:
# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = clf.predict(docs_test)

# Accuracy: share of test documents whose language was predicted correctly.
(y_predicted == y_test).mean()


0.98190045248868774

In [27]:
# Per-language precision / recall / F1 on the test set.
print(metrics.classification_report(
    y_test,
    y_predicted,
    target_names=dataset.target_names,
))


             precision    recall  f1-score   support

         ar       1.00      1.00      1.00        16
         de       1.00      0.98      0.99        47
         en       0.98      1.00      0.99        54
         es       1.00      0.95      0.97        60
         fr       0.91      1.00      0.95        48
         it       0.96      0.98      0.97        45
         ja       1.00      1.00      1.00        38
         nl       1.00      1.00      1.00        23
         pl       1.00      0.96      0.98        26
         pt       1.00      0.98      0.99        55
         ru       1.00      0.97      0.98        30

avg / total       0.98      0.98      0.98       442



In [28]:

# Compute the confusion matrix (true labels first, predictions second) and
# print it as text; no plot is drawn in this cell.
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)



[[16  0  0  0  0  0  0  0  0  0  0]
 [ 0 46  0  0  1  0  0  0  0  0  0]
 [ 0  0 54  0  0  0  0  0  0  0  0]
 [ 0  0  0 57  2  1  0  0  0  0  0]
 [ 0  0  0  0 48  0  0  0  0  0  0]
 [ 0  0  0  0  1 44  0  0  0  0  0]
 [ 0  0  0  0  0  0 38  0  0  0  0]
 [ 0  0  0  0  0  0  0 23  0  0  0]
 [ 0  0  0  0  0  1  0  0 25  0  0]
 [ 0  0  0  0  1  0  0  0  0 54  0]
 [ 0  0  1  0  0  0  0  0  0  0 29]]


In [29]:

# Optional visualisation of the confusion matrix, left disabled:
#import pylab as pl
#pl.matshow(cm, cmap=pl.cm.jet)
#pl.show()

# Try the fitted language detector on a few short, unseen sentences.
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

# Predictions are class indices; translate them to language codes for display.
for sentence, lang_idx in zip(sentences, predicted):
    lang_name = dataset.target_names[lang_idx]
    print(u'The language of "%s" is "%s"' % (sentence, lang_name))



The language of "This is a language detection test." is "en"
The language of "Ceci est un test de détection de la langue." is "fr"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "de"






# |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

# Exercise 2

In [30]:
# """Build a sentiment analysis / polarity model
# Sentiment analysis can be cast as a binary text classification problem,
# that is fitting a linear classifier on features extracted from the text
# of the user messages so as to guess whether the opinion of the author is
# positive or negative.
# In this example we will use a movie review dataset.
# """
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used
# in the following cells - confirm that a blanket filter is intended.
warnings.filterwarnings('ignore')

In [31]:
if __name__ == "__main__":
    # NOTE: the code below sits inside an 'if __name__ == "__main__"' guard so
    # that a multi-core grid search also works under Windows, see:
    # http://docs.python.org/library/multiprocessing.html#windows
    # GridSearchCV relies on joblib.Parallel whenever n_jobs != 1, and
    # joblib.Parallel uses the multiprocessing module as its backend.

    # Folder containing the movie review corpus (hard-coded relative path).
    movie_reviews_data_folder = './txt_sentoken/'
    dataset = load_files(movie_reviews_data_folder, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # Keep a quarter of the reviews aside for the final evaluation.
    # random_state=None leaves the split unseeded, so it differs between runs.
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data,
        dataset.target,
        test_size=0.25,
        random_state=None,
    )

    # TASK: Build a vectorizer / classifier pipeline that filters out tokens
    # that are too rare or too frequent (min_df / max_df on the vectorizer).
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),
        ('clf', LinearSVC(C=1000)),
    ])

    # TASK: Build a grid search to find out whether unigrams or bigrams are
    # more useful, then fit the pipeline on the training set through it.
    parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
    }
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)
    grid_search.fit(docs_train, y_train)

    # TASK: print the cross-validated scores for each parameter set explored
    # by the grid search.
    print(grid_search.grid_scores_)

    # TASK: Predict the outcome on the testing set and store it in a variable
    # named y_predicted.
    y_predicted = grid_search.predict(docs_test)

    # Per-class precision / recall / F1 on the held-out reviews.
    print(metrics.classification_report(
        y_test,
        y_predicted,
        target_names=dataset.target_names,
    ))

    # Confusion matrix, printed as text (true labels first, predictions second).
    cm = metrics.confusion_matrix(y_test, y_predicted)
    print(cm)

    # Optional plot of the matrix, left disabled:
    #import matplotlib.pyplot as plt
    #plt.matshow(cm)
    #plt.show()

n_samples: 2000
[mean: 0.82600, std: 0.00438, params: {'vect__ngram_range': (1, 1)}, mean: 0.84200, std: 0.01046, params: {'vect__ngram_range': (1, 2)}]
             precision    recall  f1-score   support

        neg       0.87      0.87      0.87       245
        pos       0.87      0.88      0.88       255

avg / total       0.87      0.87      0.87       500

[[212  33]
 [ 31 224]]
