In [1]:
# Setup
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading extenrnal modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Use scikit-learn to Experiment with 20newsgroups data

In [2]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

In [3]:
# Load the data
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
    categories=categories, shuffle=True, random_state=42)

In [4]:
print twenty_train.keys()
print twenty_train.target #.shape
print len(twenty_train.data)

['description', 'DESCR', 'filenames', 'target_names', 'data', 'target']
[1 1 3 ..., 2 2 2]
2257


In [5]:
print twenty_train.target_names
print twenty_train.data[0]

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.



In [6]:
# Extract 'count' feature based on 'bag-of-words' format
from sklearn.feature_extraction.text import CountVectorizer
import time
tic = time.time()
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print time.time() - tic
X_train_counts.shape

0.735000133514


(2257, 35788)

In [7]:
# Compute 'tf', 'tf-idf'feature from 'count' feature.
# Usually, we choose 'tf-idf' as the trained feature
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [8]:
# Train a classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [9]:
# Predict new variable
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [10]:
# Build a pipeline to integrate all above steps
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [11]:
# Predict pipeline
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

In [12]:
# Evaluation the performance on the test set
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)  

0.83488681757656458

# Use scikit-learn to Experiment with IMDB data

In [13]:
# Get IMDB data
import os
def getIMDBData(dirPre, dataLabel):
    res = {}
    for dl in dataLabel:
        dirPath = os.path.join(dirPre, dl)
        fileNames = os.listdir(dirPath)

        docs = []
        for fN in fileNames:
            doc = open(os.path.join(dirPath, fN), 'r').read()
            docs.append(doc)
            
        res[dl] = docs
        
    return res
    
dataPathPre = os.path.join('dataset', 'hw1_dataset_nb')
devType = ['train', 'test']
dataLabel = ['pos', 'neg']

IMDB_train = getIMDBData(os.path.join(dataPathPre, devType[0]), dataLabel)
IMDB_test = getIMDBData(os.path.join(dataPathPre, devType[1]), dataLabel)
IMDB_stop_words = open(os.path.join(dataPathPre, 'sw.txt'), 'r').read()
stop_words = IMDB_stop_words.split()

In [14]:
# Extract 'counts' feature based on 'bag-of-word'
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer="word", stop_words=stop_words)
X_train_counts = count_vect.fit_transform(IMDB_train['pos'] + IMDB_train['neg'])
X_train_counts.shape

(25000, 74195)

In [15]:
# Get 'tf' feature from 'counts' feature
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(25000, 74195)

In [16]:
# Get 'tf-idf' feature from 'counts' feature
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(25000, 74195)

In [17]:
# Train the MultinomialNB based on 'tf-idf' feature,
# and their corresponding targets.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, [1] * 12500 + [0] * 12500)

In [18]:
# Predict the feature
docs_new = IMDB_test['neg'][0:5] + IMDB_test['pos'][0:5]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

docs_target_names = ['negative', 'positive']
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, docs_target_names[category]))
    print '\n'

"Once again Mr  Costner has dragged out a movie for far longer than necessary  Aside from the terrific sea rescue sequences  of which there are very few I just did not care about any of the characters  Most of us have ghosts in the closet  and Costner's character are realized early on  and then forgotten until much later  by which time I did not care  The character we should really care about is a very cocky  overconfident Ashton Kutcher  The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet  His only obstacle appears to be winning over Costner  Finally when we are well past the half way point of this stinker  Costner tells us all about Kutcher's ghosts  We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing  No magic here  it was all I could do to keep from turning it off an hour in " => negative


"This is an example of why the majority of action films are the same  Generic and b

In [19]:
# Build pipeline of above steps
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])

In [20]:
# Train pipeline
text_clf = text_clf.fit(IMDB_train['pos'] + IMDB_train['neg'], [1] * 12500 + [0] * 12500)

In [21]:
# Evaluate performance of test set
import numpy as np
predicted = text_clf.predict(IMDB_test['pos'] + IMDB_test['neg'])
np.mean(predicted == [1] * 12500 + [0] * 12500)  

0.83072000000000001