In [3]:
import pyprind
import pandas as pd
import os
import sys
import numpy as np
sys.getdefaultencoding()

'utf-8'

In [4]:
# Read every review file under the aclImdb {test,train}/{pos,neg} folders and
# collect [text, label] rows. The rows are gathered in a plain list and the
# DataFrame is built once after the loop: calling DataFrame.append per file
# copies the whole frame on every iteration, and the method was removed in
# pandas 2.0.
pbar = pyprind.ProgBar(50000)  # one update() per file; 50000 presumably matches the corpus size
label = {'pos':1,'neg':0}
rows = []
for split in ('test','train'):
    for sentiment_dir in ('pos','neg'):
        path = '/Users/MLUSER/Documents/aclImdb/%s/%s'%(split,sentiment_dir)
        for fname in os.listdir(path):
            with open(os.path.join(path,fname),'r', encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt,label[sentiment_dir]])
            pbar.update()
# Columns are positional (0, 1) here, exactly as the append-based version produced.
df = pd.DataFrame(rows)


0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:02:20


In [5]:
# Name the two positional columns: the review text and its 0/1 sentiment label.
df.columns = ['review','sentiment']

In [6]:
# Shuffle the rows with a fixed seed, then persist the shuffled frame as CSV
# (without the index column).
np.random.seed(0)
shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)
df.to_csv(
    '/Users/MLUSER/Documents/aclImdb/moive_data.csv',
    encoding='utf-8',
    index=False,
)

In [7]:
# Reload the reviews from the CSV file on disk.
df = pd.read_csv('/Users/MLUSER/Documents/aclImdb/moive_data.csv')

# Preview the first rows.
df.head()

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
#
# Bag-of-words demo: learn a vocabulary from three toy sentences, then show
# the word -> index mapping and the per-document term counts.
docs = np.array([
    'the sun is shining',
    'the weather is sweet',
    'the sun is shining and the weather is sweet',
])
count = CountVectorizer()
bag = count.fit_transform(docs)
print(count.vocabulary_)
print(bag.toarray())

{'weather': 6, 'and': 0, 'sweet': 4, 'shining': 2, 'the': 5, 'is': 1, 'sun': 3}
[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


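As we can see from executing the preceding command, the vocabulary is stored in a Python dictionary that maps each unique word to an integer index. The array printed below the dictionary contains the feature vectors that we just created: each column position corresponds to the integer index of a word in the vocabulary, and each value is the number of times that word occurs in the document.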

When we are analyzing text data, we often encounter words that occur across multiple documents from both classes. Those frequently occurring words typically don't contain useful or discriminatory information. In this subsection, we will learn about a useful technique called term frequency-inverse document frequency (tf-idf) that can be used to downweight those frequently occurring words in the feature vectors. The tf-idf can be defined as the product of the term frequency and the inverse document frequency:

$$\text{tf-idf}(t,d)=\text{tf}(t,d)\times \text{idf}(t,d)$$

Here the tf(t, d) is the term frequency that we introduced in the previous section,
and the inverse document frequency *idf(t, d)* can be calculated as:

$$\text{idf}(t,d) = \text{log}\frac{n_d}{1+\text{df}(d, t)},$$

where $n_d$ is the total number of documents, and *df(d, t)* is the number of documents *d* that contain the term *t*. Note that adding the constant 1 to the denominator is optional and serves the purpose of assigning a non-zero value to terms that occur in all training samples; the log is used to ensure that low document frequencies are not given too much weight.

Scikit-learn implements yet another transformer, the `TfidfTransformer`, that takes the raw term frequencies from `CountVectorizer` as input and transforms them into tf-idfs:

In [47]:
from sklearn.feature_extraction.text import TfidfTransformer
# Turn the raw term counts in `bag` into tf-idf values and show them.
# The original cell only constructed the transformer; applying it here
# completes the demonstration announced in the text above.
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)  # two decimals keep the printed matrix compact
print(tfidf.fit_transform(bag).toarray())

## <font color='red'> Cleaning text data </font>

- 去除不需要的字符
- 文本內會有包含HTML標記,標點符號和其他非字母字符

In [48]:
# Last 50 characters of the first review; the output shows leftover HTML tags.
df.loc[0,'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [49]:
import re
#http://plcdn.qiniudn.com/wp-content/uploads/2011/12/regular-expressions-cheat-sheet-v2.png

In [50]:
def preprocessor(text):
    """Strip markup and punctuation from a review while keeping emoticons.

    HTML-like tags are removed, emoticons such as ``:)``, ``;-(`` or ``=D``
    are collected from the tag-free text, the text is lower-cased and every
    run of non-word characters is collapsed into a single space. The
    collected emoticons (with any ``-`` removed) are appended at the end,
    separated by single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by the emoticons found in it.
    """
    # Raw string literals: '\)', '\(' and '\W' are not valid str escapes and
    # trigger an invalid-escape warning in plain literals.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    tail = ' '.join(emoticons).replace('-', '')
    # When the cleaned text ends in a word character, add a separator so the
    # first emoticon does not fuse with the last word (e.g. 'really:)').
    if tail and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + tail

In [57]:
# Try the cleaner on the first review and display the result.
a = preprocessor(df.loc[0,'review'])
a

'in 1974 the teenager martha moxley maggie grace moves to the high class area of belle haven greenwich connecticut on the mischief night eve of halloween she was murdered in the backyard of her house and her murder remained unsolved twenty two years later the writer mark fuhrman christopher meloni who is a former la detective that has fallen in disgrace for perjury in o j simpson trial and moved to idaho decides to investigate the case with his partner stephen weeks andrew mitchell with the purpose of writing a book the locals squirm and do not welcome them but with the support of the retired detective steve carroll robert forster that was in charge of the investigation in the 70 s they discover the criminal and a net of power and money to cover the murder murder in greenwich is a good tv movie with the true story of a murder of a fifteen years old girl that was committed by a wealthy teenager whose mother was a kennedy the powerful and rich family used their influence to cover the mur

In [58]:
# Overwrite every review with the output of preprocessor() for that review.
df['review'] = df['review'].apply(preprocessor)

## Processing documents into tokens

In [41]:
from nltk.stem.porter import PorterStemmer

# Module-level stemmer instance used by tokenizer_porter below.
porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens


def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with ``porter``."""
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

In [44]:
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [45]:
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [42]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MLUSER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [43]:
from nltk.corpus import stopwords

# Stem a sample sentence and drop every stem that is an English stop word.
stop = stopwords.words('english')
[
    stem
    for stem in tokenizer_porter('a runner likes running and runs a lot')[-10:]
    if stem not in stop
]

['runner', 'like', 'run', 'run', 'lot']

# Training a logistic regression model for document classification

In [60]:
df.head()

Unnamed: 0,review,sentiment
0,in 1974 the teenager martha moxley maggie grac...,1
1,ok so i really like kris kristofferson and his...,0
2,spoiler do not read this if you think about w...,0
3,hi for all the people who have seen this wonde...,1
4,i recently bought the dvd forgetting just how ...,0


In [62]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [80]:
# Chain tf-idf feature extraction and logistic regression into one estimator.
tfidf = TfidfVectorizer()
lr = LogisticRegression()
lr_tfidf = Pipeline([('vec',tfidf),('lr',lr)])

# Search space for GridSearchCV: a list of dicts whose keys follow the
# '<step name>__<parameter>' convention of the pipeline above ('vec', 'lr').
# The original placeholder [{'vec__'}] was a list holding a *set*, which
# GridSearchCV cannot interpret as a parameter grid.
param_grid = [
    # tf-idf weighted features
    {'vec__ngram_range': [(1, 1)],
     'vec__stop_words': [stop, None],
     'vec__tokenizer': [tokenizer, tokenizer_porter],
     'lr__C': [1.0, 10.0, 100.0]},
    # raw term frequencies: idf re-weighting and normalisation switched off
    {'vec__ngram_range': [(1, 1)],
     'vec__stop_words': [stop, None],
     'vec__tokenizer': [tokenizer, tokenizer_porter],
     'vec__use_idf': [False],
     'vec__norm': [None],
     'lr__C': [1.0, 10.0, 100.0]},
]


In [82]:
lr_tfidf.get_params().keys()

dict_keys(['vec__encoding', 'lr__fit_intercept', 'lr__max_iter', 'vec__stop_words', 'lr', 'vec__use_idf', 'vec__max_df', 'vec__analyzer', 'lr__intercept_scaling', 'vec__norm', 'lr__class_weight', 'lr__solver', 'lr__multi_class', 'lr__n_jobs', 'lr__tol', 'vec__tokenizer', 'vec__binary', 'vec__ngram_range', 'lr__random_state', 'lr__dual', 'vec__lowercase', 'lr__verbose', 'vec__smooth_idf', 'vec__dtype', 'vec__sublinear_tf', 'vec__decode_error', 'vec__strip_accents', 'vec__vocabulary', 'lr__penalty', 'vec__preprocessor', 'steps', 'memory', 'vec__token_pattern', 'lr__C', 'vec__input', 'vec', 'lr__warm_start', 'vec__max_features', 'vec__min_df'])