In [7]:
import nltk 
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# The stop-word list is a separate NLTK corpus; fetch it so this cell also
# runs on a fresh environment where only 'movie_reviews' was downloaded.
nltk.download('stopwords')

# NOTE: despite the singular name, this holds the full list of file ids.
fileid = movie_reviews.fileids()

# Keep only runs of three or more word characters.
tokenizer = RegexpTokenizer(r"\w{3,}")
english_stops = set(stopwords.words('english'))

# Raw text of every review, one string per file id.
documents = [movie_reviews.raw(fid) for fid in fileid]

# Tokenize each review and drop English stop words.
tokens = [
    [token for token in tokenizer.tokenize(doc) if token not in english_stops]
    for doc in documents
]

# Corpus-wide frequency of every surviving token.
word_count = {}
for text in tokens:
    for word in text:
        word_count[word] = word_count.get(word, 0) + 1

# Vocabulary ordered from most to least frequent token.
sorted_features = sorted(word_count, key=word_count.get, reverse=True)

print(sorted_features[:10])

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\홍사빈\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much']


In [8]:
# Size of the feature vocabulary: the N most frequent tokens are kept.
NUM_WORD_FEATURES = 100
word_features = sorted_features[:NUM_WORD_FEATURES]

def document_features(document, word_features):
    """Return the count of each feature word within one document.

    Args:
        document: iterable of tokens making up the document.
        word_features: ordered sequence of words to count.

    Returns:
        A list with one integer per entry of ``word_features`` (same order),
        giving how many times that word occurs in ``document``; words that
        never occur get 0.
    """
    # Tally every token of the document once.
    tallies = {}
    for token in document:
        if token in tallies:
            tallies[token] += 1
        else:
            tallies[token] = 1

    # Project the tally onto the requested feature words.
    return [tallies.get(feature, 0) for feature in word_features]

# One count vector per tokenized review, aligned with word_features.
feature_sets = [
    document_features(doc_tokens, word_features)
    for doc_tokens in tokens
]

In [9]:
# Raw (untokenized) text of every review, one string per file id.
reviews = list(map(movie_reviews.raw, movie_reviews.fileids()))

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Fix the vectorizer's vocabulary to the precomputed feature words instead of
# letting it learn one from the data.
cv = CountVectorizer(
    vocabulary=word_features,
)
print(cv)

CountVectorizer(vocabulary=['film', 'one', 'movie', 'like', 'even', 'good',
                            'time', 'story', 'would', 'much', 'character',
                            'also', 'get', 'two', 'well', 'characters', 'first',
                            'see', 'way', 'make', 'life', 'really', 'films',
                            'plot', 'little', 'people', 'could', 'scene', 'man',
                            'bad', ...])


In [11]:
# Turn the raw reviews into a document-term count matrix.
reviews_cv = cv.fit_transform(reviews)

# Show the vectorizer's first feature names next to the first entries of
# word_features so the two orderings can be compared.
for names in (cv.get_feature_names_out(), word_features):
    print(names[:20])

['film' 'one' 'movie' 'like' 'even' 'good' 'time' 'story' 'would' 'much'
 'character' 'also' 'get' 'two' 'well' 'characters' 'first' 'see' 'way'
 'make']
['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much', 'character', 'also', 'get', 'two', 'well', 'characters', 'first', 'see', 'way', 'make']


In [12]:
# Report the container type and dimensions of the count matrix.
for label, value in (
    ('#type of count vectors: ', type(reviews_cv)),
    ('#shape of count vectors: ', reviews_cv.shape),
):
    print(label, value)

# First ten feature columns of the first review.
print('#sample of count vector:')
print(reviews_cv[0, :10])

#type of count vectors:  <class 'scipy.sparse._csr.csr_matrix'>
#shape of count vectors:  (2000, 100)
#sample of count vector:
  (0, 0)	6
  (0, 1)	3
  (0, 2)	6
  (0, 3)	3
  (0, 4)	3
  (0, 5)	2
  (0, 8)	1
