In [1]:
# imports the movie_reviews corpus from NLTK

from nltk.corpus import movie_reviews

**download and set up the "movie_reviews" corpus from NLTK if not available**

In [2]:
# import nltk
# nltk.download('movie_reviews')

In [3]:
movie_reviews.categories()

['neg', 'pos']

In [4]:
# we have 2000 reviews in total 1k +ve and 1k -ve
movie_reviews.fileids("neg") # we are only accessing -ve reviews

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [5]:
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [6]:
# prepares text documents with their sentiment categories for analysis

documents = []
for category in movie_reviews.categories():
    for f_id in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(f_id), category))
documents[0:5]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg'),
 (['"', 'quest', 'for', 'camelot', '"', 'is', 'warner', ...], 'neg'),
 (['synopsis', ':', 'a', 'mentally', 'unstable', 'man', ...], 'neg')]

In [7]:
# Length of documents

len(documents)

2000

In [8]:
# Shuffling the documents for unbiased model training  

import random
random.shuffle(documents)
documents[0:10]

[(['m', ':', 'i', '-', '2', ',', 'the', 'sequel', 'to', ...], 'neg'),
 (['among', 'multitude', 'of', 'erotic', 'thrillers', ...], 'neg'),
 (['at', 'times', ',', 'you', "'", 'd', 'think', 'edtv', ...], 'neg'),
 (['seen', 'september', '13', ',', '1998', 'at', '4', ...], 'pos'),
 (['ladies', 'and', 'gentlemen', ',', '1997', "'", 's', ...], 'pos'),
 (['"', 'you', 'can', "'", 't', 'have', 'any', 'of', ...], 'neg'),
 (['there', 'are', 'times', 'when', 'the', 'success', ...], 'pos'),
 (['it', 'shows', 'that', 'america', 'remains', ...], 'pos'),
 (['when', 'bulworth', 'ended', ',', 'i', 'allowed', ...], 'pos'),
 (['note', ':', 'some', 'may', 'consider', 'portions', ...], 'pos')]

In [None]:
# Maps Part-of-Speech tags to WordNet POS tags for lemmatization

from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith("J"):
        return wordnet.ADJ
    elif tag.startswith("V"):
        return wordnet.VERB
    elif tag.startswith("N"):
         return wordnet.NOUN
    elif tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [None]:
from nltk import pos_tag
w = "better"
# pos_tag(w) # run this and u get an error 
# i.e., TypeError: tokens: expected a list of strings, got a string
# so we have to pass list
pos_tag([w])

In [11]:
from nltk.corpus import stopwords
import string
stops = set(stopwords.words("english"))
punc = list(string.punctuation)
stops.update(punc)

In [12]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [13]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [14]:
# When execute this, it's going to take some time 

documents = [(clean_review(document), category) for document, category in documents]

In [15]:
len(documents)

2000

In [16]:
# we clean the data
documents[0]

(['2',
  'sequel',
  'mission',
  'impossible',
  'james',
  'bond',
  'wannabe',
  'film',
  'fails',
  'even',
  'come',
  'close',
  'film',
  'wit',
  'humor',
  'entertainment',
  'value',
  'try',
  'spy',
  'romance',
  'movie',
  'without',
  'suspense',
  'film',
  'look',
  'like',
  'extend',
  'commercial',
  'dude',
  'think',
  'look',
  'cool',
  'throwaway',
  'sunglass',
  'film',
  'prefers',
  'techie',
  'gadget',
  'anything',
  'human',
  'coolest',
  'thing',
  'movie',
  'hole',
  'story',
  'trite',
  'thing',
  'movie',
  'usage',
  'dove',
  'throughout',
  'peace',
  'symbol',
  'film',
  'play',
  'wet',
  'fantasy',
  'dream',
  'techie',
  'violence',
  'except',
  'choreographed',
  'action',
  'sequence',
  'film',
  'dull',
  'three',
  'quarter',
  'time',
  'fill',
  'many',
  'dead',
  'spot',
  'story',
  'garner',
  'concern',
  'wooden',
  'character',
  'superficial',
  'romance',
  'developed',
  'action',
  'scene',
  'might',
  'look',
  'goo

In [17]:
# while creating this features should not be done with complete datasets it has to be done with training set only
# so we are going to split
train_docu = documents[0:1400]
test_docu = documents[1400:]

In [18]:
train_docu[0]

(['2',
  'sequel',
  'mission',
  'impossible',
  'james',
  'bond',
  'wannabe',
  'film',
  'fails',
  'even',
  'come',
  'close',
  'film',
  'wit',
  'humor',
  'entertainment',
  'value',
  'try',
  'spy',
  'romance',
  'movie',
  'without',
  'suspense',
  'film',
  'look',
  'like',
  'extend',
  'commercial',
  'dude',
  'think',
  'look',
  'cool',
  'throwaway',
  'sunglass',
  'film',
  'prefers',
  'techie',
  'gadget',
  'anything',
  'human',
  'coolest',
  'thing',
  'movie',
  'hole',
  'story',
  'trite',
  'thing',
  'movie',
  'usage',
  'dove',
  'throughout',
  'peace',
  'symbol',
  'film',
  'play',
  'wet',
  'fantasy',
  'dream',
  'techie',
  'violence',
  'except',
  'choreographed',
  'action',
  'sequence',
  'film',
  'dull',
  'three',
  'quarter',
  'time',
  'fill',
  'many',
  'dead',
  'spot',
  'story',
  'garner',
  'concern',
  'wooden',
  'character',
  'superficial',
  'romance',
  'developed',
  'action',
  'scene',
  'might',
  'look',
  'goo

In [19]:
# Now let's create array that containing all the words
all_words = []
for doc in train_docu:
    all_words += doc[0]


In [24]:
len(all_words)

499550

In [44]:
import nltk

In [45]:
# Here we are going to count freq of each words
frq = nltk.FreqDist(all_words)
common = frq.most_common(3000)
features = [i[0] for i in common] # most common words

In [46]:
def get_feature_dict(words):
    cur_features = {}
    words_set = set(words)
    for w in features:
        cur_features[w] = w in words_set 
        
    return cur_features 

In [47]:
# Eg of get_feature_dict function how it works
get_feature_dict(train_docu[0][0])

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': True,
 'see': True,
 'go': True,
 'time': True,
 'well': True,
 'scene': True,
 'even': True,
 'good': True,
 'story': True,
 'take': True,
 'would': False,
 'much': False,
 'come': False,
 'also': True,
 'give': False,
 'life': False,
 'look': False,
 'two': True,
 'way': True,
 'bad': True,
 'seem': False,
 'know': True,
 'end': True,
 'first': True,
 '--': True,
 'year': False,
 'work': False,
 'plot': True,
 'thing': True,
 'really': False,
 'say': True,
 'show': True,
 'people': True,
 'play': False,
 'little': True,
 'man': False,
 'could': True,
 'never': True,
 'star': True,
 'love': False,
 'new': False,
 'best': True,
 'try': True,
 'director': False,
 'great': True,
 'big': True,
 'action': False,
 'performance': False,
 'u': True,
 'many': False,
 'watch': False,
 'want': True,
 'actor': True,
 'think': True,
 'find': False,
 'role': True,
 'another': False,
 'audience': T

In [48]:
train_data = [(get_feature_dict(doc), category) for doc, category in train_docu]

In [49]:
test_data = [(get_feature_dict(doc), category) for doc, category in test_docu]

In [50]:
# we are going to use Naive bayes classifier in NLTK
from nltk import NaiveBayesClassifier

In [51]:
clf = NaiveBayesClassifier.train(train_data)

In [52]:
nltk.classify.accuracy(clf, test_data)

0.8116666666666666

In [53]:
clf.show_most_informative_features(15)

Most Informative Features
                   anger = True              pos : neg    =     10.9 : 1.0
                  hudson = True              neg : pos    =      9.4 : 1.0
              schumacher = True              neg : pos    =      8.7 : 1.0
             outstanding = True              pos : neg    =      8.6 : 1.0
                  poorly = True              neg : pos    =      8.3 : 1.0
                  martha = True              neg : pos    =      7.5 : 1.0
                   ideal = True              pos : neg    =      7.4 : 1.0
                  forgot = True              neg : pos    =      6.8 : 1.0
               stupidity = True              neg : pos    =      6.8 : 1.0
                  sloppy = True              neg : pos    =      6.5 : 1.0
                   mulan = True              pos : neg    =      6.5 : 1.0
             magnificent = True              pos : neg    =      6.3 : 1.0
               laughable = True              neg : pos    =      6.2 : 1.0

So NLTK classifier works as It has,


training_data = [
    ({'feature1': 'value1', 'feature2': 'value2'}, 'label1'),
    ({'feature1': 'value3', 'feature2': 'value4'}, 'label2'),
    # ... more training instances
]


Certainly! The training_data you provided is an example of how to structure labeled training instances for a text classification task. Each training instance consists of two main parts:

In array there is a tuple which contain dict.

Feature Dictionary: This is a Python dictionary that represents the features (attributes or characteristics) of a piece of text. The keys of the dictionary correspond to the features, and the values represent the values or attributes of those features. In your example, 'feature1' and 'feature2' are placeholders for actual features, and 'value1', 'value2', etc., are placeholders for the values associated with those features.

Label: This is the category or class that the text belongs to. It represents what you want the classifier to learn to predict. In your example, 'label1' and 'label2' are placeholders for the actual labels or categories that the text instances should be classified into.

But this is not the case with sklearn classifier, here we have X --> features and y ---> output label.


In [54]:
from nltk import DecisionTreeClassifier

In [55]:
dt = DecisionTreeClassifier.train(train_data)

In [56]:
nltk.classify.accuracy(dt, test_data)

0.6083333333333333

#### Let's use sklearn classifier

In [57]:
from sklearn.svm import SVC

# Using the below code we don't need to convert the (array which contain tuple--->dict) format code into X,y so 
# we are using this dummy classifier
from nltk.classify.scikitlearn import SklearnClassifier


In [58]:
svc = SVC()

In [59]:
clf_sklearn = SklearnClassifier(svc)

In [60]:
clf_sklearn.train(train_data)

<SklearnClassifier(SVC())>

In [61]:
nltk.classify.accuracy(clf_sklearn, test_data)

0.8583333333333333

###### Conclusion:

Naive Bayes Classifier (NLTK):

Accuracy: 0.811 (approximately)
Performance: The Naive Bayes classifier achieved an accuracy of around 81.1% on your dataset.

Support Vector Classifier (SVC) with SklearnClassifier (NLTK):

Accuracy: 0.858 (approximately)
Performance: The Support Vector Classifier (SVC) with the SklearnClassifier from NLTK achieved a higher accuracy of around 85.8% on your dataset.

Decision Tree Classifier (NLTK):

Accuracy: 0.608 (approximately)
Performance: The Decision Tree classifier achieved an accuracy of around 60.8% on your dataset.

###### Note:
The performance of these classifiers can vary depending on the specific characteristics of data and the preprocessing techniques that is been used.