In [0]:
# Importing the necessary packages

import nltk
from nltk.corpus import movie_reviews

In [0]:
# Let us explore 'movie_reviews'

In [0]:
# Fetch only the 'movie_reviews' corpus. A bare nltk.download() opens the
# interactive downloader, which stalls a non-interactive "Run All".
nltk.download('movie_reviews')

In [13]:
# The word tokens of 'movie_reviews' (punctuation marks are tokens too);
# the notebook shows only a truncated preview of the sequence

movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [0]:
# Total number of tokens in 'movie_reviews' (displayed as the cell result)

len(movie_reviews.words())

1583820

In [0]:
# Category labels under which the reviews are filed
movie_reviews.categories()

['neg', 'pos']

In [0]:
# Frequency distribution (token -> count) over every token in 'movie_reviews'.
# The distribution is rebuilt from the whole corpus each time this expression is evaluated.

nltk.FreqDist(movie_reviews.words())

In [0]:
# Number of occurrences of the word 'happy' (looked up in a freshly built distribution)

nltk.FreqDist(movie_reviews.words())['happy']

215

In [0]:
# The 15 most frequent tokens in 'movie_reviews' with their counts, most frequent first

nltk.FreqDist(movie_reviews.words()).most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

In [0]:
# Only the last expression of a cell is displayed, so every listing is printed
# explicitly. Each line shows the selection, how many file ids it holds, and a
# short preview instead of the complete list of ids.

for selection, file_ids in (
    ('all', movie_reviews.fileids()),       # all file ids
    ('pos', movie_reviews.fileids('pos')),  # file ids of positive reviews
    ('neg', movie_reviews.fileids('neg')),  # file ids of negative reviews
):
  print(selection, len(file_ids), file_ids[:5])

In [0]:
# Displays the words of the review with file id 'neg/cv947_11316.txt'

movie_reviews.words('neg/cv947_11316.txt')

['what', 'were', 'they', 'thinking', '?', 'nostalgia', ...]

In [0]:
# Now that we have explored the data set, let us get into the project

In [0]:
# all_words is the frequency distribution of 'movie_reviews': it maps each token to its count

all_words = nltk.FreqDist(movie_reviews.words())

In [0]:
# Number of distinct tokens in 'movie_reviews'
len(all_words)

39768

In [0]:
# Defining the feature_vector: the 4000 most frequent tokens.
# Iterating a FreqDist directly follows first-seen order in NLTK 3, so slicing
# list(all_words) would pick whichever tokens happen to appear first in the
# corpus; most_common() ranks them by count instead.

feature_vector = [word for word, _count in all_words.most_common(4000)]

In [0]:
# Let us try to manually analyze the sentiment of one movie review.

In [0]:
# Initialization

feature = {}

# One movie review is chosen

review = movie_reviews.words('neg/cv954_19932.txt')

# Collect the distinct words of the review once. Testing membership against
# the word sequence itself walks the sequence again for every feature word.

review_words = set(review)

# 'True' is assigned if word in feature_vector can also be found in review. Otherwise 'False'

for word in feature_vector:
  feature[word] = word in review_words

# The words which are assigned 'True' are displayed

[word for word in feature_vector if feature[word]]

In [0]:
# Now let us try to analyze the sentiments of movie reviews using Machine Learning.

In [0]:
# Document is a list of (words of review, category of review)

document = [
    (movie_reviews.words(file_id), category)
    for file_id in movie_reviews.fileids()
    for category in movie_reviews.categories(file_id)
]

# Preview the first few entries instead of echoing the whole list

document[:5]

In [18]:
# Number of (words, category) pairs in 'document'
len(document)

2000

In [0]:
# we define a function that finds the features

def find_feature(word_list):
  """Build the presence/absence feature dictionary for one review.

  Args:
    word_list: iterable of the (hashable) word tokens of a review.

  Returns:
    dict mapping every word of the module-level ``feature_vector`` to
    ``True`` if that word occurs in ``word_list`` and to ``False`` otherwise.
  """

  # Collapse the review into a set once: each lookup is then a hash probe
  # rather than a scan of the whole review for every feature word. It also
  # makes one-shot iterables safe, since they are consumed exactly once.
  words_present = set(word_list)

  return {word: word in words_present for word in feature_vector}

In [0]:
# Checking the function 'find_feature' on the first review. Only the first few
# (word, flag) pairs are shown, because the full dictionary holds one entry
# for every word in feature_vector.

list(find_feature(document[0][0]).items())[:10]

In [0]:
# Pair the feature dictionary of every review in 'document' with its category

feature_sets = [
    (find_feature(review_tokens), label)
    for review_tokens, label in document
]

In [0]:
# Number of (feature dictionary, category) pairs in 'feature_sets'
len(feature_sets)

2000

In [0]:
# The necessary packages and classifiers are imported

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn import model_selection

In [0]:
# Splitting into training and testing sets (25% held out for testing).
# A fixed random_state makes the shuffle, and therefore the accuracy measured
# on the test set, repeatable from one run to the next.

train_set,test_set = model_selection.train_test_split(feature_sets, test_size = 0.25, random_state = 42)

In [0]:
# Sizes of the training and testing sets, one per line
for subset in (train_set, test_set):
  print(len(subset))

In [0]:
# Fit a linear-kernel SVC, wrapped for use with NLTK, on the training data

svc_estimator = SVC(kernel = 'linear')
model = SklearnClassifier(svc_estimator)
model.train(train_set)

<SklearnClassifier(SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False))>

In [0]:
# Score the trained model on the testing data and report the accuracy

accuracy = nltk.classify.accuracy(model, test_set)
report = 'SVC Accuracy : {}'.format(accuracy)

print(report)