In [1]:
import nltk
from nltk.corpus import movie_reviews
# Fetch the movie_reviews corpus into the local NLTK data directory; the
# recorded log shows NLTK reporting "already up-to-date" when it is present.
nltk.download('movie_reviews')
# NOTE(review): nothing in the visible cells uses the punkt resource — confirm
# it is still needed. Kept as the last bare expression because its return
# value is what the cell displays.
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to C:\Users\Do-
[nltk_data]     While\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Do-
[nltk_data]     While\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from sklearn.datasets import load_files

# Resolve the corpus folder through NLTK's own data search path instead of
# hard-coding one user's Windows profile directory, so the notebook runs on
# any machine where nltk.download('movie_reviews') has succeeded.
# nltk.data.find raises LookupError if the corpus is not installed.
moviedir = str(nltk.data.find('corpora/movie_reviews'))
# Load all review files; movie.data holds the raw texts and movie.target the
# integer class labels used by the classification cells.
movie = load_files(moviedir, shuffle=True)

### Create a bag of words representation of the corpus
- The default `ngram_range=(1, 1)` yields unigrams, so the matrix below is a unigram representation (with English stop words removed)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram term counts with scikit-learn's built-in English stop-word list removed.
# NOTE(review): this is the only vectorizer in the visible cells that passes
# stop_words, so its scores are not a like-for-like comparison with the
# bigram / TF-IDF representations — confirm that is intended.
vectorizer = CountVectorizer(stop_words="english")
# Learn the vocabulary from every review and build the document-term matrix.
X = vectorizer.fit_transform(movie.data)

### Create a bag-of-words representation using bigrams

In [4]:
# ngram_range=(2,2) sets both the minimum and maximum n-gram length to 2,
# so the vocabulary consists of bigrams only.
vectorizer2 = CountVectorizer(ngram_range=(2,2))
# Learn the bigram vocabulary from every review and count occurrences.
X2 = vectorizer2.fit_transform(movie.data)
# Preview only. NOTE(review): toarray() builds a dense copy of the whole
# document-by-bigram matrix just to print a truncated view; this is likely
# very memory-hungry — consider previewing X2.shape or a small slice instead.
X2.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Create a tf-idf representation
- Again, default ngram is a unigram

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting with the vectorizer's default settings (unigrams).
vectorizerT = TfidfVectorizer()
# NOTE(review): the vocabulary and IDF weights are fitted on all of movie.data,
# i.e. including the documents that later land in the test split — confirm
# this is acceptable for the comparison being made.
X3 = vectorizerT.fit_transform(movie.data)
# Preview only. NOTE(review): toarray() densifies the full matrix just to
# print a truncated view; a shape or small-slice preview would be cheaper.
X3.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.06688897, 0.        , ..., 0.        , 0.        ,
        0.        ]])

### Now with Bi-gram representation

In [6]:
# TF-IDF weighting over bigrams only (minimum and maximum n-gram length are both 2).
vectorizerT2 = TfidfVectorizer(ngram_range=(2,2))
# NOTE(review): fitted on all of movie.data, including documents that are
# later placed in the test split.
X4 = vectorizerT2.fit_transform(movie.data)
# Preview only. NOTE(review): toarray() builds a dense float copy of the whole
# document-by-bigram matrix just to print a truncated view; this is likely
# very memory-hungry — consider previewing X4.shape or a small slice instead.
X4.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## 3. Use Bayesian Classification to classify as either positive or negative review

### For bag of words unigram representation

In [7]:
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the reviews for evaluation; the fixed random_state makes the
# partition repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, movie.target, test_size=0.30, random_state=12
)

# Multinomial Naive Bayes on the unigram count matrix. fit() returns the
# estimator itself, so construction and training are chained.
clf_uni = MultinomialNB().fit(X_train, y_train)
y_predict = clf_uni.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.815


### For bag of words bigram representation

In [8]:
# Split the bag-of-words bigram matrix with the same test_size and
# random_state as the other experiments so the scores are comparable.
# The previous `del X_train, ...` was dropped: the names are rebound right
# here, and deleting them first made this cell raise NameError unless an
# earlier experiment cell had already defined the split variables.
X_train, X_test, y_train, y_test = train_test_split(
    X2, movie.target, test_size=0.30, random_state=12
)

# Multinomial Naive Bayes on raw bigram counts.
clf_bi = MultinomialNB()
clf_bi.fit(X_train, y_train)
y_predict = clf_bi.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.8433333333333334


### For TF-IDF unigram representation

In [9]:
# Split the TF-IDF unigram matrix with the same test_size and random_state as
# the other experiments so the scores are comparable.
# The previous `del X_train, ...` was dropped: the names are rebound right
# here, and deleting them first made this cell raise NameError unless an
# earlier experiment cell had already defined the split variables.
X_train, X_test, y_train, y_test = train_test_split(
    X3, movie.target, test_size=0.30, random_state=12
)

# Multinomial Naive Bayes on TF-IDF unigram features. The model gets its own
# name: it was previously bound to `clf_bi`, which mislabelled a unigram model
# and silently overwrote the bag-of-words bigram classifier.
clf_tfidf_uni = MultinomialNB()
clf_tfidf_uni.fit(X_train, y_train)
y_predict = clf_tfidf_uni.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.8066666666666666


### For TF-IDF bigram representation

In [10]:
# Split the TF-IDF bigram matrix with the same test_size and random_state as
# the other experiments so the scores are comparable.
# The previous `del X_train, ...` was dropped: the names are rebound right
# here, and deleting them first made this cell raise NameError unless an
# earlier experiment cell had already defined the split variables.
X_train, X_test, y_train, y_test = train_test_split(
    X4, movie.target, test_size=0.30, random_state=12
)

# Multinomial Naive Bayes on TF-IDF bigram features. A dedicated name avoids
# rebinding `clf_bi`, which the bag-of-words bigram experiment already uses.
clf_tfidf_bi = MultinomialNB()
clf_tfidf_bi.fit(X_train, y_train)
y_predict = clf_tfidf_bi.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.83


## Step 4: Use an SVM with a linear kernel to classify
### First bag of words unigram representation

In [11]:
from sklearn import svm

# Split the unigram count matrix with the same test_size and random_state as
# the Naive Bayes experiments so the scores are comparable.
# The previous `del X_train, ...` was dropped: the names are rebound right
# here, and deleting them first made this cell raise NameError unless an
# earlier experiment cell had already defined the split variables.
X_train, X_test, y_train, y_test = train_test_split(
    X, movie.target, test_size=0.30, random_state=12
)

# Support vector classifier with a linear kernel on unigram counts.
clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.8283333333333334


### Bag of words bigram representation

In [12]:
# Split the bigram count matrix with the same test_size and random_state as
# the other experiments so the scores are comparable.
# The previous `del X_train, ..., clf` was dropped: every name is rebound
# below, and deleting them first made this cell raise NameError unless the
# preceding SVM cell had just been run.
X_train, X_test, y_train, y_test = train_test_split(
    X2, movie.target, test_size=0.30, random_state=12
)

# Support vector classifier with a linear kernel on bigram counts.
clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.8016666666666666


### TF-IDF Unigram

In [13]:
# Split the TF-IDF unigram matrix with the same test_size and random_state as
# the other experiments so the scores are comparable.
# The previous `del X_train, ..., clf` was dropped: every name is rebound
# below, and deleting them first made this cell raise NameError unless the
# preceding SVM cell had just been run.
X_train, X_test, y_train, y_test = train_test_split(
    X3, movie.target, test_size=0.30, random_state=12
)

# Support vector classifier with a linear kernel on TF-IDF unigram features.
clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.8433333333333334


### TF-IDF Bigram

In [14]:
# Split the TF-IDF bigram matrix with the same test_size and random_state as
# the other experiments so the scores are comparable.
# The previous `del X_train, ..., clf` was dropped: every name is rebound
# below, and deleting them first made this cell raise NameError unless the
# preceding SVM cell had just been run.
X_train, X_test, y_train, y_test = train_test_split(
    X4, movie.target, test_size=0.30, random_state=12
)

# Support vector classifier with a linear kernel on TF-IDF bigram features.
clf = svm.SVC(kernel="linear")
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)

# Fraction of held-out reviews whose predicted label matches the true label.
print("Accuracy of model: ", accuracy_score(y_test, y_predict))

Accuracy of model:  0.8433333333333334
