# NLTK; Sentiment Analysis; RegexpTokenizer; Multinomial Naive Bayes

In [1]:
from nltk.tokenize import RegexpTokenizer  # regular-expression tokenizer: splits text into tokens and, with a suitable pattern, drops symbols
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
import nltk
# Make sure the NLTK 'stopwords' corpus is available locally; it is read
# later by stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fross\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Toy training corpus: seven short movie reviews, positive ones first.
X_train = [
    "This %%*was awesome an awesome movie",
    "Great movie! I liked it a lot",
    "Happy Ending! awesome acting by the hero",
    "loved it! truly great",
    "bad not upto the mark",
    "could have been better",
    "Surrely a Disappointing movie",
]

# One label per training review, in the same order:
# 1 for the four positive reviews, 0 for the three negative ones.
y_train = [1] * 4 + [0] * 3

# Unlabelled reviews to classify once the model is trained.
X_test = [
    "I was happy & happy and I loved the acting in the movie",
    "The movie I saw was bad",
]

### data cleaning; RegexpTokenizer

In [4]:
# r'\w+' matches each run of word characters (letters, digits, underscore),
# so every such run becomes one token and punctuation/symbols are dropped.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# English stopword list held as a set for fast membership tests.
# NOTE(review): the list evidently contains "not" ("bad not upto the mark"
# is cleaned to 'bad upto mark'), so negation is discarded during cleaning.
en_stopwords = {*stopwords.words('english')}

# Porter stemmer used to reduce each remaining token to its stem.
ps = PorterStemmer()

In [5]:
def get_cleaned_data(text):
    """Normalise one raw text into a space-separated string of stems.

    The text is lowercased, split into tokens with the module-level
    ``tokenizer``, filtered against ``en_stopwords`` and stemmed with ``ps``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The surviving stems joined by single spaces; an empty string when
        no token survives the stopword filter.
    """
    words = tokenizer.tokenize(text.lower())
    # Drop stopwords first, then stem only what is left.
    stems = (ps.stem(word) for word in words if word not in en_stopwords)
    # Back from a token sequence to a single string.
    return ' '.join(stems)

In [6]:
# Run every training and test text through the same cleaning function.
X_train_cleaned = list(map(get_cleaned_data, X_train))
X_test_cleaned = list(map(get_cleaned_data, X_test))

In [7]:
# Inspect the cleaned training texts (lowercased, stopwords removed, stemmed).
X_train_cleaned

['awesom awesom movi',
 'great movi like lot',
 'happi end awesom act hero',
 'love truli great',
 'bad upto mark',
 'could better',
 'surr disappoint movi']

In [8]:
# Inspect the cleaned test texts produced by the same cleaning function.
X_test_cleaned

['happi happi love act movi', 'movi saw bad']

### vectorization; ngram use

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv = CountVectorizer(ngram_range=(1, 2))  # ngram_range=(1, 2): count single tokens and pairs of adjacent tokens

In [11]:
# vectorization of X_train: fit_transform learns the vocabulary from the
# cleaned training texts and returns their count matrix in one call
X_train_cleaned_vectorized = cv.fit_transform(X_train_cleaned)  # result is a sparse matrix; append .toarray() to view it densely
X_train_cleaned_vectorized

<7x34 sparse matrix of type '<class 'numpy.int64'>'
	with 38 stored elements in Compressed Sparse Row format>

In [12]:
# List the vocabulary learned from the training texts (one entry per matrix column).
cv.get_feature_names_out()

array(['act', 'act hero', 'awesom', 'awesom act', 'awesom awesom',
       'awesom movi', 'bad', 'bad upto', 'better', 'could',
       'could better', 'disappoint', 'disappoint movi', 'end',
       'end awesom', 'great', 'great movi', 'happi', 'happi end', 'hero',
       'like', 'like lot', 'lot', 'love', 'love truli', 'mark', 'movi',
       'movi like', 'surr', 'surr disappoint', 'truli', 'truli great',
       'upto', 'upto mark'], dtype=object)

In [13]:
# vectorization of X_test: transform only (the vectorizer is NOT refit), so
# the test texts are encoded with the vocabulary learned from the training texts
X_test_cleaned_vectorized = cv.transform(X_test_cleaned)  # result is a sparse matrix; append .toarray() to view it densely
X_test_cleaned_vectorized

<2x34 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

### Multinomial Naive Bayes as our prediction model

Multinomial Naive Bayes explained - https://www.youtube.com/watch?v=O2L2Uv9pdDA

In [14]:
from sklearn.naive_bayes import MultinomialNB

In [15]:
# Multinomial Naive Bayes classifier, created with its default settings.
mnb = MultinomialNB()

In [16]:
mnb.fit(X_train_cleaned_vectorized, y_train)  # fit() accepts the sparse count matrix as-is; no .toarray() conversion is needed

### prediction on train data

In [17]:
# Sanity check: predict on the data the model was fitted on and compare with y_train.
mnb.predict(X_train_cleaned_vectorized)

array([1, 1, 1, 1, 0, 0, 0])

### prediction on test data

In [18]:
# Predicted labels for the test reviews, using the same 1/0 encoding as y_train.
mnb.predict(X_test_cleaned_vectorized)

array([1, 0])

### checking predictions

In [19]:
# Show the raw test reviews so they can be compared with the predicted labels.
X_test

['I was happy & happy and I loved the acting in the movie',
 'The movie I saw was bad']