# The purpose of this notebook is to experiment with a data-processing workflow for an NLP task (sentiment classification of IMDB reviews) using the nltk package.

In [24]:
from torchtext import datasets
import pandas as pd
import sklearn.feature_extraction.text as sktext
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [9]:
# Load the training split of the IMDB dataset through torchtext.
# The preprocessing loop further down iterates over it as (label, review text) pairs.
imdb = datasets.IMDB(split='train')

In [12]:
# Patterns are compiled once and shared by every tokenize() call.
_HTML_TAG_RE = re.compile(r'<.*?>')
# A kept word consists only of lowercase letters/apostrophes and has at least one letter.
_WORD_RE = re.compile(r"[a-z']*[a-z][a-z']*")
# Lazily-filled cache so the stop-word list and lemmatizer are built once,
# not once per review.
_TOKENIZE_CACHE = {}


def _tokenize_resources():
    """Return the shared (lemmatizer, stop-word set) pair, creating it on first use."""
    if not _TOKENIZE_CACHE:
        # Build both before publishing so a failure never leaves a half-filled cache.
        resources = {
            'lemmatizer': WordNetLemmatizer(),
            'stop_words': frozenset(stopwords.words('english')),
        }
        _TOKENIZE_CACHE.update(resources)
    return _TOKENIZE_CACHE['lemmatizer'], _TOKENIZE_CACHE['stop_words']


def tokenize(text):
    """Tokenize and lemmatize ``text`` using POS tags, dropping stop words.

    Steps:
      1. lowercase the text and replace HTML-like tags with a space;
      2. split into tokens with ``word_tokenize`` and tag them with ``pos_tag``;
      3. drop English stop words, single-character tokens and any token that
         is not made purely of ``a-z`` letters and apostrophes (or that has no
         letter at all, such as the ``''`` quote token);
      4. lemmatize what is left as a noun (``NN*`` tags), a verb (``VB*`` tags)
         or, for every other tag, an adjective.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    lemmatizer, stop_words = _tokenize_resources()

    # Substitute a space (not an empty string) for each tag: deleting the tag
    # outright glues its neighbours together ("great.<br />the" -> "great.the"),
    # and the glued token is then discarded by the letter filter below.
    text = _HTML_TAG_RE.sub(' ', text.lower())

    lemmas = []
    for word, tag in pos_tag(word_tokenize(text)):
        # The stop-word test is applied to the surface form, before lemmatizing.
        if word in stop_words or len(word) < 2:
            continue
        if not _WORD_RE.fullmatch(word):
            continue

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmas.append(lemmatizer.lemmatize(word, pos))
    return lemmas


In [13]:
# Single pass over the dataset: clean each review with tokenize() and keep it
# next to its sentiment label.
pairs = [(label, ' '.join(tokenize(line))) for label, line in imdb]

# Unzip into the two parallel lists used to build the frame.
labels = [label for label, _ in pairs]
lines = [cleaned for _, cleaned in pairs]

# One row per review: the space-joined lemmas and the review's label.
df = pd.DataFrame({'text': lines, 'label': labels})


In [14]:

# The full data set is too large for quick experiments, so work on a random subsample.
# A fixed seed makes the subsample (and every number derived from it) reproducible
# on a fresh kernel; min() keeps the cell from raising when the frame holds fewer
# rows than the requested sample size.
SAMPLE_SIZE = 8000
RANDOM_SEED = 42
df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=RANDOM_SEED)
df.head()

Unnamed: 0,text,label
3596,one would expect movie famous comedian lead ro...,neg
13067,excellent film deal life old man look back yea...,pos
288,watch glimcher mystery many ridiculous plot tw...,neg
12546,watch mccoys reunion glad see richard crenna k...,pos
8495,'s write poster birth give year live take jour...,neg


In [15]:
# Check whether the data is balanced: count the rows available for each label.
# Similar counts per label mean neither class dominates the sample.


df.groupby('label').count()



Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
neg,4024
pos,3976


In [20]:
# Bag-of-words features: the vocabulary is capped at 3000 terms via max_features.
vectorizer = sktext.CountVectorizer(max_features=3000)
counts = vectorizer.fit_transform(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense, labelled copy of the count matrix for inspection; the classifier
# below is trained on the sparse `counts` matrix, not on this frame.
count_df = pd.DataFrame(counts.toarray(), columns=feature_names)


In [21]:
# Multinomial naive Bayes trained on the term-count matrix with the review labels as targets.
nb_classifier = MultinomialNB()
# fit() is left as the last expression so the cell displays the fitted estimator.
nb_classifier.fit(counts, df['label'])

MultinomialNB()

In [25]:
# NOTE: predictions are made on the same count matrix the classifier was fitted
# on, so this report describes fit to the training sample rather than
# performance on unseen reviews.
target_names = ['neg', 'pos']
Pred = nb_classifier.predict(counts)
print(classification_report(y_true=df['label'], y_pred=Pred, target_names=target_names))


              precision    recall  f1-score   support

         neg       0.86      0.87      0.86      4024
         pos       0.87      0.85      0.86      3976

    accuracy                           0.86      8000
   macro avg       0.86      0.86      0.86      8000
weighted avg       0.86      0.86      0.86      8000



In [36]:
def predict_sentiment(text):
    """Predict the sentiment label of one raw piece of text.

    The text goes through the same pipeline as the training data: it is
    cleaned with ``tokenize``, re-joined with spaces, turned into term counts
    by the already-fitted ``vectorizer`` and classified by ``nb_classifier``.

    Returns whatever ``nb_classifier.predict`` returns for the single
    document (the recorded run shows a one-element array such as ``['neg']``).
    """
    cleaned = ' '.join(tokenize(text))
    # transform() expects a collection of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    return nb_classifier.predict(features)

# Smoke test on a clearly negative sentence.
# Another example to try: predict_sentiment('I love this movie')
predict_sentiment("It was the worst movie I've ever seen")


array(['neg'], dtype='<U3')