# VK23-2 Qualification NLP Case 1

In [20]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import string

wnl = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer('english')
eng_stopwords = stopwords.words('english')

In [21]:
with open("pos.txt", "r", encoding="latin-1") as positive_file:
    positive = positive_file.read()

with open("neg.txt", "r", encoding="latin-1") as negative_file:
    negative = negative_file.read()

In [22]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v'
    elif treebank_tag.startswith('N'):
        return 'n'
    elif treebank_tag.startswith('R'):
        return 'r'
    else:
        return 'n'

### Pre Processing

In [23]:
list_words = word_tokenize(positive) + word_tokenize(negative)
print(list_words)
list_words = [word for word in list_words if word.lower() not in eng_stopwords]
print(list_words)
list_words = [word for word in list_words if word not in string.punctuation]
print(list_words)
list_words = [word for word in list_words if word.isalpha()]
print(list_words)

global tagged, ner 
tagged = pos_tag(list_words)
list_words = [wnl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
list_words = [snowball_stemmer.stem(word) for word in list_words]
fd = FreqDist(list_words)
list_words = [word for word in fd.most_common(1000)]
print(list_words)

[('starbuck', 944), ('coffe', 518), ('go', 467), ('get', 423), ('custom', 369), ('order', 364), ('drink', 345), ('time', 296), ('store', 270), ('say', 267), ('servic', 251), ('one', 229), ('make', 217), ('card', 209), ('like', 204), ('ask', 190), ('give', 182), ('would', 178), ('tell', 170), ('locat', 162), ('take', 155), ('day', 152), ('back', 147), ('employe', 138), ('alway', 136), ('cup', 135), ('manag', 134), ('use', 131), ('never', 126), ('year', 124), ('call', 124), ('even', 120), ('good', 113), ('want', 111), ('wait', 109), ('could', 107), ('know', 107), ('come', 106), ('work', 105), ('money', 103), ('place', 97), ('way', 94), ('peopl', 93), ('pay', 92), ('everi', 91), ('barista', 90), ('friend', 87), ('tri', 87), ('charg', 86), ('bad', 85), ('see', 85), ('buy', 85), ('purchas', 82), ('great', 79), ('receiv', 79), ('realli', 77), ('food', 77), ('experi', 76), ('well', 76), ('morn', 74), ('new', 74), ('rude', 74), ('look', 72), ('staff', 72), ('leav', 72), ('need', 71), ('think',

In [33]:
labeled_sentence = []
for sentence in positive.split("\n"):
    labeled_sentence.append((sentence, "pos"))
for sentence in negative.split("\n"):
    labeled_sentence.append((sentence, "neg"))

### Creating Dataset (Tokenize, Punctuation, Lowercase, Stemmer, Lemmatize)

In [34]:
dataset = []
for sent, label in labeled_sentence:
    dict = {"key":"value"}
    word = [word for word in word if word.lower() not in eng_stopwords]
    word = [word for word in word if word not in string.punctuation]
    word = [word for word in word if word.isalpha()]
    word = [snowball_stemmer.stem(word) for word in word]
    word = [wnl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tag(word)]
    for feature in list_words:
        key = feature
        value = feature in word
        dict[key] = value
    dataset.append((dict, label))
    
print(label)

 pos
** at the Starbucks by the fire station on 436 in Altamonte Springs, FL made my day and finally helped me figure out the way to make my drink so IÃ¢ÂÂd love it. She took time out to talk to me for 2 minutes to make my experience better than what IÃ¢ÂÂm used to. It was much appreciated! IÃ¢ÂÂve had bad experiences one after another at the Starbucks thatÃ¢ÂÂs closest to me in my work building with my drinks not being great along with not great customer service from specific baristas. Niko was refreshing to speak to and pleasant. The drink was perfect! StoreAmber and LaDonna at the Starbucks on Southwest Parkway are always so warm and welcoming. There is always a smile in their voice when they greet you at the drive-thru. And their customer service is always spot-on, they always get my order right and with a smile. I would actually give them more than 5 stars if they were available. 11956 pos
I just wanted to go out of my way to recognize a Starbucks employee Billy at the Fra

### Create Dataset Training and Test

In [26]:
import random
random.shuffle(dataset)
counter = int(len(dataset) * 0.7)
training_data = dataset[:counter]
testing_data = dataset[counter:]

### Look Accuracy

In [27]:
from nltk.classify import NaiveBayesClassifier,accuracy
classifier = NaiveBayesClassifier.train(training_data)
accuracy = accuracy(classifier, testing_data)
print(accuracy)

0.784037558685446


### Write Model

In [28]:
import pickle
file = open("model.pickle", "wb")
pickle.dump(classifier, file)
file.close()

### Open Model

In [29]:
from nltk.classify import NaiveBayesClassifier,accuracy
file = open("model.pickle", "rb")
classifier = pickle.load(file)
file.close()
accuracy = accuracy(classifier, testing_data)
print(accuracy)


0.784037558685446


In [37]:
print('Natural Language Processing')
print('1. Classification')
print('0. Exit')
command = input('Enter command: ')

if (command == '1'):
    review = input("Input Review : ")
    words = word_tokenize(review)
    result = classifier.classify(FreqDist(words))
    print(result)

Natural Language Processing
1. Classification
0. Exit
neg
