# VK23-2 Qualification NLP Case 1

In [43]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, gutenberg
from nltk.probability import FreqDist
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import string

wnl = WordNetLemmatizer()
snowball_stemmer = SnowballStemmer('english')
eng_stopwords = stopwords.words('english')

In [32]:
with open("pos.txt", "r", encoding="latin-1") as positive_file:
    positive = positive_file.read()

with open("neg.txt", "r", encoding="latin-1") as negative_file:
    negative = negative_file.read()

In [33]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'
    elif treebank_tag.startswith('V'):
        return 'v'
    elif treebank_tag.startswith('N'):
        return 'n'
    elif treebank_tag.startswith('R'):
        return 'r'
    else:
        return 'n'

### Pre Processing

In [34]:
list_words = word_tokenize(positive) + word_tokenize(negative)
list_words = [word for word in list_words if word.lower() not in eng_stopwords]
list_words = [word for word in list_words if word not in string.punctuation]
list_words = [word for word in list_words if word.isalpha()]

tagged = pos_tag(list_words)
ner = ne_chunk(tagged)

list_words = [wnl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in tagged]
list_words = [snowball_stemmer.stem(word) for word in list_words]
fd = FreqDist(list_words)
list_words = [word for word, count in fd.most_common(100)]

In [35]:
labeled_sentence = []
for sentence in positive.split("\n"):
    labeled_sentence.append((sentence, "Not Spam"))
for sentence in negative.split("\n"):
    labeled_sentence.append((sentence, "Spam"))

In [36]:
dataset = []

for sent, label in labeled_sentence:
    dict = {"key":"value"}
    word = word_tokenize(sent)
    for feature in list_words:
        key = feature
        value = feature in word
        dict[key] = value
    dataset.append((dict, label))

### Create Dataset Training and Test

In [37]:
import random
random.shuffle(dataset)
counter = int(len(dataset) * 0.7)
training_data = dataset[:counter]
testing_data = dataset[counter:]

In [38]:
from nltk.classify import NaiveBayesClassifier,accuracy
classifier = NaiveBayesClassifier.train(training_data)
accuracy = accuracy(classifier, testing_data)
print(accuracy * 100, '%')

88.23529411764706 %


### Write Model

In [39]:
import pickle
file = open("model.pickle", "wb")
pickle.dump(classifier, file)
file.close()

### Open Model

In [40]:
from nltk.classify import NaiveBayesClassifier,accuracy
file = open("model.pickle", "rb")
classifier = pickle.load(file)
file.close()
accuracy = accuracy(classifier, testing_data)
print(accuracy)

0.8823529411764706


### Corpora

In [50]:
def corpora():
    words = gutenberg.raw('milton-paradise.txt').split()

    i = 0
    for word in words:
        i += 1
        synsets = wordnet.synsets(word)
        for synset in synsets:
            print(f"{synset}: {synset.definition()}")
            for lemma in synset.lemmas():
                print(f"Synonym: {lemma.name()}")
                for antonym in lemma.antonyms():
                    print(f"Antonym: {antonym.name()}")
            print()

In [52]:
print('Natural Language Processing')
print('1. Classification')
print('2. View Model NER')
print('3. Corpora')
print('4. Most Informative Features')
print('0. Exit')
command = input('Enter command: ')

if (command == '1'):
    review = input("Input Review : ")
    words = word_tokenize(review)
    classifier = pickle.load(open("model.pickle", "rb"))
    result = classifier.classify(FreqDist(words))
    print(result)
elif (command == '2'):
    print(ner)
elif (command == '3'):
    corpora()
elif (command == '4'):
    classifier = pickle.load(open("model.pickle", "rb"))
    print(classifier.show_most_informative_features(10))

Natural Language Processing
1. Classification
2. View Model NER
3. Corpora
4. Most Informative Features
0. Exit
Most Informative Features
                     txt = True             Spam : Not Sp =     39.3 : 1.0
                    cash = True             Spam : Not Sp =     12.9 : 1.0
                    stop = True             Spam : Not Sp =     11.5 : 1.0
                  number = True             Spam : Not Sp =     10.7 : 1.0
                    free = True             Spam : Not Sp =     10.1 : 1.0
                    text = True             Spam : Not Sp =      8.5 : 1.0
                    call = True             Spam : Not Sp =      7.8 : 1.0
                  friend = True             Spam : Not Sp =      4.4 : 1.0
                     min = True             Spam : Not Sp =      3.8 : 1.0
                     pls = True             Spam : Not Sp =      3.1 : 1.0
None
