In [2]:
import nltk

In [3]:
import pandas as pd

dataset = pd.read_csv('dataset.csv')

dataset.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,negative
1,what did just say vote for modi welcome bjp t...,positive
2,asking his supporters prefix chowkidar their n...,positive
3,answer who among these the most powerful world...,positive
4,with upcoming election india saga going import...,positive


In [17]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import FreqDist
from random import shuffle
import pickle

stop_words = stopwords.words('english')
stemmer = PorterStemmer()
lem = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    tokens = [lem.lemmatize(token) for token in tokens]
    return tokens

all_words = []
for text in dataset["clean_text"]:
    # all_words.extend(preprocess_text(text))
    for word in preprocess_text(text):
        all_words.append(word)

fd = FreqDist(all_words)

featured_word = {word for word, count in fd.most_common(100)}

feature_sets = []

for text, category in zip(dataset["clean_text"], dataset["category"]):
    feature = {}

    processed_text = preprocess_text(text)
    for word in processed_text:
        feature[word] = word in featured_word

    feature_sets.append((feature, category))

# print(feature_sets)
shuffle(feature_sets)

split_index = int(len(feature_sets) * 0.8)
train_set = feature_sets[:split_index]
test_set = feature_sets[split_index:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

# accuracy = nltk.classify.accuracy(classifier, test_set)
accuracy = nltk.classify.util.accuracy(classifier, test_set)

print(accuracy)

classifier.show_most_informative_features(10)

file = open('model.pickle', 'wb')
pickle.dump(classifier, file)
file.close()

0.6894977168949772
Most Informative Features
                    poor = False          negati : positi =     13.4 : 1.0
                  better = True           positi : negati =     11.3 : 1.0
                    fail = False          negati : positi =     11.1 : 1.0
                    find = False          negati : positi =      9.6 : 1.0
                   idiot = False          negati : positi =      9.6 : 1.0
              propaganda = False          negati : positi =      9.6 : 1.0
                    hate = True           negati : positi =      9.2 : 1.0
                   first = True           positi : negati =      8.9 : 1.0
                  behind = False          negati : positi =      8.3 : 1.0
                    best = True           positi : negati =      7.4 : 1.0


In [18]:
import os

def load_model():
    if os.path.exists('./model.pickle'):
        file = open('model.pickle', 'rb')
        return pickle.load(file)
    else:
        return None 
    
model = load_model()

evaluated_sentence = "beter, good, and hate"

fd_evaluate = FreqDist(word_tokenize(evaluated_sentence))
category = model.classify(fd_evaluate)
print(category)

positive


In [19]:
# WordNet Synonym/Antonym Analysis
from nltk.corpus import wordnet as wn
import string

def wordnet_analyze(text):
    words = word_tokenize(text)

    words = [word for word in words if word not in string.punctuation and word.isalpha()]

    for word in words:
        synsets = wn.synsets(word)

        syn_list = []
        antonym_list = []

        for synset in synsets:
            for lemma in synset.lemmas():
                syn_list.append(lemma.name())
                for antonym in lemma.antonyms():
                    antonym_list.append(antonym.name())
    

        print(f"Word: {word}")
        print("---------------------------------------------")
        print("Synonym")
        if len(syn_list) == 0:
            print("No synonym")
        else:
            for syn in syn_list:
                print(f"(+){syn}")
        
        print("Antonym")
        if len(antonym_list) == 0:
            print("No antonym")
        else:
            for antonym in antonym_list:
                print(f"(-){antonym}")
        print("---------------------------------------------")


wordnet_analyze('This is an example text!')

Word: This
---------------------------------------------
Synonym
No synonym
Antonym
No antonym
---------------------------------------------
Word: is
---------------------------------------------
Synonym
(+)be
(+)be
(+)be
(+)exist
(+)be
(+)be
(+)equal
(+)be
(+)constitute
(+)represent
(+)make_up
(+)comprise
(+)be
(+)be
(+)follow
(+)embody
(+)be
(+)personify
(+)be
(+)be
(+)live
(+)be
(+)cost
(+)be
Antonym
(-)differ
---------------------------------------------
Word: an
---------------------------------------------
Synonym
(+)Associate_in_Nursing
(+)AN
Antonym
No antonym
---------------------------------------------
Word: example
---------------------------------------------
Synonym
(+)example
(+)illustration
(+)instance
(+)representative
(+)model
(+)example
(+)exemplar
(+)example
(+)model
(+)good_example
(+)example
(+)deterrent_example
(+)lesson
(+)object_lesson
(+)case
(+)instance
(+)example
(+)exercise
(+)example
Antonym
No antonym
---------------------------------------------
Word: te