In [1]:
from nltk.corpus import wordnet, stopwords
from nltk.tokenize import word_tokenize
import string
from nltk.stem import SnowballStemmer,WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.classify import NaiveBayesClassifier, accuracy
import pandas as pd
import random
import pickle
from nltk.probability import FreqDist

Loading Dataset

In [2]:
def loadDataset(path="IMDB Dataset.csv", n=100, random_state=None):
    """Load a random sample of labelled reviews from a CSV file.

    Parameters
    ----------
    path : str
        CSV file to read. It must provide a 'review' and a 'sentiment' column.
    n : int or None
        Maximum number of rows to sample. The value is capped at the number
        of rows in the file; ``None`` keeps every row (still shuffled).
    random_state : int or None
        Seed forwarded to ``DataFrame.sample``. Pass an integer to get the
        same sample on every run; ``None`` keeps the sample random.

    Returns
    -------
    tuple of (list, list)
        ``(review_list, sentiment_list)``; element i of both lists comes
        from the same row.
    """
    dataset = pd.read_csv(path)
    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (without replacement), so never request more than the file holds.
    sample_size = len(dataset) if n is None else min(n, len(dataset))
    sampled = dataset.sample(n=sample_size, random_state=random_state)
    review_list = sampled['review'].to_list()
    sentiment_list = sampled['sentiment'].to_list()
    return review_list,sentiment_list

Helpers for Preprocessing

In [3]:
#Tools for preprocessing: an English Snowball stemmer and a WordNet lemmatizer
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

#Lists of elements that are going to be removed from the data:
#the punctuation characters and the English stop words
punctuation = string.punctuation
eng_stop_words = stopwords.words('english')

In [4]:
#Function for preprocessing

def removestopwords(word_list):
    """Lower-case every token and drop the ones that are English stop words.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The lower-cased tokens that are not in ``eng_stop_words``, in their
        original order.
    """
    # eng_stop_words is a list; building a set once per call turns every
    # membership test into a hash lookup instead of a scan of the whole list.
    stop_words = set(eng_stop_words)
    # Lower each token once and reuse it for both the test and the result.
    lowered = (word.lower() for word in word_list)
    return [word for word in lowered if word not in stop_words]

def removesymbol(word_list):
    """Drop the tokens that consist only of punctuation characters.

    A plain ``word not in punctuation`` is a substring test against the
    punctuation string, so it only catches single characters (and the few
    runs that happen to appear contiguously in that string). Tokens such as
    '...' or '--' would be kept. Checking every character removes those too,
    while words that merely contain punctuation ("don't", "co-op") stay.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens containing at least one non-punctuation character, in
        their original order. Empty strings are dropped, as before.
    """
    return [
        word for word in word_list
        if not all(char in punctuation for char in word)
    ]

def removenumber(word_list):
    """Return the tokens of ``word_list`` that are not numeric.

    A token is dropped when its ``isnumeric()`` method returns True; the
    remaining tokens keep their original order.
    """
    kept = []
    for token in word_list:
        if token.isnumeric():
            continue
        kept.append(token)
    return kept

def stemmingword(word_list):
    """Return the stem of every token, computed with the module-level ``stemmer``."""
    return list(map(stemmer.stem, word_list))

def get_tag(label):
    """Map a part-of-speech tag to the POS letter passed to the lemmatizer.

    ``pos_tag`` produces upper-case tags, often with a suffix (NNP, VBD,
    JJR, ...). Comparing against the lower-case strings 'jj', 'nn', 'rb'
    and 'vb' therefore never matched, and every word was lemmatized without
    a POS. The tag is now matched case-insensitively by prefix.

    Parameters
    ----------
    label : str or None
        Part-of-speech tag such as 'JJ', 'NNS', 'VBD' or 'RB'.

    Returns
    -------
    str or None
        'a' for adjective tags (JJ*), 'n' for nouns (NN*), 'r' for adverbs
        (RB*), 'v' for verbs (VB*); ``None`` for anything else.
    """
    if not isinstance(label, str):
        return None
    tag = label.upper()
    # Adjectives are the one case where the letter differs from the tag's
    # first character.
    if tag.startswith('JJ'):
        return 'a'
    for prefix in ('NN', 'RB', 'VB'):
        if tag.startswith(prefix):
            return prefix[0].lower()
    return None

def lemmatizingword(word_list):
    """Lemmatize every token, using its part-of-speech tag when one is mapped.

    The whole list is tagged with ``pos_tag``; each tag is converted with
    ``get_tag``. When that yields a POS letter it is passed to the
    lemmatizer, otherwise the lemmatizer is called with its default POS.

    Returns a list of lemmas in the same order as ``word_list``.
    """
    lemmas = []
    for token, pos in pos_tag(word_list):
        wordnet_pos = get_tag(pos)
        if wordnet_pos is None:
            lemma = lemmatizer.lemmatize(token)
        else:
            lemma = lemmatizer.lemmatize(token, wordnet_pos)
        lemmas.append(lemma)
    return lemmas

In [5]:
#Tokenize and clean the reviews, then build the labelled feature set
def _clean_tokens(tokens):
    """Apply the shared cleaning steps: stop words, punctuation, numbers, lemmatization."""
    tokens = removestopwords(tokens)
    tokens = removesymbol(tokens)
    tokens = removenumber(tokens)
    # Stemming (stemmingword) is deliberately not applied; lemmatization is used instead.
    return lemmatizingword(tokens)


def dataPreprocessing(review_list,sentiment_list):
    """Turn raw reviews and their labels into a labelled feature set.

    Parameters
    ----------
    review_list : list of str
        Raw review texts.
    sentiment_list : list
        Label of each review, index-aligned with ``review_list``.

    Returns
    -------
    list of (dict, label)
        One ``(feature, label)`` pair per review. ``feature`` maps each
        cleaned token of the review to whether it occurs in the vocabulary
        built from all reviews.
    """
    # Tokenize each review once and reuse the tokens for both passes.
    tokenized_reviews = [word_tokenize(sentence) for sentence in review_list]

    # Corpus-wide vocabulary: all tokens are lower-cased, concatenated and
    # cleaned as a single list.
    corpus_tokens = []
    for tokens in tokenized_reviews:
        corpus_tokens.extend(token.lower() for token in tokens)
    # A set gives constant-time membership tests; looking every feature up in
    # the full token list made this function quadratic in the corpus size.
    vocabulary = set(_clean_tokens(corpus_tokens))

    feature_set = []
    for tokens, label in zip(tokenized_reviews, sentiment_list):
        # NOTE(review): the keys come from the same reviews that produced the
        # vocabulary, so most values are expected to be True -- confirm
        # whether features were meant to iterate over the vocabulary instead.
        feature = {word: (word in vocabulary) for word in _clean_tokens(tokens)}
        feature_set.append((feature,label))
    return feature_set

Training the model on the data

In [6]:
def buildingModel():
    """Train a Naive Bayes classifier on the dataset, save it, and return it.

    The feature set is shuffled and split 80/20 into training and test
    data. The trained classifier is pickled to ``model.pickle`` in the
    current working directory, its five most informative features and its
    accuracy on the test split are printed, and the function then waits for
    the user to press enter.

    Returns
    -------
    NaiveBayesClassifier
        The trained classifier.
    """
    review_list, sentiment_list = loadDataset()
    feature_set = dataPreprocessing(review_list,sentiment_list)
    random.shuffle(feature_set)
    training_count = int(len(feature_set)*0.8)
    train_data = feature_set[:training_count]
    test_data = feature_set[training_count:]

    classifer = NaiveBayesClassifier.train(train_data)

    # The context manager closes the file even if pickling fails.
    with open("model.pickle","wb") as file:
        pickle.dump(classifer,file)

    # show_most_informative_features prints its table itself and returns
    # None; wrapping it in print() emitted an extra "None" line.
    classifer.show_most_informative_features(n=5)
    # This score is computed on the held-out test split, so label it as such.
    print("Test Accuracy:",accuracy(classifer,test_data))

    print("Training Model Complete...")
    input("Press enter to continue...")
    return classifer

Load an existing model from the working directory, or train a new one

In [7]:
def findModel():
    """Return the classifier stored in ``model.pickle``, training one if loading fails.

    The file is looked up in the current working directory. If it is
    missing, or cannot be opened or unpickled, ``buildingModel()`` is called
    to train (and save) a fresh classifier.

    Returns
    -------
    object
        The unpickled classifier, or the one returned by ``buildingModel()``.
    """
    # SECURITY: pickle.load can execute arbitrary code from the file. Only
    # keep a model.pickle that this notebook produced itself.
    try:
        # The context manager closes the file even when unpickling fails.
        with open("model.pickle","rb") as file:
            classifier = pickle.load(file)
    except Exception:
        # Any failed load falls back to retraining, as before. Unlike a bare
        # ``except:``, KeyboardInterrupt and SystemExit still propagate, so
        # interrupting the load no longer triggers a full retraining.
        classifier = buildingModel()
    return classifier

In [8]:
def writeTweet():
    """Prompt until the user enters a tweet of at least five words, then return it.

    Returns
    -------
    str
        The tweet exactly as typed.
    """
    while True:
        tweet = input("Input the tweet (must contain atleast 5 words): ")
        # split() without an argument splits on runs of whitespace and drops
        # empty pieces. split(" ") counted those empty pieces as words, so
        # "hi" followed by four spaces passed the check.
        if len(tweet.split()) >= 5:
            return tweet
    

In [9]:
def analyzeTweet(tweet,classifer):
    """Print POS tags, a synonym/antonym per word, and the predicted category of a tweet.

    Parameters
    ----------
    tweet : str or None
        Text to analyze. ``None``, an empty string or whitespace-only text
        is reported as missing and nothing else is done.
    classifer : object
        Trained classifier; its ``classify`` method is called with a
        feature dict.

    The function pauses twice for the user to press enter and returns None.
    """
    # The menu starts with tweet = '', so test for emptiness rather than
    # only for None; otherwise an empty tweet would be analyzed.
    if not tweet or not tweet.strip():
        print("Tweet doesnot exist")
        return
    words = word_tokenize(tweet)
    print("Tweet Part Of Speech Tag :")
    for indexnum, (word, tag) in enumerate(pos_tag(words), start=1):
        print(f"{indexnum}. {word} : {tag}")
    input("Press enter to continue...")
    for word in words:
        print(f"Word: {word}")
        print("---------------------------------------")
        synset = wordnet.synsets(word)
        try:
            # Each [0] raises IndexError when the word has no synset, the
            # synset has no lemma, or the lemma has no antonym.
            synonym = synset[0].lemmas()[0]
            antonym = synonym.antonyms()[0]
        except IndexError:
            print("The word doesnot have any synonym or antonym")
            print("-------------------------------------------------")
            continue
        print("Synonym")
        print(f"(+){synonym.name()}")
        print("         Antonym")
        print(f"        (-){antonym.name()}")
        print("---------------------------------------")
    # Build the features the same way as the training data: cleaned,
    # lower-cased, lemmatized tokens mapped to a boolean. Raw-token counts
    # (FreqDist) do not line up with what the classifier was trained on.
    cleaned = lemmatizingword(removenumber(removesymbol(removestopwords(words))))
    features = {word: True for word in cleaned}
    print("Tweet Category :",classifer.classify(features))
    input("Press enter to continue...")

Main menu

In [10]:
# Load the saved classifier (or train one) before showing the menu.
classifer = findModel()
tweet = ''

# Menu loop: "1" stores a new tweet, "2" analyzes the stored tweet,
# "3" leaves the loop; any other input just shows the menu again.
while True:
    print("1. Write tweet", "2. Analyze tweet", "3. Exit", sep="\n")
    choice = input()

    if choice == "3":
        break
    if choice == "1":
        tweet = writeTweet()
    elif choice == "2":
        analyzeTweet(tweet,classifer)

1. Write tweet
2. Analyze tweet
3. Exit
1. Write tweet
2. Analyze tweet
3. Exit
1. Write tweet
2. Analyze tweet
3. Exit
Tweet Part Of Speech Tag :
1. Monkey : NNP
2. can : MD
3. not : RB
4. improve : VB
5. himself : PRP
6. stupid : JJ
Word: Monkey
---------------------------------------
The word doesnot have any synonym or antonym
-------------------------------------------------
Word: can
---------------------------------------
The word doesnot have any synonym or antonym
-------------------------------------------------
Word: not
---------------------------------------
The word doesnot have any synonym or antonym
-------------------------------------------------
Word: improve
---------------------------------------
Synonym
(+)better
         Antonym
        (-)worsen
---------------------------------------
Word: himself
---------------------------------------
The word doesnot have any synonym or antonym
-------------------------------------------------
Word: stupid
------------------