In [1]:
from ahocorapy.keywordtree import KeywordTree
import sys
sys.path.append('..\\sentiment analysis\\release')
import sentiment_prediction as senti

Using TensorFlow backend.


In [2]:
def KeywordWeight(searchResult, words, weights):
    '''
    Pair every keyword found by a search with its weight from the dictionary.

    Parameters
    ----------
    searchResult : iterable
        Match records; only the first element of each record (the matched
        keyword) is read. It is consumed exactly once, so a generator is fine.
    words : sequence
        Dictionary keywords. They must be hashable (e.g. strings).
    weights : sequence
        Weights aligned with ``words`` by position.

    Returns
    -------
    tuple of (list, list)
        ``keyword`` holds every matched keyword in search order, repeats
        included. ``keyword_weight`` holds, in the same order, the weight of
        each keyword that exists in ``words``. A keyword that is missing from
        ``words`` contributes no weight, so the two lists can differ in length.
    '''
    keyword = [result[0] for result in searchResult]

    # Index the dictionary once instead of scanning it for every match.
    # setdefault keeps the first position of a duplicated word, which is the
    # entry a front-to-back scan would have stopped at.
    first_position = {}
    for position, word in enumerate(words):
        first_position.setdefault(word, position)

    keyword_weight = [
        weights[first_position[k]] for k in keyword if k in first_position
    ]
    return keyword, keyword_weight
        
def scoring(keyword, weight, real_sentiment, intended_sentiment, sentilevel):
    '''
    Score = (weight 1 + weight 2 + ... + weight n) * discount factor.

    Every entry of ``weight`` is converted with ``float()`` and added up.
    The discount factor is ``sentilevel`` when ``real_sentiment`` equals
    ``intended_sentiment`` and ``-sentilevel`` otherwise.
    ``keyword`` is accepted but not used in the calculation.
    '''
    total = 0
    # Plain left-to-right accumulation; an empty weight list leaves total at 0.
    for raw_weight in weight:
        total = total + float(raw_weight)
    sentiments_agree = real_sentiment == intended_sentiment
    return total * (sentilevel if sentiments_agree else -sentilevel)

def searcher(filepath):
    '''
    Build a keyword search tree from a topic dictionary file.

    Each non-blank line of the file must have the form ``<word> <weight>``,
    with exactly one space between the two fields. Underscores in ``<word>``
    are turned into spaces, so multi-word phrases are written as
    ``fast_food 1.0``. Blank lines are ignored.

    Parameters
    ----------
    filepath : str
        Path of the dictionary file.

    Returns
    -------
    tuple of (KeywordTree, list, list)
        The finalized, case-insensitive keyword tree, the list of words that
        were added to it, and the list of their weights. The weights are kept
        as the strings read from the file and share positions with the words.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly two fields.
    '''
    kwtree_word = []
    kwtree_weight = []
    # The with-block closes the file even when a malformed line raises below.
    with open(filepath) as f:
        for line in f:
            if not line.strip():
                # A blank line (typically a trailing newline at the end of
                # the file) holds no entry and would break the unpack below.
                continue
            word, weight = line.split(' ')
            kwtree_word.append(word.replace('_', ' '))
            # Drop the line terminator that is still attached to the weight.
            kwtree_weight.append(weight.split('\n')[0])
    kwtree = KeywordTree(case_insensitive=True)
    for word in kwtree_word:
        kwtree.add(word)
    # The tree is finalized here, after every word has been added and before
    # it is handed to the caller for searching.
    kwtree.finalize()
    return kwtree, kwtree_word, kwtree_weight

def topicker(tweet, topics, topic_dict_path, intended_sentiments, analysor):
    '''
    Pick the topic whose dictionary best matches the tweet.

    For every dictionary the keywords found in the tweet are collected, their
    weights are summed, and the sum is multiplied by the sentiment level of
    the tweet (negated when the tweet's sentiment differs from the sentiment
    intended for that topic). The topic with the highest score is returned.

    Please note:

    1. Only the best match is returned, but you can bypass this selection
       mechanism and return the scores list instead, e.g. to store it in
       CouchDB.

    2. The quality of the result depends heavily on the quality of the
       dictionaries and of the intended sentiments defined by the user.

    Parameters
    ----------
    tweet : str
        Text to classify.
    topics : sequence
        Topic names, aligned with ``topic_dict_path`` by position.
    topic_dict_path : sequence
        Dictionary file path for each topic.
    intended_sentiments : sequence
        Sentiment each topic is expected to carry, aligned by position.
    analysor : object
        Must provide ``prediction(tweet)``; element 0 of its result is used
        as the tweet's sentiment and element 1 as the sentiment level.

    Returns
    -------
    The matching entry of ``topics``, or the string ``'NONE OF THESE'`` when
    no dictionary is supplied or the best score is exactly zero.
    '''
    no_match = 'NONE OF THESE'

    prediction = analysor.prediction(tweet)
    real_senti = prediction[0]
    sentilevel = prediction[1]

    scores = []
    for i, dictionary in enumerate(topic_dict_path):
        intended_sentiment = intended_sentiments[i]
        this_searcher, dict_words, dict_weights = searcher(dictionary)
        search_result = this_searcher.search_all(tweet)
        keywords, keywords_weights = KeywordWeight(search_result, dict_words, dict_weights)
        # Arguments follow the declared order of scoring():
        # real sentiment first, intended sentiment second.
        scores.append(scoring(keywords, keywords_weights, real_senti, intended_sentiment, sentilevel))

    if not scores:
        # Nothing to choose from; max() below would raise on an empty list.
        return no_match

    best_score = max(scores)
    # NOTE(review): a best score of exactly zero yields no match even when
    # other topics scored negative, whereas an all-negative score list still
    # returns the least negative topic. This mirrors the existing selection
    # rule; confirm that negative scores are meant to be eligible.
    if best_score == 0.0:
        return no_match
    return topics[scores.index(best_score)]

### How to use it?

In [3]:
# Example tweet to classify.
tweet = "smoking makes me feeling better."

# Candidate topics, the dictionary file of each topic and the sentiment each
# topic is expected to carry. The three lists are matched by position.
# The dictionary file names are relative, so they are resolved against the
# current working directory.
topics = ['fast food', 'smoking', 'alcohols']
topics_dictionary_path = ['fastfood.txt', 'smoking.txt', 'alcohols.txt']
intended_sentiments = ['POSITIVE', 'POSITIVE', 'POSITIVE']
# NOTE(review): absolute, machine-specific Windows paths; adjust them to the
# local checkout before running this cell.
modelpath = 'D:\\COMP90024_Assignment2\\sentiment analysis\\sentiment_lstm.h5'
pklpath = 'D:\\COMP90024_Assignment2\\sentiment analysis\\training_data\\large\\sentiment140-freqdist.pkl'
analysor = senti.sentianalysor(modelpath, pklpath, stemmer = False)

# The raw tweet is replaced by its preprocessed form, which is what gets
# searched and scored below (in the recorded run the trailing period is gone).
tweet = analysor.preprocess_tweet(tweet)
print('Tweet:', tweet, '\n')

# Returns one entry of `topics`, or 'NONE OF THESE' when nothing matches.
matched = topicker(tweet, topics, topics_dictionary_path, intended_sentiments, analysor)
print('Topic matched:', matched)

Tweet: smoking makes me feeling better 

Topic matched: smoking
