In [1]:
"""
A script to annotate the dialog data with sentiment 
"""
#This is how to install packages
import sys
#!{sys.executable} -m pip install vaderSentiment
#!{sys.executable} -m pip install pymongo

In [2]:
"""
Some examples of the code
"""
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Installation-dependent alternative (e.g. a source-code download instead of pip install):
#from vaderSentiment import SentimentIntensityAnalyzer

# --- examples -------
sentences = [
    "Catch utf-8 emoji such as such as 💘 and 💋 and 😁",  # emojis handled
    "Not bad at all",  # Capitalized negation
    "I hate this stuff",
    "Not nice",
]

# One shared analyzer instance; the annotation cells further down reuse it.
analyzer = SentimentIntensityAnalyzer()

# Score each sample lazily (one polarity_scores call per printed row) and print the
# sentence left-aligned, dash-padded to 65 columns, followed by its score dict.
for sentence, vs in zip(sentences, map(analyzer.polarity_scores, sentences)):
    print("{:-<65} {}".format(sentence, str(vs)))

Catch utf-8 emoji such as such as 💘 and 💋 and 😁------------------ {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'compound': 0.7003}
Not bad at all--------------------------------------------------- {'neg': 0.0, 'neu': 0.513, 'pos': 0.487, 'compound': 0.431}
I hate this stuff------------------------------------------------ {'neg': 0.649, 'neu': 0.351, 'pos': 0.0, 'compound': -0.5719}
Not nice--------------------------------------------------------- {'neg': 0.7, 'neu': 0.3, 'pos': 0.0, 'compound': -0.3252}


In [3]:
# Connection to the database.
# Start the MongoDB server first: sudo service mongod start (27017 is the default port)
from pymongo import MongoClient
import json

class Mongo_Connector():
    '''
    Wrapper class for some of the pymongo functions: http://api.mongodb.com/python/current/tutorial.html

    Attributes set by __init__:
        mongo_client -- MongoClient created with pymongo's default arguments
        db           -- handle of the database selected by name from that client
    '''

    def __init__(self, db_name):
        '''
        Create a client with default settings and select the database `db_name`.
        '''
        # spin up database
        self.mongo_client = MongoClient()
        self.db = self.mongo_client[db_name]
        # NOTE(review): this message is printed unconditionally and no query is sent
        # to the server here, so it presumably does not prove that mongod is
        # reachable -- confirm against the pymongo MongoClient documentation.
        print("Connection success.")
    
    def count_all_docs(self, collection_name):
        '''
        Print and return the number of documents stored in `collection_name`.
        '''
        count = self.db[collection_name].count_documents({})
        print ("%d dialogues in %s" % (count, collection_name))
        return count
    
    def bulk_load(self, collection_name, json_file_path, chunk_size=1000):
        '''
        Imports a big dataset from a single JSON file

        The file must hold one JSON object that maps a dialogue id to the dialogue
        (itself a JSON object). Each id is stored as the `_id` field of its
        dialogue, and the dialogues are inserted in batches of `chunk_size`.

        Returns the number of dialogues read from the file.
        '''
        # load JSON; the encoding is stated explicitly so that non-ASCII dialogue
        # text is read the same way regardless of the platform's default encoding
        with open(json_file_path, encoding="utf-8") as f:
            all_docs = json.load(f)
        
        n_docs = len(all_docs)
        print("Loading %d dialogues"%n_docs)

        target = self.db[collection_name]

        # iterate over the docs
        chunk = []
        for _id, dialogue in all_docs.items():
            dialogue['_id'] = _id
            chunk.append(dialogue)
            if len(chunk) == chunk_size:
                # insert chunk
                target.insert_many(chunk)
                chunk = []
        # insert the last (possibly shorter) chunk
        if chunk:
            target.insert_many(chunk)

        # show the doc counter
        self.count_all_docs(collection_name)
        return n_docs


# Open the wrapper on the 'cm' database; `mongo.db` is the handle the
# annotation cells use to reach their collections.
db_name = 'cm'
mongo = Mongo_Connector(db_name)

Connection success.


In [4]:
# Annotate the dstc6 messages: select the collection and open a cursor over
# every document in it (empty filter).

collection = mongo.db['dstc6']
cursor = collection.find({})
# Debugging aid: peek at the first document.
#varI = cursor[0]
#print(varI)

In [5]:
# Label every turn of every document in `collection` with a coarse sentiment tag
# and write the labelled turns back to MongoDB:
#   'N'  - the negative proportion is strictly larger than both positive and neutral
#   'P'  - the positive proportion is strictly larger than both negative and neutral
#   'Ne' - everything else (ties included)
# NOTE(review): the VADER README recommends thresholding the 'compound' score for
# classification; comparing raw proportions presumably labels most neutral-heavy
# text 'Ne' -- confirm that this rule is intended.
count = 0
# A fresh query is issued here instead of reusing a cursor from another cell: a
# cursor can be iterated only once, so re-running this cell against an already
# consumed cursor would silently annotate nothing.
for doc in collection.find({}):
    for turn in doc['turns']:
        vs = analyzer.polarity_scores(turn['utterance'])
        if vs['neg'] > vs['pos'] and vs['neg'] > vs['neu']:
            turn['sentiment'] = 'N'
        elif vs['pos'] > vs['neg'] and vs['pos'] > vs['neu']:
            turn['sentiment'] = 'P'
        else:
            turn['sentiment'] = 'Ne'
    # Only `turns` was modified, so only that field is written back (the original
    # re-sent the whole document, `_id` included). No upsert: the document was
    # just read from this collection, so there is nothing to create.
    collection.update_one({'_id': doc['_id']}, {"$set": {'turns': doc['turns']}})
    if count % 10 == 0: print("document " + str(count) + " annotaed")
    count += 1

document 0 annotaed
document 10 annotaed
document 20 annotaed
document 30 annotaed
document 40 annotaed
document 50 annotaed
document 60 annotaed
document 70 annotaed
document 80 annotaed
document 90 annotaed


In [4]:
# The msdialog sentiment annotation: select the collection and open a cursor
# over every document in it (empty filter).

collection = mongo.db['msdialog']
cursor = collection.find({})

In [5]:
# Label every utterance of every document in `collection` with a coarse sentiment
# tag and write the labelled utterances back to MongoDB:
#   'N'  - the negative proportion is strictly larger than both positive and neutral
#   'P'  - the positive proportion is strictly larger than both negative and neutral
#   'Ne' - everything else (ties included)
# NOTE(review): the VADER README recommends thresholding the 'compound' score for
# classification; comparing raw proportions presumably labels most neutral-heavy
# text 'Ne' -- confirm that this rule is intended.
count = 0
# A fresh query is issued here instead of reusing a cursor from another cell: a
# cursor can be iterated only once, so re-running this cell against an already
# consumed cursor would silently annotate nothing.
for doc in collection.find({}):
    for entry in doc['utterances']:
        vs = analyzer.polarity_scores(entry['utterance'])
        if vs['neg'] > vs['pos'] and vs['neg'] > vs['neu']:
            entry['sentiment'] = 'N'
        elif vs['pos'] > vs['neg'] and vs['pos'] > vs['neu']:
            entry['sentiment'] = 'P'
        else:
            entry['sentiment'] = 'Ne'
    # Only `utterances` was modified, so only that field is written back (the
    # original re-sent the whole document, `_id` included). No upsert: the
    # document was just read from this collection, so there is nothing to create.
    collection.update_one({'_id': doc['_id']}, {"$set": {'utterances': doc['utterances']}})
    if count % 5000 == 0: print("document " + str(count) + " annotaed")
    count += 1

document 0 annotaed
document 5000 annotaed
document 10000 annotaed
document 15000 annotaed
document 20000 annotaed
document 25000 annotaed
document 30000 annotaed
document 35000 annotaed


In [25]:
# Spot check that the dstc6 messages are annotated now: print the sentiment
# label stored on turn index 10 of the first document the query returns.
# NOTE(review): assumes that document has at least 11 turns -- confirm.

collection = mongo.db['dstc6']
cursor = collection.find({})
print (cursor[0]['turns'][10]['sentiment'])

P


In [26]:
# Spot check that the msdialog messages are annotated now: print the sentiment
# label stored on the first utterance of the first document the query returns.

collection = mongo.db['msdialog']
cursor = collection.find({})
print (cursor[0]['utterances'][0]['sentiment'])

Ne
