**Step by step guide to the codes used in the sentiment analysis**
**of the North Carolina Medicaid Reform Public Comments in 2016**

**1. Load helper library files needed for the code**

In [1]:
# Load helper files
import http.client, urllib.request, urllib.parse, urllib.error, base64
import pandas as pd
import string
import json
import pprint
import hashlib
from colorama import Fore, Back, Style

**2. Set the variables for the subscription key and region for the Azure Cognitive Services**

In [2]:
# Set the keys and region where the cognitive services are accessed
"""
    Replace: 
    1. Dummy key with your own Azure key from the Azure Cognitive Services resource
    2. Dummy endpoint endpoint from the Azure Cognitive Services resource without the https://
       For example, if the endpoint on the resource is https://eastus.azure.com, 
       replace the 'Dummy endpoint' with 'eastus.azure.com'
"""
subscription_key = "150f10fe96b34990a527b9de60aae711" # 
cognitive_services_region = "westus2.api.cognitive.microsoft.com" 

# Set the request headers and parameters
headers = {
    # Request headers
    'Content-Type': 'application/json',
    'Ocp-Apim-Subscription-Key':subscription_key
}
params = urllib.parse.urlencode({
  # Request parameters
  'showStats': 'false',
  'model-version': 'V1',
})

**3. Load the downloaded comment file into a data frame**

In [3]:
# Load the comment file into a data frame
"""
    The file used in this demonstrations 'NC Medicaid Comments 062016a.csv' is downloaded 
    directly from the NC MEDICAID site listed below
    https://public.medicaid.gov/connect.ti/public.comments/questionnaireResults?qid=1886531 
    Public Comments on this site are from June to July 2016
"""
commentData = pd.read_csv("NC Medicaid Comments 062016a.csv", header=0, names=["comment"])

commentData['nwords'] = commentData.comment.apply(lambda x: len(x.split()))
commentData['hashed'] = commentData.comment.apply(lambda x: hash("".join(x.split())))

# Remove duplicated record but keep the first occurence of the record
commentData.drop_duplicates(keep='first',inplace=True)
# Reindex the data frame to prevent gaps in the indexes
commentData.reset_index(drop=True, inplace=True)
commentData.head()

Unnamed: 0,comment,nwords,hashed
0,Please make sure that federally qualified heal...,53,4603853928484254947
1,The Medicaid reform proposal ignores some high...,415,-3005735205235233030
2,As an employee of a Western North Carolina Com...,206,3113130487395667677
3,I strongly urge you to include Medicaid Expasi...,79,1737853792026592288
4,The Department of Social Services needs to be ...,30,-5225945899384959805


**4. Help functions**. <br>
 -  _process_comment_  takes a dataframe with the comments in text and call the other functions to process frame
 -  _comment_sentiment_ takes a comment and returns the sentiment by calling the Text Analytics API
 -  _comment_summary_  takes the result from _comment_sentiment_ and summarizes the analysis

In [7]:
def process_comment (comment_df): 
    """
    Take the data frame, get the sentiments and save result to a CSV file

    Args:
        comment_df -- Data frame containing the text to analyze.
    Returns:
         A data frame consisting of the relevant columns
         'id','sentiment', 'positive','negative','neutral'. 
    """
    df2 = comment_df
    # Drop any existing index and use a new one
    df2.reset_index(drop=True, inplace=True) 
    print(u"Processing records in data frame....")
    for i, row in df2.iterrows():
        #print(u"Processing Record... #{}".format(i+1))
        text_data = df2.loc[i,"comment"].encode("utf-8").decode("ascii", "ignore")
        sentimentResult = comment_sentiment (text_data, i+1)
        sentimentSummary = comment_summary(sentimentResult)
        # Add result to data frame
        df2.loc[i, "id"] = i+1
        df2.loc[i, "sentiment"] = sentimentSummary['Sentiment']
        df2.loc[i, "positive"] = sentimentSummary['Positive']
        df2.loc[i, "negative"] = sentimentSummary['Negative']
        df2.loc[i, "neutral"] = sentimentSummary['Neutral']
        dfx = df2[['id','sentiment', 'positive','negative','neutral']]
    print(u"Processing completed....")
    # Ensure that numbers are represented as integers and not float
    convert_dict = {'id': int, 
                'positive': int,
                'negative': int,
                'neutral': int,
                'sentiment': str
               } 
  
    dfx = dfx.astype(convert_dict)
    return  dfx

In [5]:
def comment_sentiment(comment="Welcome to sentiment analysis with Azure Cognitive Services Text Analytics API.", cid=1):
    """
    Take a single comment in string and analyze the sentiment

    Args:
        comment --  The text content to analyze. 
        Default comment:
                "Welcome to sentiment analysis with Azure Cognitive Services Text Analytics API"
        cid -- The numeric id of the comment analyzed. 
        Default value is 1
    """
    language = "en"
    try:
        document = {"id": cid, "language": language, "text": comment }
        body = {"documents": [document]}
        #print(document)
        #print(str(document))
        conn = http.client.HTTPSConnection(cognitive_services_region)
        conn.request("POST", "/text/analytics/v3.0-preview.1/sentiment", str(body), headers)
        response = conn.getresponse()
        data = response.read()
        # Extract key phrases
        return data
    except Exception as e:
        print("[Errno {0}] {1}".format(e.errno, e.strerror))
    finally:
        conn.close()

In [6]:
def comment_summary(commentData):
    """
        Take a single response data from comment_sentiment function and summarizes the result

        Args:
            commentData --  The text response data to summarize. 
    """
    responseJson = json.loads(commentData)
    summary = {"Id": 0, "Sentiment": "", "Positive":0,"Neutral":0,"Negative":0}
    for document in responseJson['documents']:
      summary["Sentiment"] = document['sentiment'].capitalize()
      summary["Id"] = document['id']
      for each in document['sentences']:
         sentimentscore = each['sentiment']
         if sentimentscore == 'positive': 
            summary["Positive"] +=1
         elif sentimentscore  == 'negative':
            summary["Negative"] +=1
         else:
            summary["Neutral"] +=1
    return summary

In [8]:
df = process_comment(commentData)
df.head(10)

Processing records in data frame....
Processing completed....


Unnamed: 0,id,sentiment,positive,negative,neutral
0,1,Mixed,1,1,0
1,2,Mixed,6,6,9
2,3,Negative,0,4,1
3,4,Negative,0,3,2
4,5,Negative,0,1,1
5,6,Mixed,3,4,0
6,7,Neutral,0,0,2
7,8,Mixed,3,1,0
8,9,Mixed,1,1,1
9,10,Mixed,2,1,2


In [9]:
#Save result to CSV to be used in PowerBI
df.to_csv('csv_example.csv')
print(len(df))

1398
