In [1]:
import numpy as np
import pandas as pd
import json
import requests
import urllib
import httplib
import base64
import time

In [2]:
sample_text_array = ["Such a lovely hackathon we are in!", "We are definitely going to", "win and to go Paris!"]

sample_text = {}
for i in range(150):
    sample_text[i] = sample_text_array[i % 3]

# sample_text = {0 : "Such a lovely hackathon we are in!", 
#                1 : "We are definitely going to", 
#                2 : "win and to go Paris!"
#               }

pd_text = pd.DataFrame(pd.Series(sample_text).rename("text"))
pd_text.head()

Unnamed: 0,text
0,Such a lovely hackathon we are in!
1,We are definitely going to
2,win and to go Paris!
3,Such a lovely hackathon we are in!
4,We are definitely going to


In [3]:
pd_text = pd.read_csv("Demo.csv")
len(pd_text)

110

In [4]:
api_endpoint = "https://westus.api.cognitive.microsoft.com/text/analytics/v2.0"
api_key_one = "xxxx"
api_key_two = "xxxx"

In [5]:
def generate_json_request(pd_text):
    request = {}
    request["documents"] = []
    
    for index, row in pd_text.iterrows():
        phrase = {}
        phrase["lang"] = "en"
        phrase["id"] = index
        phrase["text"] = row["text"]
        request["documents"].append(phrase)
    return json.dumps(request)

# json_request = generate_json_request(pd_text)
# print json_request

In [6]:
def process_request(json_request):
    header = {
    'Accept' : 'application/json',
    'Content-Type': 'application/json',
    'Ocp-Apim-Subscription-Key': 'xxxx',
    }
    try:
        conn = httplib.HTTPSConnection('westus.api.cognitive.microsoft.com')
        
        conn.request("POST", "/text/analytics/v2.0/sentiment", json_request, header)
        response = conn.getresponse()
        sentiment_data = response.read()
        
        conn.request("POST", "/text/analytics/v2.0/keyPhrases", json_request, header)
        response = conn.getresponse()
        key_phrase_data = response.read()
        
        conn.close()
        return sentiment_data, key_phrase_data
    except Exception as e:
        print("[Errno {0}] {1}".format(e.errno, e.strerror))
        return None    

In [7]:
# sentiment_data, key_phrase_data = process_request(json_request)

In [8]:
def add_text_analysis(base_df, sentiment_data, key_phrase_data):
    sentiment_dict = {}
    for entry in json.loads(sentiment_data)["documents"]:
        sentiment_dict[int(entry["id"])] = entry["score"]

    key_phrase_dict = {}
    
    for entry in json.loads(key_phrase_data)["documents"]:
        key_phrase_dict[int(entry["id"])] = entry["keyPhrases"]

    final_pd = pd.DataFrame(base_df)
    final_pd["sentiment_score"] = pd.Series(sentiment_dict)
    final_pd["key_phrases"] = pd.Series(key_phrase_dict)
    return final_pd

In [9]:
def main(base_df):
    json_request = generate_json_request(base_df)
    sentiment_data, key_phrase_data = process_request(json_request)
    final_df = add_text_analysis(base_df, sentiment_data, key_phrase_data)
    return final_df

In [10]:
# final_pd = main(pd_text)

In [11]:
# final_pd

In [12]:
def topic_request(json_request):
    header = {
    'Accept' : 'application/json',
    'Content-Type': 'application/json',
    'Ocp-Apim-Subscription-Key': 'xxxx',
    }
    try:
        conn = httplib.HTTPSConnection('westus.api.cognitive.microsoft.com')        
        conn.request("POST", "/text/analytics/v2.0/topics", json_request, header)
        response = conn.getresponse()
        output_address = response.getheader("operation-location")
        operation_id = output_address.split("/")[-1] 
        print "OperationId: ", operation_id
        conn.close()
        
        request_status = "NotStarted"
        while request_status != "Succeeded" and request_status != "Failed":
            print request_status
            time.sleep(60)
            conn = httplib.HTTPSConnection('westus.api.cognitive.microsoft.com')
            conn.request("GET", "/text/analytics/v2.0/operations/" + operation_id,
                         "",
                         header
                        )
            response = conn.getresponse()
            response_json = json.loads(response.read())
            request_status = response_json["status"]
            if request_status == "Succeeded":
                topic_data = response_json["operationProcessingResult"]["topics"]
                assignment_data = response_json["operationProcessingResult"]["topicAssignments"]
            conn.close()
        if request_status == "Succeeded":
            return topic_data, assignment_data
        return None            
            
    except Exception as e:
        print("[Errno {0}] {1}".format(e.errno, e.strerror))
        return None       

In [13]:
def add_topic_analysis(base_df, topic_data, assignment_data):
    assignment_dict = {}
    distance_dict = {}
    for entry in assignment_data:
        assignment_dict[int(entry["documentId"])] = entry["topicId"]
        distance_dict[int(entry["documentId"])] = entry["distance"]

    topic_id_dict = {}
    key_phrase_dict = {}
    topic_score_dict = {}
    topic_counter = 0
    for entry in topic_data:
        topic_id_dict[topic_counter] = entry["id"]
        key_phrase_dict[topic_counter] = entry["keyPhrase"]
        topic_score_dict[topic_counter] = entry["score"]
        topic_counter += 1
        
    topic_pd = pd.DataFrame(pd.Series(topic_id_dict).rename("topic_id"))
    topic_pd["topic_phrase"] = pd.Series(key_phrase_dict)
    topic_pd["topic_score"] = pd.Series(topic_score_dict)

    final_pd = pd.DataFrame(base_df)
    final_pd["topic_id"] = pd.Series(assignment_dict)
    final_pd["topic_distance"] = pd.Series(distance_dict)
    final_pd = pd.merge(final_pd, topic_pd, on="topic_id", how="left")
    final_pd.sort_values("start_second", inplace=True)
    return final_pd

In [14]:
def get_topics_main(base_df):
    json_request = generate_json_request(base_df)
    topic_data, assignment_data = topic_request(json_request)
    final_df = add_topic_analysis(base_df, topic_data, assignment_data)
    return final_df

In [15]:
print get_topics_main(pd_text)

OperationId:  d22766123ee644eda747857058bff951
NotStarted


KeyboardInterrupt: 

In [102]:
header = {'Accept' : 'application/json', 'Content-Type': 'application/json','Ocp-Apim-Subscription-Key': '28d6776ec31844a1aeb1095be8d99192'}
conn = httplib.HTTPSConnection('westus.api.cognitive.microsoft.com')
conn.request("POST", "/text/analytics/v2.0/topics", generate_json_request(pd_text), header)
response = conn.getresponse()
# print response.msg
address = response.getheader("operation-location")
print address
conn.close()

https://westus.api.cognitive.microsoft.com/text/analytics/v2.0/operations/f2af3d6a2c7f4ff9b684338b6c44f483


In [104]:
print address.split("/")[-1]

['https:', '', 'westus.api.cognitive.microsoft.com', 'text', 'analytics', 'v2.0', 'operations', 'f2af3d6a2c7f4ff9b684338b6c44f483']


In [100]:
conn = httplib.HTTPSConnection('westus.api.cognitive.microsoft.com')
conn.request("GET", "/text/analytics/v2.0/operations/333b6200098544d688268ed4e6ccc110",
             "",
             header
            )
response = conn.getresponse()
print response.msg
print response.status
print response.read()
conn.close()

Transfer-Encoding: chunked
Content-Type: application/json; charset=utf-8
x-ms-transaction-count: 1
x-aml-ta-request-id: eefb145e-667a-4140-b5f1-3887ef33e7ba
X-Content-Type-Options: nosniff
apim-request-id: 603609c1-d941-44bf-8480-38473505f607
Strict-Transport-Security: max-age=31536000; includeSubDomains; preload
Date: Sat, 24 Jun 2017 15:51:05 GMT

200
{"status":"Succeeded","createdDateTime":"2017-06-24T15:44:39Z","operationType":"topics","operationProcessingResult":{"topics":[{"id":"d520399f-e646-4c66-82f3-b0df361fa1fc","score":2.0,"keyPhrase":"christina iran rancho"},{"id":"b792d024-2472-45ef-9af4-f41940840989","score":2.0,"keyPhrase":"iran christina rachel"},{"id":"210665b6-55ff-4064-bb08-b2d089b7e7ab","score":5.0,"keyPhrase":"municipality"},{"id":"87c9f484-194c-4cc7-a3b1-9d300334b686","score":2.0,"keyPhrase":"municipality on christina"},{"id":"78dd4491-ad17-4641-9385-03cdbf1871d8","score":3.0,"keyPhrase":"real estate developer"},{"id":"5c8e8d4d-ae41-40c5-bac0-c9594843a7e6"

In [109]:
output = topic_request(generate_json_request(pd_text))
print type(output)

OperationId:  3aceb6388d2f4ae9964004e54016b80d
NotStarted
NotStarted
Running
Running
Running
Running
Running
<type 'str'>


In [135]:
print len(json.loads(output)["operationProcessingResult"]["topicAssignments"])

42


In [146]:
# json.loads(output)["operationProcessingResult"]["topics"]
add_topic_analysis(pd_text, 
                   json.loads(output)["operationProcessingResult"]["topics"], 
                   json.loads(output)["operationProcessingResult"]["topicAssignments"]).sort_values("start_second")

Unnamed: 0.1,Unnamed: 0,confidence,start_second,text,topic_id,topic_distance,topic_phrase,topic_score
0,0,0.869492,0,here is an example of an online meeting with 2...,3a02ce04-cc57-4f77-b7c1-7d397cb7ed90,0.4119,piece of land,2.0
89,89,0.833244,5000,discount senior project to build a convention ...,3a02ce04-cc57-4f77-b7c1-7d397cb7ed90,0.4119,piece of land,2.0
1,1,0.860131,10000,owned by municipality the members of the numbe...,0921feb5-cff4-4020-9051-3436bf5764ba,0.6430,real estate developer,3.0
12,12,0.858510,15000,the real estate developer or alexander in braz...,e2d3ce37-42fe-47d8-8bab-ce67f50fd924,0.6355,mac,3.0
23,23,0.803286,20000,connected with APC tom in india connected with...,e2d3ce37-42fe-47d8-8bab-ce67f50fd924,0.5036,mac,3.0
34,34,0.860507,25000,and matt in the united states connected with a...,07090802-401f-44cf-988f-43b62798020b,0.5629,municipality on christina,2.0
45,45,0.780116,30000,verse of the number to the municipality of chr...,07090802-401f-44cf-988f-43b62798020b,0.6181,municipality on christina,2.0
56,56,0.795216,35000,united states with the PC he ran in india with...,cf07d95b-1ef8-4b7c-b67f-83d3b6e7ee8c,0.3662,ran in india,1.0
67,67,0.858898,40000,rancho in the united states with it BC and vic...,a01726e0-3b5c-455c-9cb2-1beb7a084f30,0.3662,vicki in france,1.0
78,78,0.910249,45000,in france with an iphone and I have the host o...,e2d3ce37-42fe-47d8-8bab-ce67f50fd924,0.7377,mac,3.0
