In [48]:
import os
import json
import time
from google.cloud import firestore
from google.api_core.exceptions import DeadlineExceeded
from google.api_core.retry import Retry
from openai import OpenAI

# initialize the OpenAI client
client = OpenAI(api_key="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")


def show_json(obj):
    display(json.loads(obj.model_dump_json()))


# Assume the google-cloud-firestore import is available in the development environment


# Initialize Firestore client with specific project ID, only if file is found
if os.path.isfile("vk-linkedin-master-service-account.json"):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = (
        "vk-linkedin-master-service-account.json"
    )

db = firestore.Client(project="vk-linkedin", database="linkedin")

# Get the 'extracted' collection
extracted_ref = db.collection("extracted")

# analysis collection
analysis_ref = db.collection("analysis")

In [None]:
#  load to pandas dataframe
import pandas as pd

# Fetch documents
docs = extracted_ref.stream()

# Create a list of dictionaries from documents, assuming each document's fields represent a row
data = [doc.to_dict() for doc in docs]

# Load data into a pandas DataFrame
df = pd.DataFrame(data)


In [None]:
# show the first element of the list of dictionaries data as json
doc_dic = data[0]
print(doc_dic["summary"])

for doc_dic in data:
    if len(doc_dic["summary"]) > 100:
        summary = doc_dic["summary"].replace("\n", " ")
        print(summary)
        break

In [50]:
print(json.dumps(doc_dic, indent=4))

{
    "invitedDateISO": null,
    "addToTargetDateISO": "2024-03-22T03:51:02.709Z",
    "profileUrl": "https://www.linkedin.com/sales/people/ACwAAACYzD0BspG6AqNwNjAqUpdsVXzp4173sXU,OK6o,NAME_SEARCH/",
    "extra": {
        "industry": null,
        "id": 2943,
        "summary": null,
        "locationName": "Scottsdale, Arizona, United States",
        "personId": 23934
    },
    "addToTargetDate": "March 21, 2024 10:51:02 PM",
    "externalIds": [
        {
            "authType": "NAME_SEARCH",
            "type": "sn-hash-id",
            "id": 91715,
            "sentAtToPAS": "2024-03-22T04:27:41.044Z",
            "createdAt": "2024-03-21T04:27:05.087Z",
            "hash": "ACwAAACYzD0BspG6AqNwNjAqUpdsVXzp4173sXU",
            "externalId": "ACwAAACYzD0BspG6AqNwNjAqUpdsVXzp4173sXU",
            "authToken": "OK6o",
            "personId": 23934
        },
        {
            "memberId": 10013757,
            "type": "member-id",
            "id": 91716,
            "sentAtT

In [35]:
#  helper function to wait for the run to finish

def wait_on_run(run, thread):
    """
    Wait for a run to complete.

    This function continuously checks the status of a run in a thread. If the run is queued or in progress, 
    it retrieves the run and waits for half a second before checking again. It stops checking and returns the run 
    once the run is no longer queued or in progress.

    Args:
        run (openai.api_resources.abstract.APIResource): The run to wait for.
        thread (openai.api_resources.abstract.APIResource): The thread that the run is in.

    Returns:
        openai.api_resources.abstract.APIResource: The completed run.

    Note:
        This function uses the OpenAI API, which requires an API key and may incur costs.
    """
    
    while run.status == "queued" or run.status == "in_progress":
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
        time.sleep(1)
    return run

In [51]:
# analyze the summary with the OpenAI Assitant
def analyze_summary(summary) -> dict:
    """
    Analyze the summary with the OpenAI Assistant.

    This function creates a new thread and adds the summary to it. It then creates a run with a specific assistant ID.
    The function waits for the run to complete and retrieves the messages from the thread. The last message is cleaned
    and converted to a dictionary.

    Args:
        summary (str): The summary to be analyzed.

    Returns:
        dict: The response from the OpenAI Assistant as a dictionary. If the response cannot be decoded as JSON,
        it returns None and prints an error message.

    Raises:
        json.JSONDecodeError: If the response cannot be decoded as JSON.
    """

    
    try: 
    # create a thread
        thread = client.beta.threads.create()

        #  add summary to the thread
        message = client.beta.threads.messages.create(
            thread_id=thread.id, role="user", content=summary
        )

        # create  run
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id="asst_mxA64EdU5qHi9wNVpEmQRQZr",
        )
    except Exception as e:
        print(f"Error createing Open AI threat: {e}")
        return None

    # get the run result
    try:
        run = wait_on_run(run, thread)
    except Exception as e:
        print(f"Error Getting results from Open AI threat: {e}")
        return None

    # get messages
    messages = client.beta.threads.messages.list(thread_id=thread.id)
    message = messages.data[0]  # the last message

    #  clean response
    response = message.content[0].text.value
    response = response.replace("```json\n", "").replace("```", "")

    # convert rsponse to dictionary
    try:
        res = json.loads(response)
        return res
    except json.JSONDecodeError:
        print("Failed to decode JSON: ", response)
        return None

In [52]:
# helper function to analyze the summary
def get_analysis(summary):
    try:
        analysis = analyze_summary(summary)
        return analysis
    except Exception as e:
        print(f"Error analyzing summary: {e}")
        return None


In [53]:
# process all documents in collection `extracted` via generator

# retry policy for DeadlineExceeded in case of timeout in Firestore
retry = Retry(predicate=lambda e: isinstance(e, DeadlineExceeded))


docs_generator = (doc for doc in extracted_ref.stream(retry=retry))

for doc in docs_generator:
    doc_dict = doc.to_dict()

    # chek if the document already exists
    doc_id = doc.id
    doc_ref = analysis_ref.document(doc_id)

    # for new document
    if not doc_ref.get().exists:
        summary = doc_dict["summary"]

        if len(summary) > 50:
            # get analysis of summary via OpenAI Assistant
            try:
                analysis = get_analysis(summary)

                # if analysis is not None
                if analysis is not None:
                    # print('-----------------\n', doc_id, doc_dict['fullName'], summary, analysis)

                    if (
                        "industry" in analysis
                        and "function" in analysis
                        and "seniority" in analysis
                    ):
                        new_doc = {
                            "profileUrl": doc_dict["profileUrl"],
                            "lh_id": doc_dict["lhId"],
                            "industry": analysis["industry"],
                            "function": analysis["function"],
                            "seniority": analysis["seniority"],
                            "summary": summary,
                        }

                        # create document in new collection
                        analysis_ref.document(doc_id).set(new_doc)
                else:
                    # wait for 10 seconds before trying again
                    time.sleep(10)
            except Exception as e:
                print(e)
                time.sleep(10)
                continue
        

Failed to decode JSON:  Dubai, United Arab Emirates
Medsol Diagnostics
Business Development Manager
Failed to decode JSON:  Lupin Diagnostics
Lupin Diagnostics Limited
Head - PMT & Business Operations Group
Mumbai, Maharashtra, India
