In [5]:
import openai
from openai import OpenAI
import nltk
from nltk.tokenize import sent_tokenize
from bs4 import BeautifulSoup
import requests
import os
import matplotlib.pyplot as plt

def open_parse_doc(filename):
    """
    Parse an HTML document from a file and return a BeautifulSoup object.
    
    Args:
        filename (str): The name of the HTML file to be parsed.
    
    Returns:
        BeautifulSoup: A BeautifulSoup object representing the parsed HTML.
    """
    with open(filename, 'r') as file:
        html = file.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup

def get_text(soup):
    """
    Extract text between specified sections in a BeautifulSoup object.
    
    Args:
        soup (BeautifulSoup): The BeautifulSoup object representing the HTML document.
    
    Returns:
        str: Extracted text.
    """
    sec_list = ["Business", "Risk Factors"]
    text = ""
    for i in range(len(sec_list) - 1):
        item_1 = sec_list[i]
        item_2 = sec_list[i + 1]
        a_tag = soup.find('a', text=item_1)
        if a_tag:
            href = a_tag['href']
            href = href[1:]  # removing hashtag in front of id
        a_tag = soup.find('a', text=item_2)
        if a_tag:
            href_2 = a_tag['href']
            href_2 = href_2[1:]  # removing hashtag in front of id
        first_div = soup.find('div', id=href)
        second_div = soup.find('div', id=href_2)
        tag = "div"
        if first_div is None:
            first_div = soup.find('p', id=href)
            second_div = soup.find('p', id=href_2)
            tag = "p"
        if first_div is not None:
            next_div = first_div.find_next_sibling(tag)
            while (next_div != second_div):
                text += next_div.get_text(strip=True)
                next_div = next_div.find_next_sibling(tag)
    return text

def get_topics(text):
    """
    Extract DBPedia topics matching the research and development activities of 'Microsoft'
    from the given text using OpenAI's GPT model.
    
    Args:
        text (str): The text describing research and development activities.
    
    Returns:
        list: List of DBPedia topics.
    """
    research_info = []

    prompt = f"""Extract a set of DBPedia topics that match the research and development activities of 'Microsoft',
    as described in the given text:'{text}'. Give only the title of the DBPedia topic. Dont return any sentence but DBPedia topics. Provide output in the same format as:
    '1. BusinessCompany\n2. iPhone\n3. MacBook Air\n4. MacBook Pro\n5. iMac\n6. Mac mini\n7. Mac Studio\n8. Mac Pro\n9. iPad\n10. iPad Pro\n11. iPad Air\n12. iPad mini\n13. AirPods\n14. AirPods Pro\n15. AirPods Max\n16. Apple TV\n17. Apple TV 4K\n18. Apple TV HD\n19. Apple Watch\n20. Apple Watch Ultra\n21. Apple Watch Series 8\n22. Apple Watch SE\n23. Beats\n24. HomePod mini\n25. AppleCare\n26. Cloud Services\n27. Digital Content\n28. App Store\n29. Apple Arcade\n30. Apple Fitness+\n31. Apple Music\n32. Apple News+\n33. Apple TV+\n34. Apple Card\n35. Apple Pay\n36. Markets and Distribution\n37. Competition\n38. Supply of Components\n39. Research and Development\n40. Intellectual Property\n41. Business Seasonality and Product Introductions\n42. Human Capital\n43. Workplace Practices and Policies\n44. Compensation and Benefits\n45. Inclusion and Diversity\n46. Engagement\n47. Health and Safety\n48. Available Information'
    """
    client = OpenAI(
        api_key="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"
    )

    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )

    # print(completion.choices[0].message)

    # Extract the generated research-related information
    generated_text = completion.choices[0].message.content
    # Append the generated information to the research_info list
    research_info.append(generated_text)
    return research_info

def sort_topics(research_info):
    """
    Sort and process the extracted DBPedia topics from the research information.
    
    Args:
        research_info (list): List of research-related information.
    """
    my_dict = {}
    my_list = []
    for i, info in enumerate(research_info):
        sentences = sent_tokenize(info)
        research_sentences = []
        for sentence in sentences:
            if sentence != "1.":
                modified_string = sentence.split("\n")[0]
                my_list.append(modified_string)

    return my_list
# Dictionary that is going to contain year as a key, list of topics for that year as a value
my_dict = {}
soup = open_parse_doc("Microsoft2023.html")
text = get_text(soup)
research_info = get_topics(text)
list_of_topics = sort_topics(research_info)
my_dict[2023] = list_of_topics

soup = open_parse_doc("Microsoft2022.html")
text = get_text(soup)
research_info = get_topics(text)
list_of_topics = sort_topics(research_info)
my_dict[2022] = list_of_topics

soup = open_parse_doc("Microsoft2021.html")
text = get_text(soup)
research_info = get_topics(text)
list_of_topics = sort_topics(research_info)
my_dict[2021] = list_of_topics

soup = open_parse_doc("Microsoft2020.html")
text = get_text(soup)
research_info = get_topics(text)
list_of_topics = sort_topics(research_info)
my_dict[2020] = list_of_topics

for key,val in my_dict.items():
    print("Year: ", key)
    print("Topics: ", val)

  a_tag = soup.find('a', text=item_1)
  a_tag = soup.find('a', text=item_2)


Year:  2023
Topics:  ['Microsoft', 'Artificial_Intelligence', 'Microsoft_Teams', 'Microsoft_Outlook', 'Bing_(search_engine)', 'Xbox', 'Cloud_computing', 'Microsoft_Cloud', 'Digital_transformation', 'Office_365', 'Dynamics_365', 'LinkedIn', 'Microsoft_Power_Platform', 'Low-code_development_platform', 'Robotic_process_automation', 'Virtual_agent', 'Business_intelligence', 'GitHub', 'Visual_Studio', 'Windows_365', 'Azure_Orbital', 'Microsoft_Mesh', 'Intelligent_edge', 'Internet_of_Things', 'OpenAI', 'Nuance_Communications', 'Mixed_reality', 'Corporate_social_responsibility', 'Sustainability', 'Racial_equity', 'Digital_skills', 'Cybersecurity', 'Generative_AI', 'Human_capital', 'Workplace_flexibility', 'Learning_and_development', 'Productivity_and_Business_Processes', 'Intelligent_Cloud', 'More_Personal_Computing', 'Server_Products_and_Cloud_Services', 'Enterprise_Services', 'Windows_operating_system', 'Patent_licensing', 'Devices_(Microsoft)', 'Gaming_(Microsoft)', 'Search_and_News_Advert

In [None]:
def plot_topics(my_dict):
    """
    Plots R&D topics over the years as a scatter plot with annotations.

    This function takes a dictionary where each key is a year and each value is a list of topics
    related to R&D for that year. It generates a scatter plot with each year marked on the x-axis.
    Topics for each year are displayed as annotations next to the corresponding year mark on the plot.
    The y-axis is not used to encode data and is hidden.

    Parameters:
    - my_dict (dict): A dictionary with years as keys (int or str) and a list of strings (topics) as values.

    Returns:
    - None: The function directly displays the plot and does not return any value.

    Requires:
    - matplotlib.pyplot: This function relies on matplotlib for generating and displaying the plot.
    """
    fig, ax = plt.subplots(figsize=(15, 11))
    years = list(my_dict.keys())
    topics = ["\n".join(my_dict[year]) for year in years]


    ax.scatter(years, [1] * len(years))
    for i, topic in enumerate(topics):
        ax.text(years[i], 1, '  ' + topic, va='center')


    ax.get_yaxis().set_visible(False)  
    ax.spines['left'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.xaxis.set_ticks_position('bottom')
    ax.set_xticks(years)
    ax.set_xticklabels(years, rotation=45, ha='right')
    ax.set_title('R&D Topics by Year')

    plt.show()
plot_topics(my_dict)