In [3]:
import openai
import re
import json
import os
from dotenv import load_dotenv 
import google.generativeai as genai
#import PIL.Image

In [5]:
# load environment variable
load_dotenv()

# load open-api-key from environment variable.
# openai_api_key = os.getenv('OPENAI_API_KEY')
genai_api_key = os.getenv("GOOGLE_API_KEY")


# check if the call was successful
if genai_api_key:
    print("Successful")
else:
    print("Error loading key")

Successful


In [6]:
# get api_key
# openai.api_key = openai_api_key
genai.configure(api_key=genai_api_key)

In [10]:
# write a function to interact with openai model
# def interact(prompt):
#     response = openai.chat.completions.create(
#         model = 'gpt-3.5-turbo',
#         messages = [
#         {
#             "role": "user",
#             "content": prompt
#         }
#         ],
#     )
#     return response.choices[0].message.content

def interact(prompt):
    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)
    return response.text

In [11]:
# test the function
interact("Hi, anybody there?")

"Hi there! I'm here, how can I help you today? \n"

In [14]:
# Setup the article title
article = "What is data collection?"

# Inject the article title into the base prompt
base_prompt = f'''Write a numbered, hierarchical outline for an article on {article}\n\nHere is an example,\
                of the structure:\n\n1. Introduction \n    a. Definition of digital marketing \n2.\ 
                Types of Digital Marketing \n    a. Search Engine Optimization \n\   
                b. Social Media Marketing \n    c. Content Marketing \n    d. Pay-Per-Click Advertising \n\    
                e. Email Marketing \n3. Benefits of Digital Marketing \n    a. Cost-Effective \n    b. Targeted Audience \n\    
                c. Measurable Results \n    d. Increased Reach \n\n----\n'''

result = interact(base_prompt)
print(result)

Here's a hierarchical outline for an article on "What is Data Collection":

## What is Data Collection?

1. **Introduction**
    a. **Defining Data Collection:** 
        i. The process of gathering raw information.
        ii. Key role in various fields (research, business, technology, etc.).
    b. **Purpose of Data Collection:** 
        i. To understand phenomena, trends, and patterns.
        ii. To support decision-making and problem-solving.
        iii. To improve products, services, and processes. 

2. **Types of Data**
    a. **Quantitative Data:**
        i. Numerical information (e.g., age, height, income, sales figures).
        ii. Measured and analyzed using statistical methods.
    b. **Qualitative Data:**
        i. Descriptive information (e.g., opinions, experiences, beliefs).
        ii. Collected through interviews, surveys, observations, focus groups.
    c. **Structured Data:**
        i. Organized and easily searchable (e.g., data in databases, spreadsheets).
  

In [18]:
# extract main sections
main_sections = re.findall(r"\d+\..*?(?=^\d+|\Z)", result, re.MULTILINE | re.DOTALL)

# extract subsections
sections = {}

for section in main_sections:
    section_tittle = re.search(r"\d+\..+", section).group(0)
    sub_sections = re.findall(r"\s+[a-z]\..+", section, re.MULTILINE)
    sections[section_tittle] = [heading.strip() for heading in sub_sections]

In [19]:
print(sections)

{'1. **Introduction**': ['a. **Defining Data Collection:**', 'i. The process of gathering raw information.', 'b. **Purpose of Data Collection:**', 'i. To understand phenomena, trends, and patterns.'], '2. **Types of Data**': ['a. **Quantitative Data:**', 'i. Numerical information (e.g., age, height, income, sales figures).', 'b. **Qualitative Data:**', 'i. Descriptive information (e.g., opinions, experiences, beliefs).', 'c. **Structured Data:**', 'i. Organized and easily searchable (e.g., data in databases, spreadsheets).', 'd. **Unstructured Data:**', 'i. Raw, unorganized data (e.g., text documents, images, audio recordings).'], '3. **Methods of Data Collection**': ['a. **Surveys:**', 'i. Questionnaires distributed to a sample population.', 'b. **Interviews:**', 'i. Direct conversations with individuals.', 'c. **Observations:**', 'i. Observing and recording behavior, events, or phenomena.', 'd. **Experiments:**', 'i. Controlled environments to test hypotheses.', 'e. **Focus Groups:**

In [36]:
prompt = "Produce an article outline for \"What is data engineering?\"\n\nHere is an example, of the structure. Always return valid JSON.\n\n{\n\"top_heading_one:\": [\"subheading_one\", \"subheading_two\"],\n\"top_heading_two:\": [\"subheading_one\", \"subheading_two\"],\n} \n\nRemember that the ouput must be like the above, and must be parsable JSON.\n\n----\n\n"
result = interact(prompt)

In [37]:
print(result)

{
"What is Data Engineering?": [
"Defining Data Engineering",
"Key Responsibilities of a Data Engineer",
"The Importance of Data Engineering",
"Data Engineering Skills and Technologies",
"Data Engineering Tools and Platforms",
"The Future of Data Engineering"
],
"The Role of Data Engineering in Data Science": [
"Data Engineering for Data Science Projects",
"Data Pipelines and Data Warehousing",
"Data Quality and Data Governance",
"Data Visualization and Analytics",
"Data Engineering for Machine Learning"
],
"Data Engineering Applications and Industries": [
"Data Engineering in Healthcare",
"Data Engineering in Finance",
"Data Engineering in E-commerce",
"Data Engineering in Manufacturing",
"Data Engineering in Marketing"
],
"Conclusion": [
"Summary of Data Engineering Concepts",
"The Value of Data Engineering for Businesses",
"Future Trends in Data Engineering"
]
} 



In [38]:
json.loads(result)

{'What is Data Engineering?': ['Defining Data Engineering',
  'Key Responsibilities of a Data Engineer',
  'The Importance of Data Engineering',
  'Data Engineering Skills and Technologies',
  'Data Engineering Tools and Platforms',
  'The Future of Data Engineering'],
 'The Role of Data Engineering in Data Science': ['Data Engineering for Data Science Projects',
  'Data Pipelines and Data Warehousing',
  'Data Quality and Data Governance',
  'Data Visualization and Analytics',
  'Data Engineering for Machine Learning'],
 'Data Engineering Applications and Industries': ['Data Engineering in Healthcare',
  'Data Engineering in Finance',
  'Data Engineering in E-commerce',
  'Data Engineering in Manufacturing',
  'Data Engineering in Marketing'],
 'Conclusion': ['Summary of Data Engineering Concepts',
  'The Value of Data Engineering for Businesses',
  'Future Trends in Data Engineering']}

In [39]:
try:
  print(json.loads(result))
except Exception as e:
  
  pass
  # Parsing the text and trying to extract the data from the string

{'What is Data Engineering?': ['Defining Data Engineering', 'Key Responsibilities of a Data Engineer', 'The Importance of Data Engineering', 'Data Engineering Skills and Technologies', 'Data Engineering Tools and Platforms', 'The Future of Data Engineering'], 'The Role of Data Engineering in Data Science': ['Data Engineering for Data Science Projects', 'Data Pipelines and Data Warehousing', 'Data Quality and Data Governance', 'Data Visualization and Analytics', 'Data Engineering for Machine Learning'], 'Data Engineering Applications and Industries': ['Data Engineering in Healthcare', 'Data Engineering in Finance', 'Data Engineering in E-commerce', 'Data Engineering in Manufacturing', 'Data Engineering in Marketing'], 'Conclusion': ['Summary of Data Engineering Concepts', 'The Value of Data Engineering for Businesses', 'Future Trends in Data Engineering']}


In [52]:
import yaml

prompt = "Produce an article outline in the format a .yml file for \"What is data collection?\"\n\nAlways return valid YML.\n\nHere is an example:\n\n- name: Example YAML File\n  description: This is an example YAML file.\n  sections:\n    - title: Introduction\n      content: |\n        This is the introduction.\n    - title: Conclusion\n      content: |\n        This is the conclusion.\n\n----\n\n\n"
text = interact(prompt)
cleaned_text = text.strip("```yaml\n")

In [53]:
print(cleaned_text)

- name: What is Data Collection?
  description: An overview of the process of gathering information for analysis and insights.
  sections:
    - title: Introduction
      content: |
        - Define data collection as a fundamental process in various fields.
        - Briefly explain its importance in decision-making, research, and business.
    - title: Key Concepts
      content: |
        - **Data:** Define data as raw facts and figures.
        - **Data Sources:**  Explain different types of data sources (primary, secondary).
        - **Data Collection Methods:**  Outline various methods: 
            - Surveys and questionnaires
            - Interviews
            - Observations
            - Experiments
            - Web scraping
            - APIs
            - Sensors and IoT devices
    - title: Stages of Data Collection
      content: |
        - **Planning and Design:** 
            - Determine objectives and research questions.
            - Identify data sources and coll

In [54]:
# Load the YAML string into a Python object
data = yaml.load(cleaned_text, Loader=yaml.FullLoader)

In [55]:
print(data)

[{'name': 'What is Data Collection?', 'description': 'An overview of the process of gathering information for analysis and insights.', 'sections': [{'title': 'Introduction', 'content': '- Define data collection as a fundamental process in various fields.\n- Briefly explain its importance in decision-making, research, and business.\n'}, {'title': 'Key Concepts', 'content': '- **Data:** Define data as raw facts and figures.\n- **Data Sources:**  Explain different types of data sources (primary, secondary).\n- **Data Collection Methods:**  Outline various methods: \n    - Surveys and questionnaires\n    - Interviews\n    - Observations\n    - Experiments\n    - Web scraping\n    - APIs\n    - Sensors and IoT devices\n'}, {'title': 'Stages of Data Collection', 'content': '- **Planning and Design:** \n    - Determine objectives and research questions.\n    - Identify data sources and collection methods.\n    - Develop data collection instruments (surveys, questionnaires).\n- **Data Gatherin