<a href="https://colab.research.google.com/github/Jaunson/Cool_Demos/blob/main/Gemini_Brand_Research_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Brand Research Notebook

This notebook uses jina.ai to scrape a given URL and then performs brand research on the returned content with Gemini. To use it effectively, you will need both a jina.ai API key and a Gemini API key.

If you find this notebook helpful, I ask that you consider supporting my work.

[🧋 Buy me a Coffee](https://buymeacoffee.com/johnjamisod)

Code is hidden by default for ease of use, but can be easily modified.


For Jina.ai go to "API Key & Billing"
- [jina.ai, 1M Tokens free](https://jina.ai/reader)

For Gemini follow the link and then click "Get API Key"
- [Gemini AI Studio](https://aistudio.google.com/app/apikey)




<script type="text/javascript" src="https://cdnjs.buymeacoffee.com/1.0.0/button.prod.min.js" data-name="bmc-button" data-slug="johnjamisod" data-color="#FF5F5F" data-emoji="☕"  data-font="Poppins" data-text="Buy me a coffee" data-outline-color="#000000" data-font-color="#ffffff" data-coffee-color="#FFDD00" ></script>

In [3]:
#@title Initialize API Keys and Supporting Libraries
from urllib.parse import urlparse, urljoin, urlunparse, quote, unquote
from pprint import pprint
import requests
import json
import os
from datetime import datetime, timedelta
import random
from time import sleep
import pandas as pd
import google.generativeai as genai
from google.generativeai import caching

JINA_API_KEY = "" #@param {type:"string"}
GEMINI_API_KEY = "" #@param {type:"string"}

def fetch_url_content(url, jina_api_key, with_links_summary=True, with_images_summary=True, with_generated_alt=True, no_cache=True, timeout=10, custom_headers=None):
    """
    Fetches content from a given URL using Jina AI's API with customizable headers.

    :param url: URL to fetch content from; it is appended verbatim to "https://r.jina.ai/".
    :param jina_api_key: Jina AI API key, sent as a Bearer token. Required (there is no default).
    :param with_links_summary: Toggle for 'X-With-Links-Summary' header.
    :param with_images_summary: Toggle for 'X-With-Images-Summary' header.
    :param with_generated_alt: Toggle for 'X-With-Generated-Alt' header.
    :param no_cache: Toggle for 'X-No-Cache' header.
    :param timeout: value sent in the 'X-Timeout' header, default 10 seconds. It is also
        used to derive the client-side timeout of the HTTP call (see below).
    :param custom_headers: Optional dict of headers; its entries override the defaults built here.
    :return: On HTTP 200, the "data" member of the JSON response body. For any other
        status code, a dict of the form {"error": <reason phrase>, "status_code": <int>}.
    :raises requests.exceptions.RequestException: if the connection fails or the
        client-side timeout expires.
    :raises ValueError, KeyError: if a 200 response is not valid JSON or has no "data" member.

    custom header for cookies:
    "X-Set-Cookie": "<cookie-name-1>=<cookie-value>; domain=<cookie-1-domain>, <cookie-name-2>=<cookie-value>; domain=<cookie-2-domain>; Secure",
    """
    def as_flag(enabled):
        # The toggles are sent as the literal strings "true" / "false".
        return "true" if enabled else "false"

    # Default headers based on function arguments
    headers = {
        "Authorization": f"Bearer {jina_api_key}",
        "Accept": "application/json",
        "X-With-Links-Summary": as_flag(with_links_summary),
        "X-With-Images-Summary": as_flag(with_images_summary),
        "X-With-Generated-Alt": as_flag(with_generated_alt),
        "X-No-Cache": as_flag(no_cache),
        "X-Timeout": f"{timeout}"
    }

    # If custom headers are provided, update the default headers with them
    if custom_headers:
        headers.update(custom_headers)

    # requests waits forever unless given a timeout, so one stalled connection would
    # hang the whole crawl. The bound is derived from the effective X-Timeout header
    # (presumably the server-side page-load budget -- verify against the Jina docs)
    # plus a generous margin so slow-but-working requests are not cut off.
    try:
        client_timeout = float(headers.get("X-Timeout", timeout)) + 30
    except (TypeError, ValueError):
        client_timeout = 60

    # Make the request
    response = requests.get("https://r.jina.ai/" + url, headers=headers, timeout=client_timeout)
    if response.status_code != 200:
        return {"error": response.reason, "status_code": response.status_code}
    return response.json()["data"]

def normalize_url(url):
    """
    Return *url* with a canonicalised path component.

    The path is percent-decoded, stripped of trailing slashes, lower-cased and
    has its spaces replaced by hyphens before being percent-encoded again
    (keeping '/' unescaped). Scheme, host, params, query and fragment are
    passed through unchanged.
    """
    parts = urlparse(url)

    # Work on the decoded path so that "%20" and a literal space are treated alike.
    path = unquote(parts.path)
    path = path.rstrip('/')
    path = path.lower()
    path = path.replace(' ', '-')

    rebuilt = (
        parts.scheme,
        parts.netloc,
        quote(path, safe='/'),
        parts.params,
        parts.query,
        parts.fragment,
    )
    return urlunparse(rebuilt)

def join_urls(base, relative):
    """Resolve *relative* against *base* and return the resulting URL (delegates to urljoin)."""
    resolved = urljoin(base, relative)
    return resolved

def get_root_url(url):
    """
    Return the "<scheme>://<netloc>" prefix of *url*.

    Path, query and fragment are discarded. A URL without scheme and host
    yields the bare separator "://".
    """
    parts = urlparse(url)
    return "://".join((parts.scheme, parts.netloc))

def _link_targets(links):
    """Yield the URL strings contained in a page's 'links' payload."""
    if isinstance(links, dict):
        # NOTE(review): a mapping is presumably {anchor text: URL}, so the targets are
        # the values -- confirm against an actual Jina Reader JSON response.
        links = links.values()
    for link in links or ():
        if isinstance(link, str):  # anything else cannot be joined into a URL
            yield link


def crawl_website(start_url, max_urls, jina_api_key):
    """
    Crawl a site breadth-first, starting at *start_url*, and collect page content.

    Only links that share the scheme and host of *start_url* are followed.

    :param start_url: first URL to fetch; its root ("scheme://host") bounds the crawl.
    :param max_urls: maximum number of pages to collect successfully.
    :param jina_api_key: API key passed through to fetch_url_content.
    :return: DataFrame with one row per successfully fetched page and the columns
        title, url, content, images, links, timestamp_collected and root_url.
    """
    from collections import deque  # local import: O(1) pops from the front of the queue

    root_url = get_root_url(start_url)  # Extract the root URL
    queued = {start_url}  # Every URL ever enqueued, so nothing is queued or fetched twice
    to_visit = deque([start_url])  # Queue of URLs to visit
    urls_crawled = 0  # Counter for the number of URLs crawled
    df = pd.DataFrame(columns=['title', 'url', 'content', 'images', 'links', 'timestamp_collected', 'root_url'])

    while to_visit and urls_crawled < max_urls:
        url = to_visit.popleft()  # Get the next URL

        try:
            content = fetch_url_content(url, jina_api_key)
        except Exception as e:
            # Best-effort crawl: one unreachable page must not abort the whole run.
            print(f"Failed to fetch content from {url}: {e}")
            continue

        # fetch_url_content reports non-200 responses as an error dict rather than
        # raising; such a result is not a page and must not consume crawl budget.
        if not isinstance(content, dict):
            print(f"Failed to fetch content from {url}: unexpected response {content!r}")
            continue
        if "error" in content and "status_code" in content:
            print(f"Failed to fetch content from {url}: {content['status_code']} {content['error']}")
            continue

        content['timestamp_collected'] = datetime.now().isoformat()
        content['root_url'] = root_url
        df.loc[len(df)] = content
        print(f"Visited {url}")
        urls_crawled += 1  # Increment the counter

        if urls_crawled >= max_urls:
            print("Reached the maximum number of URLs to crawl.")
            break

        for link in _link_targets(content.get('links')):
            abs_link = urljoin(url, link)  # Ensure the link is absolute
            # Stay on the starting site and never enqueue the same URL twice.
            if abs_link not in queued and get_root_url(abs_link) == root_url:
                queued.add(abs_link)
                to_visit.append(abs_link)

        # Pause for a random 1 to 5 seconds between fetches (skipped after the last page).
        sleep(random.randint(1, 5))
    return df

#

# Running your Scrape

Your mileage may vary when using a free VM on Colab; generally 30 - 120 URLs are enough to generate a useful report. I'd advise opting for 120 or fewer URLs, as this typically fills the 2 million token limit on Gemini.

In [None]:
#@title Enter the website you'd like to scrape along with the number of urls you'd like to crawl. "https://" will be prepended to any url entered without a scheme

brand_starting_url = '' #@param {type:"string"}
max_urls =  5 #@param {type:"integer"}

# Clean up the form input before crawling: an empty value cannot be crawled, and a
# URL that already carries a scheme must not be turned into "https://https://...".
brand_starting_url = brand_starting_url.strip()
if not brand_starting_url:
    raise ValueError("Please enter the website to scrape in 'brand_starting_url'.")
if not brand_starting_url.lower().startswith(("http://", "https://")):
    brand_starting_url = "https://" + brand_starting_url

# Crawl up to max_urls pages of the site; the resulting DataFrame feeds the report cells.
df = crawl_website(brand_starting_url, max_urls, JINA_API_KEY)

# Analysis Prompts

All prompts are configured to work through their analysis step-by-step and to give the reasoning and confidence levels behind their responses.

- **Persona**: Analyzes website content to construct a profile of the likely audience personas a brand is targeting. Information includes potential demographics, psychographics, goals, motivations, preferred communication channels, and a narrative description of the persona.

- **Content Gap**: Analyzes website content to identify potentially underserved audience segments and gives recommendations on how to potentially extend your web content to better serve the niche.

- **Product/Service**: Analyzes content to provide a comprehensive product/service positioning report. The report includes key features / benefits, target audience, unique selling points, and market positioning. If pricing is available, it will attempt to generate a table of products, price ranges, and product categories.

- **Brand**: Analyzes content and generates a report that talks about brand identity, positioning, personality, values, and voice.

In [11]:
#@title Initial Analysis Prompts for Persona, Content Gap, Product/Service, and Branding Analysis

# General instructions for the model: role, analysis guidelines and output style.
# This text is sent as the first chat message, before the scraped data and the
# individual analysis prompts.
report_builder_prompt = """
        You are an advanced AI assistant specializing in marketing and advertising analysis.
        Your primary function is to process and analyze web content to provide comprehensive,
        data-driven insights for marketing professionals. Follow these guidelines
        for all analyses:

        1. Objectivity: Maintain a neutral, fact-based approach.
           Avoid subjective interpretations unless explicitly requested.

        2. Data-driven: Base all conclusions on the provided content.
           Clearly state when making assumptions or inferences.

        3. Confidence levels: For key insights and recommendations,
           provide a confidence level (0-100) based on the available data.

        4. Actionable insights: Focus on practical, implementable recommendations
           for marketing strategies.

        5. Structured presentation: Use clear headings, bullet points,
           and where appropriate, tables or lists for easy readability.

        6. Conciseness: Provide thorough analyses while avoiding unnecessary verbosity.
           Prioritize the most impactful insights.

        7. Marketing focus: Tailor all analyses to be directly relevant to marketing and advertising professionals.
           Consider implications for brand positioning, audience targeting, content strategy, and campaign planning.

        8. Holistic view: Consider how insights from different analyses (e.g., personas, content gaps, positioning)
           interrelate and impact overall marketing strategy.

        9. Ethical considerations: Flag any potential ethical concerns in marketing approaches or data usage.

        10. Limitations transparency: Clearly state the limitations of the analysis based on the available data.
            Suggest additional data points that could enhance the analysis if relevant.

        11. Industry context: Where possible, relate insights to broader industry trends or best practices in marketing and advertising.

        12. Call out unique findings: Highlight any unexpected or counterintuitive insights that could provide a competitive edge.

        13. Future orientation: Include forward-looking insights and recommendations,
            considering potential market evolution and emerging trends.

        14. Quantitative summary: Where applicable, provide numerical summaries
            (e.g., percentages, ratios) to support qualitative insights.

        15. Terminology: Use industry-standard marketing and advertising terminology,
            but avoid jargon that might obscure meaning.

        Approach each analysis task with these principles in mind, adapting your response style and depth to the
        specific requirements of each prompt while maintaining consistency in quality and relevance for marketing professionals.
        """

# Prompt for the 'persona' report: asks for 1-2 primary and 1 secondary audience
# personas, plus insights, methodology and limitations.
# (The stray closing double quote that trailed the last sentence has been removed.)
persona_prompt = """
Analyze the provided content to construct audience personas for the company:

Primary Personas (1-2):

Develop 1-2 detailed primary audience personas based on the most prevalent characteristics in the content.
For each persona, include:
    a. Demographic information (age range, gender, location, income level, education)
    b. Psychographic characteristics (values, interests, lifestyle, pain points)
    c. Goals and motivations related to the company's offerings
    d. Preferred communication channels and content types
Provide a short narrative description for each persona
Support each characteristic with specific examples or quotes from the content
Indicate confidence level (0-100, 100 being highest) for each aspect of the persona

Secondary Persona (1):

Develop 1 secondary audience persona representing a less prominent but still significant audience segment
Include the same details as for the primary personas
Explain why this persona is considered secondary

Analysis and Insights:

Discuss how the identified personas align with the company's products/services
Highlight any unexpected findings or potential opportunities
Note areas where the content provides limited or conflicting information
Suggest additional data that could enhance the persona development

Methodology and Limitations:

Briefly explain your approach to synthesizing the personas from the content
Discuss any limitations or potential biases in the analysis due to the nature of the scraped content
Indicate overall confidence level in the persona analysis

Present your findings in a clear, structured format using bullet points and subheadings for easy readability.
Aim for a comprehensive yet concise analysis that provides actionable insights for the company's marketing and product development teams.
"""

# Prompt for the 'content_gap' report: asks for underserved audience segments or
# needs, each with reasoning, suggestions and a confidence level.
# (The stray closing double quote that trailed the last sentence has been removed.)
content_gap_prompt = """
Taking your other analyses and the data into full consideration perform a thorough Content Gap Analysis-

Identify potential audience segments or needs that appear underserved by the current content. For each gap identified:
• Clearly describe the audience segment or need
• Explain your reasoning using specific examples from the text
• Provide 2-3 actionable suggestions for addressing the gap
• Assign a confidence level (0-100, 100 being highest) to your assessment
Present your findings in a clear, concise format using bullet points or numbered lists where appropriate.
Highlight any assumptions you're making or areas where additional context would be beneficial to your analysis.
Conclude with a brief summary of the most significant content gaps and their potential impact on the company's content strategy.

Note: Base your analysis solely on the provided content.
If you find the information insufficient for any part of the analysis, clearly state this limitation.
"""

# Prompt for the 'product' report: asks for a product/service positioning analysis
# covering offerings, market positioning, value proposition, positioning strategies,
# communication channels, trends, recommendations and limitations.
product_service_analysis_prompt = """
Analyze the provided content to develop a comprehensive Product/Service Positioning Analysis for the company:

Core Product/Service Offering:

Identify and describe the main products or services offered by the company
For each major offering, provide:
    a. Key features and benefits
    b. Target audience
    c. Unique selling propositions (USPs)
Support each point with specific examples or quotes from the content
Indicate confidence level (0 - 100, 100 being highest) for each aspect

Market Positioning:

Analyze how the company positions itself in the market:
    a. Brand identity and values
    b. Price positioning (As a markdown table with price ranges, product categories, and name of product)
    c. Quality perception
    d. Innovation level
Identify the company's main competitors and how it differentiates itself
Provide evidence from the content to support your analysis
Note any inconsistencies or ambiguities in positioning

Value Proposition:

Synthesize the company's overall value proposition
Explain how it addresses customer pain points or desires
Evaluate the clarity and effectiveness of the value proposition
Suggest potential improvements or refinements

Positioning Strategies:

Identify specific positioning strategies used by the company, such as:
    a. Benefit-oriented positioning
    b. User-oriented positioning
    c. Competitor-oriented positioning
    d. Category-oriented positioning
Provide examples of how these strategies are implemented
Assess the effectiveness of each strategy based on the available information

Communication Channels:

Analyze how the company communicates its positioning:
    a. Key messaging themes
    b. Tone and style of communication
    c. Primary channels used (e.g., website, social media, advertising)
Evaluate the consistency of messaging across different channels

Market Trends and Opportunities:

Identify relevant market trends mentioned in the content
Highlight potential opportunities for repositioning or expanding the offering
Note any threats to the current positioning

Analysis and Recommendations:

Summarize the strengths and weaknesses of the current positioning
Provide 2-3 actionable recommendations for improving or refining the positioning
Indicate confidence level for each recommendation

Methodology and Limitations:

Briefly explain your approach to analyzing the positioning from the content
Discuss any limitations or potential biases in the analysis
Indicate overall confidence level in the positioning analysis

Present your findings in a clear, structured format using bullet points and subheadings for easy readability.
Aim for a comprehensive yet concise analysis that provides actionable insights for the company's marketing and product development teams.
"""

# Prompt for the 'brand' report: asks for an analysis of six brand characteristics
# (identity, positioning, target audience, personality, values, voice) and spells
# out the exact report layout the model should follow.
brand_prompt = """
Analyze this web content and provide a comprehensive, objective report on the following brand characteristics:

1. Brand Identity
2. Brand Positioning
3. Target Audience
4. Brand Personality
5. Brand Values
6. Brand Voice

For each characteristic:

Provide a detailed, impartial analysis based on the available web content
Assign a confidence level (0-100, 100 being highest) to your assessment
Present your findings in a clear, concise format using bullet points or numbered lists where appropriate
Highlight any assumptions you're making or areas where additional context would be beneficial to your analysis
Include both positive aspects and potential areas for improvement
Support your analysis with specific examples from the web content

After analyzing these characteristics, please:

Summarize the strengths and weaknesses of the current brand positioning, ensuring a balanced view
Provide 2-3 actionable recommendations for improving or refining the positioning, based on objective observations
Indicate a confidence level for each recommendation
Include potential challenges or obstacles for implementing each recommendation

General Guidelines:

Base your analysis solely on the publicly available web content of the brand
Maintain strict objectivity in your assessment; avoid overly positive or negative bias
Use specific examples from the web content to support your findings
Consider the consistency of the brand across different web pages or platforms
If certain information is not available or unclear, note this in your report
Provide genuine, constructive feedback, including critiques where warranted
Avoid making assumptions about the brand's internal processes or decision-making
Consider how the brand's presentation might be perceived by different audience segments

Format your report as follows:

Objective Brand Analysis Report for [BRAND NAME]
1. Brand Identity
[Your impartial analysis here]
Confidence Level: [0-100]
Positive Aspects:

[List positive aspects]
Areas for Improvement:
[List areas for improvement]

2. Brand Positioning
[Your impartial analysis here]
Confidence Level: [0-100]
Positive Aspects:

[List positive aspects]
Areas for Improvement:
[List areas for improvement]

3. Target Audience
[Your impartial analysis here]
Confidence Level: [0-100]
Positive Aspects:

[List positive aspects]
Areas for Improvement:
[List areas for improvement]

4. Brand Personality
[Your impartial analysis here]
Confidence Level: [0-100]
Positive Aspects:

[List positive aspects]
Areas for Improvement:
[List areas for improvement]

5. Brand Values
[Your impartial analysis here]
Confidence Level: [0-100]
Positive Aspects:

[List positive aspects]
Areas for Improvement:
[List areas for improvement]

6. Brand Voice
[Your impartial analysis here]
Confidence Level: [0-100]
Positive Aspects:

[List positive aspects]
Areas for Improvement:
[List areas for improvement]

Objective Summary of Brand Positioning
Strengths:

[List strengths]

Weaknesses:

[List weaknesses]

Recommendations

[Recommendation 1]
Confidence Level: [0-100]
Potential Challenges:

[List potential challenges]


[Recommendation 2]
Confidence Level: [0-100]
Potential Challenges:

[List potential challenges]


[Recommendation 3]
Confidence Level: [0-100]
Potential Challenges:

[List potential challenges]


Assumptions and Areas for Further Research

[List any assumptions made or areas where more context would be helpful]

Limitations of This Analysis

[Discuss limitations of the analysis, such as reliance solely on public web content]
"""

# Ordered (report name, prompt text) pairs; the name becomes part of each report's file name.
analysis_prompts = [
    ('persona', persona_prompt),
    ('content_gap', content_gap_prompt),
    ('product', product_service_analysis_prompt),
    ('brand', brand_prompt),
]

## Run Reports

The cell below, when triggered, will analyze the web content that you have scraped with Gemini. Reports are exported as Markdown files and can be accessed with the 📁 icon on the left-hand side ('/content' folder). Reports can be read directly in the notebook interface and/or downloaded to your local workstation if you are working in Colab.

Reports may take upwards of ***5 minutes to generate***, particularly if you are using a wide context window.

In [16]:
#@title Gemini Builds Markdown Reports
def truncate_string_to_byte_limit(s, byte_limit):
    """
    Return *s* shortened so that its UTF-8 encoding fits in *byte_limit* bytes.

    Strings already within the limit are returned unchanged. When the cut
    falls inside a multi-byte character, the incomplete character is dropped.
    """
    encoded = s.encode('utf-8')
    if len(encoded) <= byte_limit:
        return s

    # errors='ignore' discards the dangling bytes of a character split by the slice.
    return encoded[:byte_limit].decode('utf-8', errors='ignore')

# Authenticate the Gemini client with the key entered in the first cell.
genai.configure(api_key=GEMINI_API_KEY)

# The first root URL found in the crawl results identifies the brand being reported on.
report_brand = df.root_url.unique()[0]

# Ordered (report name, prompt text) pairs driving the report loop.
analysis_prompts = [
    ('persona', persona_prompt),
    ('content_gap', content_gap_prompt),
    ('product', product_service_analysis_prompt),
    ('brand', brand_prompt),
]

# Every listed harm category is set to BLOCK_NONE.
model = genai.GenerativeModel(
    model_name='gemini-1.5-pro-001',
    safety_settings=[
        {"category": category, "threshold": "BLOCK_NONE"}
        for category in (
            "HARM_CATEGORY_HARASSMENT",
            "HARM_CATEGORY_HATE_SPEECH",
            "HARM_CATEGORY_SEXUALLY_EXPLICIT",
            "HARM_CATEGORY_DANGEROUS_CONTENT",
        )
    ],
)

# A single chat session, started with empty history, is shared by all reports.
chat = model.start_chat(history=[])
def send_message_with_retry(chat, message, retries=3, delay=60):
    """
    Send *message* through *chat*, retrying on rate-limit / server errors.

    :param chat: object exposing ``send_message(message)``.
    :param message: payload forwarded unchanged to ``chat.send_message``.
    :param retries: maximum number of attempts, default 3.
    :param delay: seconds to wait between attempts, default 60.
    :return: the value returned by ``chat.send_message``, or None when every
        attempt failed or a non-retryable error occurred.
    """
    for attempt in range(retries):
        try:
            return chat.send_message(message)
        except Exception as e:
            # Errors are classified by looking for the status code in the message text.
            error_text = str(e)
            if '429' not in error_text and '500' not in error_text:
                print(f"Failed with error: {e}")
                break
            if attempt + 1 >= retries:
                # Last attempt: sleeping here would only delay the failure result.
                print(f"Attempt {attempt + 1} failed with error: {e}. No retries left.")
                break
            print(f"Attempt {attempt + 1} failed with error: {e}. Retrying in {delay} seconds...")
            sleep(delay)
    return None

# Prime the chat: general instructions first, then the scraped pages as one text blob.
# The separator is a real newline (a raw string would append a literal backslash-n).
response = send_message_with_retry(chat, report_builder_prompt + "\n ")
string = truncate_string_to_byte_limit(df.query(f'root_url == "{report_brand}"').to_string(), 15000000)
if response is not None:
    # Only send the data when the instructions were accepted.
    response = send_message_with_retry(chat, string)

if response is None:
    print("Failed to generate initial report after retries.")
else:
    # Drop the scheme for the file name. str.strip("https://") would strip a set of
    # characters from both ends and mangle hosts such as "shopify.com".
    brand_slug = report_brand.split("://", 1)[-1]
    last_response = response  # most recent successful response, used for the token report

    for name, report in analysis_prompts:
        response = send_message_with_retry(chat, report)
        if response is None:
            print(f"Failed to generate {name} report after retries.")
            continue
        last_response = response
        file_name = '/content/' + brand_slug + "_" + name + ".md"
        try:
            # Read the text before opening the file so a failed read leaves no empty report behind.
            report_text = response.text
            with open(file_name, 'w', encoding='utf-8') as f:
                f.write(report_text)
        except Exception as e:
            # Best effort: one unwritable report must not stop the remaining ones.
            print(f"Could not write {name} report due to: {e}")

    print(f"Token usage in generating reports: {last_response.usage_metadata.total_token_count}")
    print()
    print("Waiting for 60 seconds to avoid hitting the token limits")
    sleep(60)

Token usage in generating reports: 26103
/n
Waiting for 60 seconds to avoid hitting the token limits
