# Instructions

### Preparation
1. Ensure you have the `requests`, `beautifulsoup4`, and `pandas` libraries installed. If not, install them using `!pip install requests beautifulsoup4 pandas`.
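2. Enter the desired save file destinations by editing the `word_counts_file` and `phrases_file` paths in the `save_dataframes_to_csv` function (around lines 90–91 of the next cell)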
3. Run the code in the next cell to define the functions (click on the cell, Shift+Enter)

### Usage options

##### Syntax:

`main(URL(s), phrase(s), want_words=False, want_phrases=False)`\
The function is called in the last two code cells. Run it by clicking on the desired cell and pressing Shift+Enter.

##### Arguments:
- `URL(s)` (mandatory): A string or list of strings `['url1', 'url2', 'url3']` containing the URL(s) to analyze
- `phrase(s)` (optional): A string or list of strings `['phrase1', 'phrase2', 'phrase3']` containing the phrase(s) to count. Default is an empty list.
- `want_words` (optional): A boolean `True` or `False` to save word counts to a CSV file. Default is `False`.
- `want_phrases` (optional): A boolean `True` or `False` to save phrase counts to a CSV file. Default is `False`.

### Notes
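- The `main` function prints a phrase count to the cell output only when a single phrase is passed as a string. Counts for a list of phrases are not printed; they are only written to the CSV file (when `want_phrases=True`).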
- This script will not accept numbers as phrases, only words.


In [17]:
import requests
from collections import Counter
from bs4 import BeautifulSoup
import pandas as pd
import re
import os
from datetime import datetime

def get_word_counts(url, timeout=30):
    """Download a web page and count how often each word occurs in its text.

    A word is a run of ASCII letters, optionally followed by one apostrophe
    and more letters (e.g. ``don't``); digits and other punctuation are
    ignored. Counting is case-sensitive, so ``The`` and ``the`` end up in
    separate rows.

    Args:
        url: Address of the page to analyze.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would block the whole run.

    Returns:
        A pandas DataFrame with the columns ``URL``, ``Word`` and ``Count``,
        sorted alphabetically by word (ignoring case).

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out, or the server answers with an error status code.
    """
    # Fetch the webpage content; raise on 4xx/5xx instead of parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Reduce the HTML to its visible text
    text = BeautifulSoup(response.text, 'html.parser').get_text()

    # Split the text into words, excluding punctuation except for apostrophes and excluding numbers
    words = re.findall(r"\b[a-zA-Z]+(?:'[a-zA-Z]+)?\b", text)

    # Count the occurrences of each word
    word_counts = Counter(words)

    # One row per distinct word, with the source URL as the first column
    word_counts_df = pd.DataFrame(list(word_counts.items()), columns=['Word', 'Count'])
    word_counts_df.insert(0, 'URL', url)

    # Sort alphabetically by word, ignoring case (not by count)
    word_counts_df = word_counts_df.sort_values(
        by='Word', ascending=True, key=lambda col: col.str.lower()
    ).reset_index(drop=True)

    return word_counts_df

def get_phrase_counts(url, phrases, timeout=30):
    """Count how often one or more phrases occur in the text of a web page.

    Matching is an exact, case-sensitive substring count (``str.count``), so
    overlapping occurrences are not counted twice and a phrase also matches
    inside longer words.

    Args:
        url: Address of the page to analyze.
        phrases: Either a single phrase as a string, or a list of phrase
            strings.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would block the whole run.

    Returns:
        * If ``phrases`` is a string: the number of occurrences as an ``int``.
        * If ``phrases`` is an empty list: an empty DataFrame with only the
          columns ``Phrase`` and ``Count`` (no ``URL`` column); the page is
          not downloaded in this case.
        * Otherwise: a DataFrame with the columns ``URL``, ``Phrase`` and
          ``Count``, sorted alphabetically by phrase (ignoring case).

    Raises:
        requests.RequestException: If the page cannot be fetched, the request
            times out, or the server answers with an error status code.
    """
    # Nothing to count: report it and skip the (pointless) download
    if not isinstance(phrases, str) and len(phrases) == 0:
        print('No phrase count: Empty list of phrases')
        return pd.DataFrame(columns=['Phrase', 'Count'])

    # Fetch the webpage content; raise on 4xx/5xx instead of parsing an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Reduce the HTML to its visible text
    text = BeautifulSoup(response.text, 'html.parser').get_text()

    # Return a plain count if the input is a single string
    if isinstance(phrases, str):
        return text.count(phrases)

    # One row per phrase; str.count always yields an int, so no NaN handling is needed
    phrases_df = pd.DataFrame({'Phrase': phrases})
    phrases_df['Count'] = [text.count(phrase) for phrase in phrases_df['Phrase']]

    # Add a column with the URL
    phrases_df.insert(0, 'URL', url)

    # Sort alphabetically by phrase, ignoring case (not by count)
    phrases_df = phrases_df.sort_values(
        by='Phrase', ascending=True, key=lambda col: col.str.lower()
    ).reset_index(drop=True)

    return phrases_df

def save_dataframes_to_csv(url, word_counts_df, phrases_df, want_words=False, want_phrases=False):
    """Write the word and/or phrase counts of one URL to timestamped CSV files.

    File names are built from the URL (every run of non-word characters is
    replaced by ``_``) plus a ``YYYYMMDDHHMMSS`` timestamp of the local time,
    and are placed in ``~/Downloads``. The destination folder is created if
    it does not exist yet.

    Args:
        url: The analyzed URL; used for the file name and the status message.
        word_counts_df: DataFrame of word counts; only used if ``want_words``.
        phrases_df: DataFrame of phrase counts; only used if ``want_phrases``.
        want_words: Save ``word_counts_df`` when true.
        want_phrases: Save ``phrases_df`` when true.

    Returns:
        None. A line is printed for every file that was written.
    """
    # Nothing requested: avoid touching the file system at all
    if not (want_words or want_phrases):
        return

    # Format the current date and time as YYYYMMDDHHMMSS
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')

    # Generate file names based on the URL
    base_name = re.sub(r'\W+', '_', url)
    word_counts_file = os.path.expanduser(f'~/Downloads/{base_name}_word_counts_{timestamp}.csv')
    phrases_file = os.path.expanduser(f'~/Downloads/{base_name}_phrases_{timestamp}.csv')

    # Collect (label, data, destination) for everything that was requested
    targets = []
    if want_words:
        targets.append(('word', word_counts_df, word_counts_file))
    if want_phrases:
        targets.append(('phrase', phrases_df, phrases_file))

    # Save the DataFrames as CSV files
    for label, frame, path in targets:
        # to_csv fails if the folder is missing (e.g. no ~/Downloads on a server)
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        frame.to_csv(path, index=False)
        print(f'{url} {label} counts saved to: {path}')

    return

# Process a single URL: count phrases and words, then save what was requested
def single_url(url, phrases_list=None, want_words=False, want_phrases=False):
    """Analyze one URL and optionally save its word and phrase counts.

    Args:
        url: Address of the page to analyze.
        phrases_list: A single phrase as a string, or a list of phrases.
            ``None`` (the default) is treated as an empty list. When a single
            string is given, its count is also printed.
        want_words: Save the word counts to a CSV file when true.
        want_phrases: Save the phrase counts to a CSV file when true.

    Returns:
        None.

    Raises:
        RuntimeError: If the phrase counts and/or the word counts could not
            be obtained. Results that were obtained successfully are still
            saved before the error is raised, so the caller can report the
            URL as failed without losing the partial output.
    """
    if phrases_list is None:
        phrases_list = []

    # A single phrase is wrapped in a list so the page is only counted once;
    # its count is printed from the resulting table below
    single_phrase = None
    if isinstance(phrases_list, str):
        single_phrase = phrases_list
        phrases_list = [single_phrase]

    failures = []

    # Try getting phrase counts. `except Exception` (not a bare except) so
    # that Ctrl-C still interrupts a long run.
    phrases_df = None
    try:
        phrases_df = get_phrase_counts(url, phrases_list)
    except Exception as exc:
        print(f'Error getting phrase counts for {url}: {exc}')
        failures.append('phrase counts')
    else:
        if single_phrase is not None:
            count = int(phrases_df['Count'].iloc[0])
            print(f'{url} count of "{single_phrase}": {count}')

    # Try getting word counts
    word_counts_df = None
    try:
        word_counts_df = get_word_counts(url)
    except Exception as exc:
        print(f'Error getting word counts for {url}: {exc}')
        failures.append('word counts')

    # Save only the tables that actually exist; a failed fetch must not
    # prevent the other result from being written
    save_dataframes_to_csv(
        url,
        word_counts_df,
        phrases_df,
        want_words and word_counts_df is not None,
        want_phrases and phrases_df is not None,
    )

    # Let the caller know this URL was not fully processed
    if failures:
        raise RuntimeError(f'Could not get {" and ".join(failures)} for {url}')

    return
    
def main(urls, phrases_list=None, want_words=False, want_phrases=False):
    """Run the word/phrase analysis for one URL or for a collection of URLs.

    Args:
        urls: A single URL string, or a list (or tuple) of URL strings.
        phrases_list: A single phrase as a string, or a list of phrases to
            count. ``None`` (the default) is treated as an empty list.
        want_words: Save the word counts of every URL to a CSV file when true.
        want_phrases: Save the phrase counts of every URL to a CSV file when
            true.

    Returns:
        None. Progress and a final summary are printed. A URL that fails is
        reported and, for a collection, the remaining URLs are still
        processed.
    """
    if phrases_list is None:
        phrases_list = []

    # A single URL: run it and report the outcome
    if isinstance(urls, str):
        try:
            single_url(urls, phrases_list, want_words, want_phrases)
            print('URL processed successfully')
        # `except Exception` (not a bare except) so Ctrl-C still stops the run
        except Exception as exc:
            print(f'Error processing URL: {exc}')
        return

    # A collection of URLs: process each one and collect the failures
    if isinstance(urls, (list, tuple)):
        errors = []
        for url in urls:
            try:
                single_url(url, phrases_list, want_words, want_phrases)
            except Exception as exc:
                print(f'Error processing URL: {url} ({exc})')
                errors.append(url)

        if not errors:
            print('All URLs processed successfully')
        else:
            print('Error URLs:\n')
            # Unpack the list so that `sep` puts every failed URL on its own line
            print(*errors, sep='\n')
            print(f'Error processing {len(errors)} URLs. All other URLs processed successfully')
        return

    # Error if input is neither a string nor a list/tuple
    print('Error: Invalid URL input')
    return
    

In [3]:
# Pages to analyze (edit this list)
urls = [
    "https://developer.atlassian.com/cloud/confluence/getting-started-with-connect/",
    "https://developer.atlassian.com/cloud/confluence/storing-data-with-entity-properties/",
    "https://developer.atlassian.com/cloud/confluence/connect-frameworks-and-tools/",
]

# Phrases to count on each page (edit this list)
phrases = [
    "complete this tutorial",
    "Atlassian Connect",
    "Confluence Cloud",
    "Connect framework",
    "cloud development site",
]

In [20]:
# Analyze every page in `urls` for the phrases in `phrases` and write both the
# word counts and the phrase counts of each page to CSV files
main(
    urls=urls,
    phrases_list=phrases,
    want_words=True,
    want_phrases=True,
)

https://developer.atlassian.com/cloud/confluence/getting-started-with-connect/ count of "is": 35
https://developer.atlassian.com/cloud/confluence/storing-data-with-entity-properties/ count of "is": 12
https://developer.atlassian.com/cloud/confluence/connect-frameworks-and-tools/ count of "is": 8
All URLs processed successfully


In [21]:
# One URL with one phrase given as a string: the count is printed, and since
# both save flags keep their default of False, no CSV file is written
main(
    urls='https://en.wikipedia.org/wiki/SAP',
    phrases_list='SAP',
)

https://en.wikipedia.org/wiki/SAP count of "SAP": 334
URL processed successfully
