### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup

### Web Scraping using BeautifulSoup

In [29]:
# Base URL of the website
base_url = "https://foodmicrobiology.academy/category/"

# List of categories
categories = [
    "bacteria", "blog", "career-guidance", "commentry", "eukaryotic-microbiology",
    "fermented-foods", "food", "food-quality", "food-regulations", "food-safety",
    "food-spoilage", "fungi", "history", "immunity", "prebiotics", "prions",
    "probiotics-and-gut-health", "public-health", "uncategorized", "viruses", "yeast"
]

# Empty list to store scraped data
articles_data = []

# Loop through each category
for category in categories:
    # Construct the full URL for the first page of the category
    category_url = f"{base_url}{category}/page/1/"
    
    # Fetch the first page to determine the total number of pages
    response = requests.get(category_url)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Extract the total number of pages from the subtitle
    subtitle = soup.find('p', class_='archive-subtitle').get_text(strip=True)
    total_pages = int(subtitle.split('/')[-1])  # Extract the number after the "/"

    # Loop through each page in the category
    for page in range(1, total_pages + 1):
        # Construct the URL for each page
        page_url = f"{base_url}{category}/page/{page}/"
        
        # Fetch the content of the current page
        response = requests.get(page_url)
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Extract the category name explicitly
        category_name = soup.find('h1', class_='archive-title').find('span').get_text(strip=True)
        
        # Extract article information
        for article in soup.find_all('article', class_='post'):
            # Extract title and link
            title_element = article.find('h2')
            title = title_element.get_text(strip=True)
            link = title_element.find('a')['href']
            
            # Fetch the detailed article page
            article_response = requests.get(link)
            article_soup = BeautifulSoup(article_response.content, "html.parser")
            
            # Locate the article element on the detailed page
            detailed_article = article_soup.find('article', class_='single-post')
            
            # Extract the date from the article page
            date_element = detailed_article.find('span', class_='post-meta-date')
            date = date_element.find('a').get_text(strip=True) if date_element else 'Unknown'
            
            # Extract the content from the detailed article page
            content = detailed_article.find('div', class_='post-content').get_text(strip=True)
            
            # Extract tags from the 'post-tags' div
            tags_div = detailed_article.find('div', class_='post-tags')
            if tags_div:
                tags = [tag.get_text(strip=True) for tag in tags_div.find_all('a')]
                tags_str = ', '.join(tags)
            else:
                tags_str = ''
            
            # Extract sub-categories (class names starting with 'category-')
            sub_categories = [cls.split('category-')[-1] for cls in article['class'] if 'category-' in cls]
            sub_categories_str = ', '.join(sub_categories)  # Join as a comma-separated string
            
            # Append the article data to the list
            articles_data.append({
                'category': category_name,  # Use the extracted category name
                'sub_categories': sub_categories_str,  # Sub-categories as a string
                'title': title,
                'link': link,
                'date': date,  # Date extracted from the detailed article page
                'content': content,
                'tags': tags_str  # Store the extracted tags
            })

# Convert the data into a DataFrame
df = pd.DataFrame(articles_data)

### Inspecting the scraped data

In [31]:
df.head()

Unnamed: 0,category,sub_categories,title,link,date,content,tags
0,Bacteria,"bacteria, public-health",Understanding Cholera: A brief look into its c...,https://foodmicrobiology.academy/understanding...,"July 13, 2024",IntroductionCholera is a bacterial infection t...,"cholera, food microbiology, Food safety, foodb..."
1,Bacteria,"bacteria, food-quality, fungi, yeast",From HPP Innovation Week – Part 2,https://foodmicrobiology.academy/from-hpp-inno...,"July 5, 2024",This is the second of a two-part series of ove...,"food industry, food manufacturing, food microb..."
2,Bacteria,"bacteria, food-quality",From HPP Innovation week – Part 1,https://foodmicrobiology.academy/from-hpp-inno...,"June 30, 2024",Hiperbaric îs a global leader in commercial hi...,"food manufacturing, food microbiology, food pr..."
3,Bacteria,"bacteria, public-health",Coliforms and their role in ensuring the safet...,https://foodmicrobiology.academy/coliforms-and...,"June 27, 2024",We are so delighted to welcome Ruby Chin to ou...,"foodborne disease, microbiology, water quality"
4,Bacteria,"bacteria, public-health",Diverse burden of foodborne disease,https://foodmicrobiology.academy/diverse-burde...,"May 26, 2024","Foodborne diseases, often referred to as foodb...","food microbiology, Food safety, food science, ..."


In [32]:
# Temporarily set max_colwidth to None to display full text for the row
with pd.option_context('display.max_colwidth', None):
    print(df.iloc[0])

category                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

In [33]:
len(df)

371

In [34]:
# Remove duplicate rows based on the 'title' column
df_unique = df.drop_duplicates(subset='title', keep='first')

In [36]:
len(df_unique)

158

### Scraping other pages (additional info that may be useful)

In [179]:
def scrape_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract the post title
        title = soup.find('h1', class_='post-title').get_text(strip=True)
        
        # Extract the content from the post-content div
        post_content_div = soup.find('div', class_='post-content entry-content')
        content = []

        # Iterate over all relevant tags within the post content
        for element in post_content_div.children:
            if element.name == 'p':
                content.append(element.get_text(strip=True))
            elif element.name == 'ul' or element.name == 'ol':  # Lists
                for li in element.find_all('li'):
                    content.append(f"• {li.get_text(strip=True)}")
            elif element.name == 'h2':
                content.append(f"## {element.get_text(strip=True)}")  # Sub-headings
            elif element.name == 'h3':
                content.append(f"### {element.get_text(strip=True)}")
            elif element.name == 'div' and 'wp-block-jetpack-contact-info' in element.get('class', []):
                # Handle contact info
                contact_info = element.get_text(strip=True, separator='\n')
                content.append(f"Contact Info:\n{contact_info}")
            elif element.name == 'figure' and 'wp-block-image' in element.get('class', []):
                # Handle images
                img_tag = element.find('img')
                if img_tag and img_tag['src']:
                    content.append(f"Image: {img_tag['src']}")
            elif element.name == 'div' and 'wp-block-jetpack-map' in element.get('class', []):
                # Handle map links
                map_link = element.find('a')
                if map_link and map_link['href']:
                    content.append(f"Map Location: {map_link['href']}")
            elif element.name == 'div' and 'jetpack-simple-payments-wrapper' in element.get('class', []):
                # Handle product details
                product_title = element.find('div', class_='jetpack-simple-payments-title').get_text(strip=True)
                product_description = element.find('div', class_='jetpack-simple-payments-description').get_text(strip=True)
                product_price = element.find('div', class_='jetpack-simple-payments-price').get_text(strip=True)
                purchase_link = element.find('a', class_='jetpack-simple-payments-purchase')['href']
                
                product_info = f"Product: {product_title}\nDescription: {product_description}\nPrice: {product_price}\nPurchase Link: {purchase_link}"
                content.append(product_info)
            elif element.name == 'blockquote' and 'wp-block-quote' in element.get('class', []):
                # Handle blockquotes
                quote = element.find('p').get_text(strip=True)
                citation = element.find('cite').get_text(strip=True) if element.find('cite') else None
                if citation:
                    content.append(f"Quote: {quote}\nCited: {citation}")
                else:
                    content.append(f"Quote: {quote}")

        return {'title': title, 'content': content}
    else:
        print(f"Failed to retrieve {url}")
        return None

In [180]:
def scrape_multiple_pages(base_url, page_paths):
    all_data = []
    for path in page_paths:
        url = f"{base_url}{path}"
        page_data = scrape_page(url)
        if page_data:
            all_data.append(page_data)
    return all_data

In [181]:
base_url = "https://foodmicrobiology.academy/"
page_paths = ["about-us", "consulting", "contact-us", "food-spoilage-yeast-reference-list", "shop",
             "our-team", "project-management", "research", "education-training", "about"] 

scraped_data = scrape_multiple_pages(base_url, page_paths)

In [197]:
additional_info = pd.DataFrame(scraped_data)

In [213]:
additional_info.head()

Unnamed: 0,title,content
0,About us,"[Fresher Healthier Safer., This is at the core..."
1,Consulting,[We offer specialised food preservation consul...
2,Contact us,[Contact Info:\nContact Dr Philip Button direc...
3,Food spoilage yeast reference list,"[References, 1999,Critical Controls for Juice ..."
4,Our shop,[Product: Science and business of food entrepr...


In [199]:
# New information to append
new_info = 'We are a dedicated team here to drive and inspire your creativity, innovation and passion. Dr Philip Button, Academy Director & Head of Research. Based in: Melbourne, Victoria, Australia. Expertise in theoretical research, academic and scholarly perspectives on all areas of food microbiology, although particularly in food quality and spoilage. Alma mater: The University of Melbourne. philip.button@foodmicrobiology.academy'

In [200]:
additional_info['content'][5] = new_info

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  additional_info['content'][5] = new_info


### Exporting the data files (csv format)

In [214]:
# Save the data to a CSV file
df.to_csv('articles_info.csv', index=False)
additional_info.to_csv('additional_info.csv', index=False)