# **Classification of Documents Using Graph-Based Features and KNN**

## **1. Food Data Collection and Preparation:**
Collect or create 15 pages of text for each of the three assigned topics, ensuring each
page contains approximately 500 words.

**Import necessary libraries**

In [1]:
import requests
from bs4 import BeautifulSoup
import os
import json
import csv

**Function to scrape article links from a given listing-page URL**

In [2]:
def scrape_articles_links(url, timeout=10):
    """Collect the links found in the article list of a listing page.

    Downloads ``url``, locates the ``<section class="o-ListArticle">`` element
    and returns the ``href`` of every anchor inside it, each prefixed with
    ``"https:"``.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A list of link strings, or ``None`` when the page could not be
        downloaded or parsed (the error is printed, not raised).
    """
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        articles = soup.find('section', class_='o-ListArticle')
        if articles is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError("no 'o-ListArticle' section found on the page")
        # The prefix is applied to every href unchanged, so an href of "#"
        # becomes "https:#".
        # NOTE(review): presumably the site serves protocol-relative hrefs
        # ("//host/path"); an absolute href would get a doubled scheme - confirm.
        return ["https:" + a['href'] for a in articles.find_all('a', href=True)]
    except Exception as e:
        # Best-effort scraping: report the problem and signal failure with None.
        print(f"Error scraping links from {url}: {e}")
        return None

**Function to process links**

In [3]:
def process_links(articles_links):
    """Filter the raw links of a listing page down to article links.

    Only every second entry (indices 0, 2, 4, ...) is considered.
    NOTE(review): presumably each article is linked twice on the listing
    page, which is why alternate entries are skipped - confirm.

    Collection stops at the first considered entry that is either the
    placeholder ``"https:#"`` or a pagination link (one starting with
    ``https://www.foodnetwork.com/healthy/articles/p``).

    Args:
        articles_links: List of link strings, or ``None``/empty when the
            listing page could not be scraped.

    Returns:
        A list with the retained links; empty when there is nothing to process.
    """
    # scrape_articles_links returns None on failure; treat it like "no links"
    # instead of raising TypeError on len(None).
    if not articles_links:
        return []

    pagination_prefix = "https://www.foodnetwork.com/healthy/articles/p"
    links = []
    for link in articles_links[::2]:
        if link == "https:#" or link.startswith(pagination_prefix):
            break
        links.append(link)
    return links

**Function to scrape data from a given URL**

In [4]:
def scrape_data(url, timeout=10):
    """Download one article page and extract its title, body and word count.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A dict with the keys ``'title'``, ``'body'`` and ``'words_count'``;
        an empty dict when the page has no ``<article class="article-content">``
        element; ``None`` when downloading or parsing fails (the error is
        printed, not raised).
    """
    article = {}
    try:
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')
        article_content = soup.find('article', class_='article-content')
        if article_content:
            title_node = article_content.find('div', class_='assetTitle')
            article['title'] = title_node.get_text().replace('\n', '')
            body = article_content.find('div', class_='article-body')
            paras = body.find_all('div', class_='customRTE smartbody-core section')
            # Newlines are removed, not replaced by spaces, so text that was
            # separated only by a line break ends up joined together.
            article['body'] = ' '.join(p.get_text() for p in paras).replace('\n', '')
            # Word count = number of whitespace-separated tokens in the body.
            article['words_count'] = len(article['body'].split())
        return article
    except Exception as e:
        # Best-effort scraping: a missing title/body node or a network error
        # is reported and signalled with None.
        print(f"Error scraping data from {url}: {e}")
        return None

**Function to save scraped data (a list of dictionaries) to a file in JSON format**

In [5]:
def save_to_json(data, filename):
    """Write ``data`` to ``filename`` as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than escaped. Any error is
    printed instead of raised; the function always returns ``None``.

    Args:
        data: JSON-serialisable object to store.
        filename: Path of the file to create or overwrite.
    """
    dump_options = {'ensure_ascii': False, 'indent': 4}
    try:
        with open(filename, mode='w', encoding='utf-8') as out_file:
            json.dump(data, out_file, **dump_options)
        print(f"Data saved to {filename}")
    except Exception as err:
        print(f"Error saving data to {filename}: {err}")

**Function to save scraped data (a list of dictionaries) to a file in CSV format**

In [12]:
def save_to_csv(data, filename):
    """Write the articles in ``data`` to ``filename`` as a UTF-8 CSV file.

    The file gets a header row followed by one row per article with the
    columns ``label``, ``title``, ``body`` and ``words_count``; other keys of
    an article are not written. Any error (including a missing key) is
    printed instead of raised; the function always returns ``None``.

    Args:
        data: Iterable of dicts, each providing the four columns above.
        filename: Path of the file to create or overwrite.
    """
    columns = ['label', 'title', 'body', 'words_count']
    try:
        with open(filename, 'w', newline='', encoding='utf-8') as out_file:
            writer = csv.DictWriter(out_file, fieldnames=columns)
            writer.writeheader()
            for record in data:
                # Pick exactly the exported columns from each article.
                writer.writerow({column: record[column] for column in columns})
        print(f"Data saved to {filename}")
    except Exception as err:
        print(f"Error saving data to {filename}: {err}")

**Function to scrape articles**

In [10]:
def scrape_articles(url_base, pages, min_articles, label='Food'):
    """Scrape articles of more than 500 words from paginated listing pages.

    Visits ``{url_base}/p/1`` up to ``{url_base}/p/{pages}``, follows the
    article links of each listing page and keeps every article whose word
    count exceeds 500. Scraping stops as soon as ``min_articles`` articles
    have been collected.

    Args:
        url_base: Listing URL without the ``/p/<n>`` pagination suffix.
        pages: Number of listing pages to visit at most.
        min_articles: Number of kept articles after which scraping stops.
        label: Topic label stored with every article (defaults to ``'Food'``).

    Returns:
        A list of dicts, each holding ``'index'`` (1-based position),
        ``'label'`` and the fields returned by ``scrape_data``. The list may
        contain fewer than ``min_articles`` entries when the pages run out.
    """
    articles_data = []
    for page in range(1, pages + 1):
        url = f"{url_base}/p/{page}"
        print("Page Link:" + url)
        articles_links = scrape_articles_links(url)
        if articles_links is None:
            # The listing page could not be scraped (already reported by the
            # helper); move on instead of aborting the whole run.
            continue
        for link in process_links(articles_links):
            data = scrape_data(link)
            # scrape_data yields None or {} for unusable pages; both are skipped.
            if not data or data.get('words_count', 0) <= 500:
                continue
            index = len(articles_data) + 1
            print(f"Article {index} - {data['title']} - {data['words_count']} words")
            articles_data.append({'index': index, 'label': label, **data})
            if len(articles_data) >= min_articles:
                return articles_data
    return articles_data

**Main script: scrape the food articles and save them as JSON and CSV**

In [13]:
# Base listing URL; scrape_articles appends "/p/<page number>" to it
food_url = 'https://www.foodnetwork.com/healthy/articles'
pages = 5  # Maximum number of listing pages to visit
min_articles = 15  # Scraping stops once this many articles have been collected

# Scrape the articles (only articles with more than 500 words are kept)
articles_data = scrape_articles(food_url, pages, min_articles)

# Make sure the output directory exists before writing (no error if it already does)
os.makedirs("articles", exist_ok=True)
json_file = 'articles/food_articles.json'
csv_file = 'articles/food_articles.csv'

# Save the collected articles to the JSON file
save_to_json(articles_data, json_file)
# Save the same articles to the CSV file (label, title, body, words_count columns)
save_to_csv(articles_data, csv_file)

Page Link:https://www.foodnetwork.com/healthy/articles/p/1
Article 1 - Canola vs. Vegetable Oil: What’s the Difference? - 804 words
Article 2 - The Truth Behind These Super-Common Sports Nutrition Myths - 747 words
Article 3 - The Ultimate Healthy Cooking Playlist - 518 words
Article 4 - The FDA Just Approved a Major Health Claim for Yogurt - 694 words
Article 5 - Zero Waste: 7 Ways You Can Protect the Environment When You Shop - 615 words
Article 6 - Are We Drinking Too Much? New Guidelines Suggest We Might Be - 609 words
Page Link:https://www.foodnetwork.com/healthy/articles/p/2
Article 7 - The Chef's Take: Grains and Egg Bowl from Camille Becerra - 627 words
Article 8 - The Chef's Take: Lentil, Avocado and Kale Salad from Franklin Becker - 563 words
Article 9 - Are You a Healthy Snacker? - 504 words
Article 10 - Market Watch: Pomegranates - 506 words
Article 11 - Ask The Experts: 11 Healthy Cooking Mistakes - 1149 words
Article 12 - Taste Test: Fast Food Oatmeal - 793 words
Article 