<a href="https://colab.research.google.com/github/MarMarhoun/freelance_work/blob/main/side_projects/NLP_projs/eda_streamlit/webScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Web scraping and crawling using Streamlit

# Project description:

To build a web scraping and crawling tool with Streamlit, you can create a user interface that allows users to input search queries and filter results. Here's an example of how you can modify the existing code to incorporate Streamlit:

In [None]:
import os  # Module for interacting with the operating system
import time  # Module for time-related operations
import ujson  # Module for working with JSON data
from random import randint  # Module for generating random numbers
from typing import Dict, List, Any  # Type hinting imports

import requests  # Library for making HTTP requests
from bs4 import BeautifulSoup  # Library for parsing HTML data
from selenium import webdriver  # Library for browser automation
from selenium.common.exceptions import NoSuchElementException  # Exception for missing elements
from webdriver_manager.chrome import ChromeDriverManager  # Driver manager for Chrome (We are using Chromium based )
import streamlit as st  # Streamlit library for building web apps

def _build_chrome_driver():
    """Create a headless, incognito Chrome driver with driver logging suppressed."""
    webOpt = webdriver.ChromeOptions()
    webOpt.add_experimental_option('excludeSwitches', ['enable-logging'])
    webOpt.add_argument('--ignore-certificate-errors')
    webOpt.add_argument('--incognito')
    # The `headless` attribute is deprecated/removed in recent Selenium
    # releases; the command-line switch works regardless of version.
    webOpt.add_argument('--headless')

    driver_path = ChromeDriverManager().install()
    try:
        # Selenium 4 style: the driver path is wrapped in a Service object.
        from selenium.webdriver.chrome.service import Service
        return webdriver.Chrome(service=Service(driver_path), options=webOpt)
    except (ImportError, TypeError):
        # Older Selenium: the driver path is the first positional argument.
        return webdriver.Chrome(driver_path, options=webOpt)


def _collect_profile_links(driver, max_profiles):
    """Walk the paginated listing loaded in *driver* and return profile URLs."""
    from urllib.parse import urljoin

    from selenium.webdriver.common.by import By

    links = []  # Array with pureportal profiles URL
    while True:
        # Parse the current listing page and pull the href of every profile anchor
        bs = BeautifulSoup(driver.page_source, "lxml")
        for anchor in bs.find_all('a', class_='link person'):
            href = anchor.get('href')
            if href:
                # urljoin leaves absolute URLs untouched and resolves relative ones
                links.append(urljoin(driver.current_url, href))

        # Check if the maximum number of profiles is reached
        if len(links) >= max_profiles:
            break

        # Click on Next button to visit next page; stop when there is none
        try:
            next_button = driver.find_element(By.CSS_SELECTOR, ".nextLink")
        except NoSuchElementException:
            break
        if not next_button.is_enabled():
            break
        driver.execute_script("arguments[0].click();", next_button)

    # A page can push the count past the limit, so trim to the requested maximum
    return links[:int(max_profiles)]


def _scrape_profile_publications(driver, profile_url, pub_data):
    """Append the publications listed for one profile to *pub_data*."""
    from selenium.webdriver.common.by import By

    driver.get(profile_url)
    buttons = driver.find_elements(By.CSS_SELECTOR, ".portal_link.btn-primary.btn-large")
    for button in buttons:
        if "research output" not in button.text.lower():
            continue

        driver.execute_script("arguments[0].click();", button)
        driver.get(driver.current_url)
        # Get name of Author
        author = driver.find_element(
            By.CSS_SELECTOR, "div[class='header person-details']>h1").text
        # A timeout keeps one unresponsive page from stalling the whole crawl
        r = requests.get(driver.current_url, timeout=30)
        # Parse all the data via BeautifulSoup
        soup = BeautifulSoup(r.content, 'lxml')

        # Extracting publication name, publication url, date and CU Authors
        table = soup.find('ul', attrs={'class': 'list-results'})
        if table is not None:
            for row in table.find_all('div', attrs={'class': 'result-container'}):
                title_link = row.h3.a if row.h3 is not None else None
                if title_link is None:
                    continue  # Not a publication entry
                date = row.find("span", class_="date")
                data = {
                    'name': title_link.text,
                    'pub_url': title_link.get('href'),
                    'cu_author': author,
                    'date': date.text if date is not None else "",
                }
                print("Publication Name :", data['name'])
                print("Publication URL :", data['pub_url'])
                print("CU Author :", data['cu_author'])
                print("Date :", data['date'])
                print("\n")
                pub_data.append(data)

        # The click navigated away, so the remaining button handles are stale
        break


def initCrawlerScraper(seed: str, max_profiles: int = 500) -> List[Dict[str, Any]]:
    """Crawl a paginated profile listing and scrape each profile's publications.

    Starting from ``seed``, follows the ``.nextLink`` pagination control and
    collects the profile URLs found in ``a.link.person`` anchors. Each profile
    is then visited and the entries of its "research output" page are scraped.

    Side effects: passes the profile URLs to ``write_authors`` with the file
    name ``Authors_URL.txt`` and writes the scraped records to
    ``scraper_results.json``.

    Args:
        seed: URL of the first listing page.
        max_profiles: Upper bound on the number of profiles to visit.

    Returns:
        A list of dicts with the keys ``name``, ``pub_url``, ``cu_author`` and
        ``date``, one per scraped publication.
    """
    pub_data = []  # To store publication information for each pureportal profile
    driver = _build_chrome_driver()
    try:
        driver.get(seed)  # Start with the original link
        print("Crawler has begun...")
        links = _collect_profile_links(driver, max_profiles)

        print("Crawler has found ", len(links), " pureportal profiles")
        # NOTE(review): write_authors is not defined in the visible source;
        # it is expected to be provided elsewhere.
        write_authors(links, 'Authors_URL.txt')  # Write the authors' URLs to a file

        print("Scraping publication data for ", len(links), " pureportal profiles...")
        for link in links:
            time.sleep(1)  # Be polite: pause between profile requests
            try:
                _scrape_profile_publications(driver, link, pub_data)
            except Exception as exc:
                # Best effort: report the failing profile and carry on
                print("Skipping profile", link, "due to error:", exc)
    finally:
        # Always release the browser, even if crawling failed part-way
        driver.quit()

    print("Crawler has scrapped data for ", len(pub_data), " pureportal publications")
    # Writing all the scraped results in a file with JSON format
    with open('scraper_results.json', 'w') as f:
        ujson.dump(pub_data, f)
    return pub_data

def main():
    """Render the Streamlit UI and show the scraper's results.

    The text entered in the search box is passed to ``initCrawlerScraper`` as
    its seed, and the numeric input as its profile limit. Each scraped
    publication is rendered as a markdown link.
    """
    st.title("Web Scraping and Crawling with Streamlit")

    search_query = st.text_input("Enter search query:")
    max_results = st.number_input(
        "Maximum number of results:", min_value=1, value=10, step=1)

    if not st.button("Search"):
        return
    if not search_query:
        st.warning("Please enter a search query.")
        return

    try:
        pub_data = initCrawlerScraper(search_query, int(max_results))
    except Exception as exc:
        # Top-level UI boundary: show the failure instead of a raw traceback
        st.error(f"Scraping failed: {exc}")
        return

    if pub_data is None:
        # The scraper also persists its records to disk; fall back to that
        # file when no list is returned.
        try:
            with open('scraper_results.json') as f:
                pub_data = ujson.load(f)
        except (OSError, ValueError):
            pub_data = []

    st.write("Results:")
    if not pub_data:
        st.info("No publications found.")
    for item in pub_data:
        if isinstance(item, dict):
            # Scraper records carry the title under 'name' and the link under 'pub_url'
            st.markdown(f"[{item.get('name')}]({item.get('pub_url')})")
        else:
            st.markdown(f"[{item}]({item})")


if __name__ == "__main__":
    main()

# Web scraping and crawling using Flask and Django

To create a web scraping and crawling application using Python, Flask, Beautiful Soup, and Requests, follow the steps below:

Install the required libraries:

In [None]:
!pip install Flask beautifulsoup4 requests

Create a new Flask application (app.py):


In [None]:
from flask import Flask, render_template, request, url_for
import requests
from bs4 import BeautifulSoup

app = Flask(__name__)


@app.route('/', methods=['GET', 'POST'])
def index():
    """Show the scrape form and, on POST, the scraped title and description.

    On POST the submitted ``url`` is fetched and the text of its first ``<h1>``
    and the ``content`` of its description ``<meta>`` tag are passed to the
    template. Fetch failures and missing elements are reported through the same
    ``title``/``description`` variables instead of raising.
    """
    if request.method != 'POST':
        return render_template('index.html')

    url = request.form['url']
    # NOTE: this fetches an arbitrary user-supplied URL from the server (SSRF
    # risk). Restrict the allowed hosts before exposing this publicly.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        return render_template('index.html', url=url,
                               title='Could not fetch the URL',
                               description=str(exc))

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract data using Beautiful Soup (replace with your own selectors).
    # select_one returns None when nothing matches, so guard both lookups.
    heading = soup.select_one('h1')
    title = heading.text.strip() if heading is not None else ''
    meta = soup.select_one('meta[name=description]')
    description = (meta.get('content') or '').strip() if meta is not None else ''

    return render_template('index.html', url=url,
                           title=title or 'No title found',
                           description=description or 'No description found')


if __name__ == '__main__':
    # debug=True enables the interactive debugger; use for local development only
    app.run(debug=True)

Create an HTML template (templates/index.html):

In [None]:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Web Scraping and Crawling with Flask</title>
</head>
<body>
    <h1>Web Scraping and Crawling with Flask</h1>

    <form method="post">
        {# Views built on Flask-WTF pass `form`; its hidden CSRF field has to be
           posted back, otherwise validate_on_submit() rejects the request. #}
        {% if form %}{{ form.hidden_tag() }}{% endif %}
        <label for="url">Enter URL:</label>
        <!-- type="url" lets the browser reject values that are not absolute URLs -->
        <input type="url" name="url" id="url" placeholder="https://example.com" required>
        <button type="submit">Scrape</button>
    </form>

    {# Only rendered once a page has been scraped #}
    {% if title %}
    <h2>{{ title }}</h2>
    <p>{{ description }}</p>
    {% endif %}
</body>
</html>

In [None]:
# Run the Flask application:

!python app.py

Access the application in your web browser at http://127.0.0.1:5000/. Enter a URL to scrape and click "Scrape". The application will display the title and description of the provided URL.

This is a basic example of web scraping and crawling using Flask, Beautiful Soup, and Requests. You can further customize the code to extract more data or implement additional features like following links or handling JavaScript-rendered content.

### Advanced Features and Enhancements
For advanced features and enhancements, you can consider the following options:

Handle different content types (HTML, JSON, XML) using different parsing techniques.

Store scraped data in a database (e.g., SQLite, MySQL, MongoDB) for future use.

Implement web crawling techniques to follow links and recursively scrape data.

Add authentication and authorization features to restrict access to the scraping functionality.

Enhance the user interface using front-end libraries (e.g., Bootstrap, React, Vue.js) and improve the user experience.

Optimize the performance of the application by using asynchronous techniques (e.g., with asyncio and aiohttp) or parallel processing (e.g., with concurrent.futures).

Consider implementing unit tests and integration tests to ensure the stability and reliability of the application.

Remember to replace the example selectors used in the code with your own specific selectors for the target websites. Additionally, be aware of the legality and ethical considerations when web scraping, and ensure that you are complying with any applicable laws and terms of service.

In [None]:
from flask import Flask, render_template, request, url_for
import requests
from bs4 import BeautifulSoup
from flask_wtf import FlaskForm
from wtforms import StringField, SubmitField
from wtforms.validators import DataRequired, URL

import os

app = Flask(__name__)
# Read the signing key from the environment; the literal is only a development
# fallback and must not be used in production.
app.config['SECRET_KEY'] = os.environ.get('SECRET_KEY', 'your-secret-key')


class UrlForm(FlaskForm):
    """Form with a single required URL field and a submit button."""

    url = StringField('url', validators=[DataRequired(), URL()])
    submit = SubmitField('Scrape')


@app.route('/', methods=['GET', 'POST'])
def index():
    """Validate the submitted URL, scrape it and render the result.

    On a valid submission the URL is fetched and the text of its first ``<h1>``
    and the ``content`` of its description ``<meta>`` tag are passed to the
    template. Fetch failures and missing elements are reported through the same
    ``title``/``description`` variables instead of raising.
    """
    form = UrlForm()
    if not form.validate_on_submit():
        return render_template('index.html', form=form)

    url = form.url.data
    # NOTE: this fetches an arbitrary user-supplied URL from the server (SSRF
    # risk). Restrict the allowed hosts before exposing this publicly.
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        return render_template('index.html', form=form,
                               title='Could not fetch the URL',
                               description=str(exc))

    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract data using Beautiful Soup (replace with your own selectors).
    # select_one returns None when nothing matches, so guard both lookups.
    heading = soup.select_one('h1')
    title = heading.text.strip() if heading is not None else ''
    meta = soup.select_one('meta[name=description]')
    description = (meta.get('content') or '').strip() if meta is not None else ''

    # Keep passing the form so the result page can be submitted again
    return render_template('index.html', form=form,
                           title=title or 'No title found',
                           description=description or 'No description found')


if __name__ == '__main__':
    # debug=True enables the interactive debugger; use for local development only
    app.run(debug=True)

# Web scraping and crawling using Django

For web scraping and crawling with Django, you can create a Django application that uses Scrapy to scrape data and integrates it into Django views that return the scraped data as JSON responses. Here's a step-by-step guide:

Create a new Django project and app:

In [None]:
!django-admin startproject myproject
%cd myproject
!python manage.py startapp scrapers

Install Scrapy as a dependency:


In [None]:
!pip install scrapy

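Create a new Scrapy spider for scraping data. For example, to scrape data from the Hacker News website, create a file named hacker_news_spider.py inside a spiders package (a spiders/ directory containing an __init__.py) in the scrapers app, so that it can be imported as scrapers.spiders.hacker_news_spider: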

In [None]:
import scrapy

class HackerNewsSpider(scrapy.Spider):
    """Spider yielding title, URL and vote count for Hacker News stories.

    Starts at the front page and follows the "more" link for pagination.
    """

    name = "hacker_news"
    start_urls = [
        "https://news.ycombinator.com/",
    ]

    def parse(self, response):
        """Yield one item per ``tr.athing`` row, then follow the next page.

        ``votes`` is ``None`` when no score can be found for a row.
        """
        for article in response.css("tr.athing"):
            link = article.css("a.storylink")
            score = article.css("span.score::text").re_first(r"\d+")
            # NOTE(review): the fallbacks below assume the title link may sit
            # under span.titleline and the score in the row that follows
            # tr.athing -- confirm against the live page markup.
            if not link:
                link = article.css("span.titleline > a")
            if score is None:
                score = (article.xpath("following-sibling::tr[1]")
                         .css("span.score::text").re_first(r"\d+"))
            yield {
                "title": link.xpath("text()").get(),
                "url": link.xpath("@href").get(),
                # re_first returns None when nothing matches; int(None) would raise
                "votes": int(score) if score is not None else None,
            }
        next_page = response.css("a.morelink::attr(href)").get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

Create a Django view that will execute the Scrapy spider and return the scraped data as a JSON response:

In [None]:
from django.http import JsonResponse
from scrapy.crawler import CrawlerProcess
from scrapers.spiders.hacker_news_spider import HackerNewsSpider

def _run_hacker_news_crawl(feed_path):
    """Run HackerNewsSpider to completion, exporting its items to *feed_path*."""
    process = CrawlerProcess(settings={
        "FEEDS": {
            # overwrite: start from an empty file so repeated runs stay valid JSON
            feed_path: {"format": "json", "overwrite": True},
        },
    })
    process.crawl(HackerNewsSpider)
    process.start()


def scrape_hacker_news(request):
    """Run the Hacker News spider and return the scraped items as JSON.

    The crawl runs in a child process because the crawler's reactor cannot be
    started a second time in the same process, which would break every request
    after the first one.

    Returns:
        A ``JsonResponse`` holding the list of scraped items, or a 500 response
        with an ``error`` key when the crawl process fails.
    """
    import json
    from multiprocessing import Process

    feed_path = "items.json"
    worker = Process(target=_run_hacker_news_crawl, args=(feed_path,))
    worker.start()
    worker.join()
    if worker.exitcode != 0:
        return JsonResponse({"error": "Scrapy crawl failed"}, status=500)

    try:
        with open(feed_path, "r", encoding="utf-8") as f:
            # Decode first: handing the raw text to JsonResponse would encode
            # the JSON a second time and send it as one big string.
            data = json.load(f)
    except (FileNotFoundError, ValueError):
        # No readable feed file, e.g. nothing was scraped
        data = []
    return JsonResponse(data, safe=False)

Add a URL route for the new view:


In [None]:
from django.urls import path
from .views import scrape_hacker_news

# Route table: maps "scrape-hacker-news/" to the scrape_hacker_news view.
urlpatterns = [path("scrape-hacker-news/", scrape_hacker_news, name="scrape_hacker_news")]

Now, you can visit the URL /scrape-hacker-news/ in your Django application to execute the Scrapy spider and return the scraped data as a JSON response.

For more advanced use cases, you can consider using Celery or Django-Crontab to schedule and run the Scrapy spiders in the background. Additionally, you can use Django's ORM to store the scraped data in a database and create more sophisticated views and templates for displaying the data.

