<a href="https://colab.research.google.com/github/Musa-Ali-Kazmi/Web-Scraping-and-PDF-Data-Extraction/blob/main/WebScraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tabula-py

In [None]:
from tabula import read_pdf

import pymysql
from sqlalchemy import create_engine


In [None]:
# Step 1: Install Necessary Libraries
!pip install tabula-py requests beautifulsoup4 pandas sqlalchemy Flask openpyxl

# Step 2: PDF Data Extraction using Scraping API
import tabula

def extract_data_from_pdf(pdf_path):
    """Extract every table from the PDF at *pdf_path* and return the cleaned result.

    The tables are read with ``tabula.read_pdf`` across all pages and then
    handed to ``process_and_clean_data``; whatever that function returns is
    returned unchanged.
    """
    raw_tables = tabula.read_pdf(pdf_path, multiple_tables=True, pages='all')
    return process_and_clean_data(raw_tables)

# Step 3: Web Scraping for Additional Data
import requests
from bs4 import BeautifulSoup

def scrape_additional_data(url, timeout=10):
    """Fetch *url* and return the additional data extracted from its HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        Whatever ``extract_additional_data`` returns for the parsed page.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    response = requests.get(url, timeout=timeout)

    # Fail loudly on error responses instead of parsing an error page as if
    # it were the expected document.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    return extract_additional_data(soup)

def extract_additional_data(soup):
    """Return the text of the first ``<div class="additional-info">`` in *soup*.

    Args:
        soup: Parsed document exposing ``find(name, class_=...)``; in this
            file it is the ``BeautifulSoup`` object built by
            ``scrape_additional_data``.

    Returns:
        The matched element's ``get_text()`` result, or ``None`` when the
        document contains no such element.
    """
    info_div = soup.find('div', class_='additional-info')
    if info_div is None:
        # find() yields None when nothing matches; calling get_text() on it
        # would raise an opaque AttributeError, so report "no data" instead.
        return None
    return info_div.get_text()

# Step 4: Data Transformation
import pandas as pd

def process_and_clean_data(tables):
    """Combine the extracted tables into a single DataFrame.

    Args:
        tables: Collection of ``pandas.DataFrame`` objects, e.g. the list
            produced by ``tabula.read_pdf(..., multiple_tables=True)``.

    Returns:
        One DataFrame with the rows of every table stacked in order and a
        fresh 0..n-1 index. An empty DataFrame is returned when *tables*
        contains nothing.
    """
    # pd.concat raises ValueError("No objects to concatenate") on an empty
    # collection, which is what a PDF without detectable tables produces.
    if hasattr(tables, '__len__') and len(tables) == 0:
        return pd.DataFrame()

    unified_data = pd.concat(tables, ignore_index=True)

    # Add additional cleaning or processing steps as needed

    return unified_data

# Step 5: Database API Interaction

def store_data_in_database(dataframe, db_url):
    """Write *dataframe* to the ``data_table`` table of the database at *db_url*.

    Any existing ``data_table`` is replaced (``if_exists='replace'``).

    Args:
        dataframe: ``pandas.DataFrame`` to persist.
        db_url: SQLAlchemy connection URL passed to ``create_engine``.
    """
    engine = create_engine(db_url)
    try:
        dataframe.to_sql('data_table', engine, if_exists='replace')
    finally:
        # A fresh engine (with its own connection pool) is created on every
        # call, so release its connections explicitly instead of leaking them.
        engine.dispose()

# Step 6: API Endpoint for Data Storage
from flask import Flask, request, jsonify
import os

# Flask application object; the @app.route decorators below register their
# request handlers on it.
app = Flask(__name__)

@app.route('/store_data', methods=['POST'])
def store_data():
    """Store the JSON records sent in the request body in the database.

    The body must be a JSON object (one record) or a non-empty JSON array of
    objects (several records). The records are converted to a DataFrame
    because ``store_data_in_database`` calls ``to_sql`` on its argument.

    Returns:
        200 with a success message once the data has been written,
        400 when the body is not usable JSON of the expected shape,
        500 when the ``DB_URL`` environment variable is not set.
    """
    # silent=True yields None for a missing or malformed JSON body so that
    # the shape check below can answer with a descriptive 400.
    payload = request.get_json(silent=True)
    records = [payload] if isinstance(payload, dict) else payload

    if (
        not isinstance(records, list)
        or not records
        or not all(isinstance(record, dict) for record in records)
    ):
        return jsonify({'message': 'Request body must be a JSON object or a non-empty array of objects'}), 400

    db_url = os.environ.get('DB_URL')
    if not db_url:
        return jsonify({'message': 'DB_URL is not configured'}), 500

    # Store the received data in the database
    store_data_in_database(pd.DataFrame(records), db_url)

    return jsonify({'message': 'Data stored successfully'}), 200

# Step 7: Download Button for Excel File
@app.route('/download_excel', methods=['GET'])
def download_excel():
    """Return the stored data as a downloadable Excel file named ``data.xlsx``.

    The data is fetched with ``retrieve_data_from_database`` using the
    ``DB_URL`` environment variable, converted with ``convert_to_excel`` and
    sent back as an attachment.

    NOTE(review): ``retrieve_data_from_database`` and ``convert_to_excel`` are
    not defined in the visible part of this file -- confirm they are provided
    elsewhere before this endpoint is used.
    """
    # The file-level flask import only brings in Flask, request and jsonify,
    # so send_file has to be imported here to be in scope.
    from flask import send_file

    # Retrieve data from the database
    data = retrieve_data_from_database(os.environ.get('DB_URL'))

    # Convert data to Excel file
    excel_file = convert_to_excel(data)

    # send_file's 'attachment_filename' keyword was renamed to
    # 'download_name' in Flask 2.0 and later removed.
    return send_file(excel_file, as_attachment=True, download_name='data.xlsx')

# Step 8: Error Handling and Logging
import logging

# Configure the root logger to write records of level INFO and above to the
# file 'app.log'. NOTE: basicConfig has no effect if the root logger already
# has handlers attached (e.g. set up by the hosting environment).
logging.basicConfig(filename='app.log', level=logging.INFO)

# Step 9: Documentation
# README file explaining how to run the script, including dependencies and API endpoints

# Step 10: Testing
# Test your script thoroughly with different PDF structures, web pages, and database interactions

# Step 11: Finalization
# Ensure all code is well-documented, package the script and dependencies, and share with stakeholders
