In [1]:
!pip install requests beautifulsoup4 pandas selenium


Defaulting to user installation because normal site-packages is not writeable
Collecting selenium
  Downloading selenium-4.22.0-py3-none-any.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m78.1 kB/s[0m eta [36m0:00:00[0m00:01[0mm00:03[0m
Collecting trio~=0.17
  Downloading trio-0.26.0-py3-none-any.whl (475 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.7/475.7 KB[0m [31m15.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
Collecting typing_extensions>=4.9.0
  Downloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Collecting websocket-client>=1.8.0
  Downloading websocket_client-1.8.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 KB[0m [31m219.6 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting trio-websocket~=0.9
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting certifi>=2021.10.8
  Downloading certifi-2024.7.4-

# Web Scraping using Beautiful Soup
This Python script is designed to scrape data from a webpage, extract specific information from a table, and save the extracted data into a CSV file. The script leverages the requests library to fetch the webpage content, BeautifulSoup from the bs4 library to parse the HTML content, and pandas to handle the data and save it in a CSV format. Below is a detailed description of each part of the script.
URL : https://www.scrapethissite.com/pages/forms/

# Code Breakdown
## Importing Libraries

The script imports three essential libraries:

requests: Used to make HTTP requests to fetch the webpage content.
BeautifulSoup from bs4: Used to parse HTML and XML documents, facilitating the extraction of data from HTML tags.
pandas: Used for data manipulation and analysis, particularly for creating and handling dataframes, which are then saved as CSV files.

## Function to Extract Data

extract_data(url): This function takes a URL as an input and attempts to scrape data from a table on the webpage.
Sends an HTTP GET request to the specified URL.
Checks if the request was successful (HTTP status code 200).
Parses the webpage content using BeautifulSoup.
Finds the table with the class 'table' on the webpage.
If the table is found:
Defines the column headers for the data.
Initializes an empty list to store the extracted data.
Finds all rows in the table.
Iterates over each row in the table, skipping the header row.
Finds all columns in the current row.
Extracts and cleans text from each column.
Appends the row data to the data list.
Creates a pandas DataFrame from the data list with the specified headers.
Returns the DataFrame.
If the table is not found, prints a message and returns None.
If the request fails, prints an error message with the status code and returns None.

## Function to Save Data to CSV

save_to_csv(df, filename='scrapethissite_data.csv'): This function takes a DataFrame and a filename as input and saves the DataFrame to a CSV file.
Checks if the DataFrame is not None.
Saves the DataFrame to a CSV file with the specified filename, without writing row indices.
Prints a success message indicating the file has been saved.
If the DataFrame is None, prints a message indicating that there is no data to save.

## Main Function

main(): This function orchestrates the data extraction and saving process.

Specifies the URL of the webpage to scrape.
Calls the extract_data function to scrape data from the specified URL.
Calls the save_to_csv function to save the extracted data to a CSV file.
`if __name__ == "__main__":` checks whether the script is being run as the main module.

Calls the main function to execute the script.


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_data(url, timeout=10):
    """Scrape the first ``<table class="table">`` on a page into a DataFrame.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``pandas.DataFrame`` with one row per table row that contains
        ``<td>`` cells, using the fixed column names defined below and the
        whitespace-stripped cell text as (string) values. Returns ``None``,
        after printing a message, when the request fails, the status code is
        not 200, or no matching table exists.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code instead of escaping as a traceback.
        print(f"Failed to retrieve data from {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', class_='table')
    if table is None:
        print("Table not found on the page.")
        return None

    headers = ['Team Name', 'Year', 'Wins', 'Losses', 'OT Losses', 'Win %', 'Goals For (GF)', 'Goals Against (GA)', '+ / -']
    data = []

    # The first <tr> is treated as the header row and skipped; the column
    # names come from `headers` above, not from the page.
    for row in table.find_all('tr')[1:]:
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if not cells:
            # Rows without any <td> (e.g. extra header or spacer rows) would
            # otherwise become all-empty records in the DataFrame.
            continue
        data.append(cells)

    return pd.DataFrame(data, columns=headers)

def save_to_csv(df, filename='scrapethissite_data.csv'):
    """Write ``df`` to ``filename`` as CSV, or report that there is nothing to write.

    Args:
        df: DataFrame to persist, or ``None`` when no data is available.
        filename: Destination path of the CSV file.

    The row index is not written. A message describing the outcome is
    printed in both cases; nothing is returned.
    """
    # Guard clause: nothing to persist when the caller has no DataFrame.
    if df is None:
        print("No data to save.")
        return

    df.to_csv(filename, index=False)
    print(f"Data successfully saved to '{filename}'")

def main():
    """Download the table from the scrapethissite forms page and save it as CSV.

    The result of ``extract_data`` (a DataFrame, or ``None`` on failure) is
    handed straight to ``save_to_csv``, which writes to its default filename.
    """
    save_to_csv(extract_data('https://www.scrapethissite.com/pages/forms/'))

# Run the scrape when this code is executed directly. The recorded cell
# output ("Data successfully saved to ...") shows that this guard passes
# when the cell is run in the notebook.
if __name__ == "__main__":
    main()


  from pandas.core import (


Data successfully saved to 'scrapethissite_data.csv'
