In [4]:
import requests
from bs4 import BeautifulSoup
from time import sleep
from random import choice
import pandas as pd
import csv

In [24]:
# List to store all quotes
all_quotes = []

# Base URL for the quotes website
base_url = "http://quotes.toscrape.com/"

# Start scraping from the first page
url = "page/1"

# Loop through all pages
while url:
    # Send a GET request to the page
    res = requests.get(f"{base_url}{url}")
    
    # Parse the content of the page
    soup = BeautifulSoup(res.text, "html.parser")
    
    # Find all quote elements
    quotes = soup.find_all(class_="quote")
    
    # Extract quote data
    for quote in quotes:
        text = quote.find(class_="text").get_text()
        author = quote.find(class_="author").get_text()
        tags = [tag.get_text() for tag in quote.find_all(class_="tag")]
        
        # Append the data to the list
        all_quotes.append({
            "text": text,
            "author": author,
            "tags": ", ".join(tags)  # Join tags into a single string
        })
    
    # Find the next page
    next_btn = soup.find(class_="next")
    url = next_btn.find("a")["href"] if next_btn else None

# Save the scraped data to a CSV file
with open("quotes.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["text", "author", "tags"])
    writer.writeheader()
    writer.writerows(all_quotes)

print(f"Scraped {len(all_quotes)} quotes and saved to 'quotes.csv'")

Scraped 100 quotes and saved to 'quotes.csv'


In [43]:
df=pd.read_csv('quotes.csv')
print(df.isnull().sum())
df1=df.dropna(subset=["tags"])
print(df1.isnull().sum())

print(df1.head())

text      0
author    0
tags      3
dtype: int64
text      0
author    0
tags      0
dtype: int64
                                                text           author  \
0  “The world as we have created it is a process ...  Albert Einstein   
1  “It is our choices, Harry, that show what we t...     J.K. Rowling   
2  “There are only two ways to live your life. On...  Albert Einstein   
3  “The person, be it gentleman or lady, who has ...      Jane Austen   
4  “Imperfection is beauty, madness is genius and...   Marilyn Monroe   

                                           tags  
0        change, deep-thoughts, thinking, world  
1                            abilities, choices  
2  inspirational, life, live, miracle, miracles  
3              aliteracy, books, classic, humor  
4                    be-yourself, inspirational  


In [7]:
def scrape_wikipedia(url):
    # Send a GET request to the Wikipedia page
    response = requests.get(url)
    
    if response.status_code == 200:
        # Parse the page using BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract the title of the page
        title = soup.find("h1", {"id": "firstHeading"}).get_text()
        
        # Extract the first paragraph (usually the introduction)
        first_paragraph = soup.find("p").get_text()
        
        # Extract all links inside the main content (excluding external links)
        links = []
        for link in soup.find_all("a", href=True):
            href = link['href']
            if href.startswith("/wiki/"):  # Only internal links (Wikipedia pages)
                links.append(link.get_text())
        
        # Return the data in the specified format
        return {
            "title": title,
            "first_paragraph": first_paragraph,
            "links": ", ".join(links)  # Join links with commas
        }
    else:
        return None

# List of Wikipedia pages to scrape
urls = [
    "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "https://en.wikipedia.org/wiki/Java_(programming_language)",
    "https://en.wikipedia.org/wiki/C_(programming_language)",
    "https://en.wikipedia.org/wiki/C_Sharp_(programming_language)"
]

# List to store all scraped data
data = []

# Scrape each page
for url in urls:
    scraped_data = scrape_wikipedia_data(url)
    if scraped_data:
        data.append(scraped_data)

# Display the scraped data
print(data)

wikifile="wikipedia.csv"
fields = ["title", "first_paragraph", "links"]

with open(wikifile, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fields)
    writer.writeheader()
    writer.writerows(data)

print(f"Data successfully written to {wikifile}")


[{'title': 'Python (programming language)', 'first_paragraph': '\n', 'links': 'Main page, Contents, Current events, Random article, About Wikipedia, Help, Learn to edit, Community portal, Recent changes, Upload file, \n\n\n\n\n\n, \nSearch\n, learn more, Contributions, Talk, Article, Talk, Read, Read, What links here, Related changes, Upload file, Special pages, Python (genus), , Paradigm, Multi-paradigm, object-oriented, procedural, imperative, functional, structured, reflective, Designed\xa0by, Guido van Rossum, Developer, Python Software Foundation, Stable release, Typing discipline, duck, dynamic, strong, optional type annotations, OS, Linux, macOS, Windows, WebAssembly, Android, iOS, FreeBSD, Raspberry Pi OS, Unix-like, BSD, License, Python Software Foundation License, Filename extensions, implementations, CPython, PyPy, Stackless Python, MicroPython, CircuitPython, IronPython, Jython, Dialects, Cython, RPython, Starlark, ABC, Ada, ALGOL 68, APL, C, C++, CLU, Dylan, Haskell, Icon,

In [14]:
df=pd.read_csv("wikipedia.csv")
print(df.head())
print("---------------------------------------------------------------------------------------------")
df['first_paragraph'] = df['first_paragraph'].str.replace("\n", " Not Applicable ", regex=False)
print(df)

                            title                           first_paragraph  \
0   Python (programming language)                                        \n   
1     Java (programming language)                                        \n   
2        C (programming language)  This is an accepted version of this page   
3  C Sharp (programming language)                                        \n   

                                               links  
0  Main page, Contents, Current events, Random ar...  
1  Main page, Contents, Current events, Random ar...  
2  Main page, Contents, Current events, Random ar...  
3  Main page, Contents, Current events, Random ar...  
---------------------------------------------------------------------------------------------
                            title                           first_paragraph  \
0   Python (programming language)                           Not Applicable    
1     Java (programming language)                           Not Applicable   

In [5]:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd

# Fetch the HTML content
html = urlopen("https://en.wikipedia.org/wiki/Comparison_of_programming_languages")
soup = BeautifulSoup(html, "html.parser")

# Locate the first wikitable
table = soup.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")

# Write to CSV with UTF-8 encoding
output_file = "language.csv"
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    for i in rows:
        row = []
        for cell in i.findAll(["td", "th"]):
            row.append(cell.get_text(strip=True))  # Strip extra whitespace
        writer.writerow(row)

# Read the CSV with pandas
a = pd.read_csv(output_file, encoding="utf-8")
print(a.head())

                             Language  \
0  1C:Enterprise programming language   
1                        ActionScript   
2                                 Ada   
3                               Aldor   
4                            ALGOL 58   

                               Original purpose Imperative Object-oriented  \
0  Application,RAD, business,general,web,mobile        Yes              No   
1                  Application,client-side, web        Yes             Yes   
2          Application,embedded,realtime,system        Yes          Yes[2]   
3      Highlydomain-specific,symbolic computing        Yes             Yes   
4                                   Application        Yes              No   

  Functional Procedural Generic Reflective  \
0        Yes        Yes     Yes        Yes   
1        Yes        Yes      No         No   
2         No     Yes[3]  Yes[4]         No   
3        Yes         No      No         No   
4         No         No      No         No   

       

In [38]:
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch the webpage
url = "http://www.hubertiming.com/results/2017GPTR10K"
html = urlopen(url)
soup = BeautifulSoup(html, 'html.parser')

# Extract table rows
rows = soup.find_all('tr')

# Process rows into structured data
list_rows = []
for row in rows:
    cells = row.find_all(['td', 'th'])  # Include both data and header cells
    if cells:  # Only process non-empty rows
        list_rows.append([cell.get_text(strip=True) for cell in cells])

df = pd.DataFrame(list_rows)
print(df.head(6))
df.to_csv("gametiming.csv", index=False)

            0    1                   2       3         4      5      6  \
0  Finishers:  577                None    None      None   None   None   
1       Male:  414                None    None      None   None   None   
2     Female:  163                None    None      None   None   None   
3       Place  Bib                Name  Gender      City  State   Time   
4           1  814        JARED WILSON       M    TIGARD     OR  36:21   
5           2  573  NATHAN A SUSTERSIC       M  PORTLAND     OR  36:42   

          7             8  
0      None          None  
1      None          None  
2      None          None  
3  Gun Time          Team  
4     36:24                
5     36:45  INTEL TEAM F  


In [39]:
df1 = df.drop([0,1, 2], axis=0)
df1.reset_index(drop=True, inplace=True)

df1.columns = df1.iloc[0]
df1 = df1[1:].reset_index(drop=True)
print(df1.head())

#df1['Team'] = df1['Team'].fillna('Not Applicable')
df1['Team'] = df1['Team'].replace('', 'Not Applicable')
df1

0 Place  Bib                Name Gender       City State   Time Gun Time  \
0     1  814        JARED WILSON      M     TIGARD    OR  36:21    36:24   
1     2  573  NATHAN A SUSTERSIC      M   PORTLAND    OR  36:42    36:45   
2     3  687      FRANCISCO MAYA      M   PORTLAND    OR  37:44    37:48   
3     4  623         PAUL MORROW      M  BEAVERTON    OR  38:34    38:37   
4     5  569     DEREK G OSBORNE      M  HILLSBORO    OR  39:21    39:24   

0          Team  
0                
1  INTEL TEAM F  
2                
3                
4  INTEL TEAM F  


Unnamed: 0,Place,Bib,Name,Gender,City,State,Time,Gun Time,Team
0,1,814,JARED WILSON,M,TIGARD,OR,36:21,36:24,Not Applicable
1,2,573,NATHAN A SUSTERSIC,M,PORTLAND,OR,36:42,36:45,INTEL TEAM F
2,3,687,FRANCISCO MAYA,M,PORTLAND,OR,37:44,37:48,Not Applicable
3,4,623,PAUL MORROW,M,BEAVERTON,OR,38:34,38:37,Not Applicable
4,5,569,DEREK G OSBORNE,M,HILLSBORO,OR,39:21,39:24,INTEL TEAM F
...,...,...,...,...,...,...,...,...,...
572,573,273,RACHEL L VANEY,F,OTHER,OR,1:38:17,1:38:34,Not Applicable
573,574,467,ROHIT B DSOUZA,M,PORTLAND,OR,1:38:31,1:40:32,INTEL TEAM I
574,575,471,CENITA D'SOUZA,F,PORTLAND,OR,1:38:32,1:40:34,Not Applicable
575,576,338,PRANAVI APPANA,F,HILLSBORO,OR,1:40:47,1:42:01,Not Applicable


In [22]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def scrape_amazon_selenium(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Runs Chrome in headless mode.
    options.add_argument('--disable-gpu')  # Disables GPU hardware acceleration.
    driver = webdriver.Chrome(options=options)  # Use options if headless

    driver.get(url)
    product_details = []  # List to hold all product details
    
    try:
        # Wait for the product listings to load
        products = WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "s-main-slot .s-result-item"))
        )

        for product in products:
            try:
                # Extract the title and price for each product
                title_element = product.find_element(By.CLASS_NAME, "a-size-base-plus.a-color-base.a-text-normal")
                price_element = product.find_element(By.CLASS_NAME, "a-price-whole")
                #rating_element = product.find_element(By.CLASS_NAME, "a-icon-alt")

                title = title_element.text.strip()
                price = price_element.text.strip() if price_element else "Price not available"
                #rating = rating_element.text.strip() if rating_element else "No rating"

                product_details.append({
                    "title": title,
                    "price": price,
                    #"rating": rating
                })
            
            except Exception as e:
                # Skip products that may not have a title or price element
                print(f"Error extracting details for a product: {e}")
                continue

        # Print the details of each product
        df = pd.DataFrame(product_details)
        df.to_csv("amazon_product_details.csv", index=False)
        print("Data saved to amazon_product_details.csv")
    
        
    except TimeoutException:
        print("Timeout waiting for product listings to load.")
    except Exception as e:
        print(f"Error occurred: {e}")

    finally:
        driver.quit()

if __name__ == "__main__":
    url = "https://www.amazon.com/s?k=clothes&i=black-friday&crid=2V39PICL4HNY9&sprefix=%2Cblack-friday%2C3566&ref=nb_sb_noss"
    scrape_amazon_selenium(url)

Error extracting details for a product: Message: no such element: Unable to locate element: {"method":"css selector","selector":".a-size-base-plus.a-color-base.a-text-normal"}
  (Session info: chrome=131.0.6778.86); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6AFBC6CB5+28821]
	(No symbol) [0x00007FF6AFB33840]
	(No symbol) [0x00007FF6AF9D578A]
	(No symbol) [0x00007FF6AFA291BE]
	(No symbol) [0x00007FF6AFA294AC]
	(No symbol) [0x00007FF6AFA1C52C]
	(No symbol) [0x00007FF6AFA4F33F]
	(No symbol) [0x00007FF6AFA1C3F6]
	(No symbol) [0x00007FF6AFA4F510]
	(No symbol) [0x00007FF6AFA6F412]
	(No symbol) [0x00007FF6AFA4F0A3]
	(No symbol) [0x00007FF6AFA1A778]
	(No symbol) [0x00007FF6AFA1B8E1]
	GetHandleVerifier [0x00007FF6AFEFFCAD+3408013]
	GetHandleVerifier [0x00007FF6AFF1741F+3504127]
	GetHandleVerifier [0x00007FF6AFF0B5FD+3455453]
	GetHandleVerifier [0x00007FF

In [23]:
df=pd.read_csv('amazon_product_details.csv')
df.head()

Unnamed: 0,title,price
0,CRZ YOGA Butterluxe High Waisted Lounge Leggin...,27
1,Trendy Queen Womens Oversized Hoodies Fleece S...,29
2,AUTOMET Womens Fall Outfits Fashion Clothes Sh...,23
3,Trendy Queen Womens Long Sleeve Shirts Crop To...,9
4,4 Pack Leggings for Women Butt Lift High Waist...,22


In [24]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException

def scrape_amazon_selenium(url):
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Runs Chrome in headless mode.
    options.add_argument('--disable-gpu')  # Disables GPU hardware acceleration.
    driver = webdriver.Chrome(options=options)

    driver.get(url)
    product_details = []

    try:
        # Wait for the product listings to load
        WebDriverWait(driver, 20).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".s-main-slot .s-result-item"))
        )

        products = driver.find_elements(By.CSS_SELECTOR, ".s-main-slot .s-result-item")
        print(f"Found {len(products)} products.")

        for product in products:
            try:
                # Extract title and price
                title_element = product.find_element(By.CLASS_NAME, "a-size-base-plus.a-color-base.a-text-normal")
                price_element = product.find_element(By.CLASS_NAME, "a-price-whole")

                title = title_element.text.strip()
                price = price_element.text.strip() if price_element else "Price not available"

                # Extract rating using XPath for a-icon-alt
                try:
                    # XPath to find the rating element
                    rating_element = product.find_element(By.XPATH, ".//i[contains(@class, 'a-icon-alt')]")
                    rating = rating_element.text.strip()
                except NoSuchElementException:
                    rating = "No rating"

                product_details.append({
                    "title": title,
                    "price": price,
                    "rating": rating
                })

            except Exception as e:
                print(f"Error extracting details for a product: {e}")
                continue

        # Save to CSV if data is collected
        if product_details:
            df = pd.DataFrame(product_details)
            df.to_csv("amazon_product_details_with_ratings.csv", index=False)
            print("Data saved to amazon_product_details_with_ratings.csv")
        else:
            print("No product details extracted.")

    except TimeoutException:
        print("Timeout waiting for product listings to load.")
    except Exception as e:
        print(f"Error occurred: {e}")

    finally:
        driver.quit()

if __name__ == "__main__":
    url = "https://www.amazon.com/s?k=clothes&i=black-friday&crid=2V39PICL4HNY9&sprefix=%2Cblack-friday%2C3566&ref=nb_sb_noss"
    scrape_amazon_selenium(url)

Found 51 products.
Error extracting details for a product: Message: no such element: Unable to locate element: {"method":"css selector","selector":".a-size-base-plus.a-color-base.a-text-normal"}
  (Session info: chrome=131.0.6778.86); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF6AFBC6CB5+28821]
	(No symbol) [0x00007FF6AFB33840]
	(No symbol) [0x00007FF6AF9D578A]
	(No symbol) [0x00007FF6AFA291BE]
	(No symbol) [0x00007FF6AFA294AC]
	(No symbol) [0x00007FF6AFA1C52C]
	(No symbol) [0x00007FF6AFA4F33F]
	(No symbol) [0x00007FF6AFA1C3F6]
	(No symbol) [0x00007FF6AFA4F510]
	(No symbol) [0x00007FF6AFA6F412]
	(No symbol) [0x00007FF6AFA4F0A3]
	(No symbol) [0x00007FF6AFA1A778]
	(No symbol) [0x00007FF6AFA1B8E1]
	GetHandleVerifier [0x00007FF6AFEFFCAD+3408013]
	GetHandleVerifier [0x00007FF6AFF1741F+3504127]
	GetHandleVerifier [0x00007FF6AFF0B5FD+3455453]
	GetHandle

In [25]:
df=pd.read_csv('amazon_product_details_with_ratings.csv')
df.head()

Unnamed: 0,title,price,rating
0,CRZ YOGA Butterluxe High Waisted Lounge Leggin...,27,No rating
1,Trendy Queen Womens Oversized Hoodies Fleece S...,29,No rating
2,AUTOMET Womens Fall Outfits Fashion Clothes Sh...,23,No rating
3,Trendy Queen Womens Long Sleeve Shirts Crop To...,9,No rating
4,4 Pack Leggings for Women Butt Lift High Waist...,22,No rating
