### WEB SCRAPING

## Recommendations to run this notebook

You might need to install some of the libraries used in this notebook. To do so, create a virtual environment and run:


```python
!pip install -r requirements.txt

```

In [61]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Start a Chrome browser session with default settings.
driver = webdriver.Chrome()

# Alternative that macOS users might need (requires the Service and
# ChromeDriverManager imports shown in the next cell):
#s=Service(ChromeDriverManager().install())
#driver = webdriver.Chrome(service=s)

In [62]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Close any browser an earlier cell left open before rebinding `driver`;
# otherwise that Chrome window/process is orphaned for the rest of the session.
if "driver" in globals():
    driver.quit()

# This will automatically download and use the correct ChromeDriver version
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

In [63]:
# Reuse the browser that is already running instead of launching yet another
# Chrome instance (each extra instance stays open until explicitly quit).
driver.get('http://twitter.com/login')

In [None]:
class TwitterScraper:
    """Drive a Chrome browser through the Twitter login form.

    The instance owns one Selenium Chrome driver, created in ``__init__``.
    ``login`` always quits that driver when it finishes (success or failure),
    so an instance can be used for a single login attempt only.
    """

    # Login page opened by ``login``. The notebook never defines a
    # TWITTER_LOGIN_URL constant, so the address lives on the class.
    LOGIN_URL = "http://twitter.com/login"
    # Seconds to wait for each form element before giving up.
    WAIT_TIMEOUT = 10

    def __init__(self, email, password):
        """Store the credentials and start the browser.

        Args:
            email: Value typed into the username field.
            password: Value typed into the password field.

        Raises:
            WebDriverException: If the Chrome driver cannot be started.
        """
        self.email = email
        self.password = password
        self.driver = self._initialize_driver()

    def _initialize_driver(self):
        """Create and return a configured Chrome driver.

        Raises:
            WebDriverException: Re-raised after logging when Chrome fails to start.
        """
        print("Setting up WebDriver...")
        options = webdriver.ChromeOptions()
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-gpu")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-notifications")
        options.add_argument("--start-maximized")

        try:
            driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
            print("WebDriver setup complete.")
            return driver
        except WebDriverException as e:
            print(f"Error setting up WebDriver: {e}")
            raise

    def login(self):
        """Open the login page and submit the email and then the password.

        Returns:
            bool: True when both credentials were typed and submitted,
            False when a field was missing or any other error occurred.
            The browser is closed before returning in every case.
        """
        try:
            self.driver.get(self.LOGIN_URL)
            time.sleep(5)

            print("Entering email...")
            if not self._enter_email():
                return False

            print("Clicking 'Next'...")
            # Best effort only: the email step already presses RETURN, which
            # presumably advances the form, so a missing button is not fatal.
            self._click_next_button()

            print("Entering password...")
            if not self._enter_password():
                return False

            print("Login successful!")
            return True
        except Exception as e:
            print(f"Error during login: {e}")
            return False
        finally:
            # Leave the final page visible briefly, then release the browser.
            time.sleep(5)
            self.driver.quit()

    def _enter_email(self):
        """Type the email into the username field and press RETURN.

        Returns:
            bool: False if the field did not appear within ``WAIT_TIMEOUT``
            seconds, True otherwise.
        """
        try:
            email_input = WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='username']"))
            )
        except TimeoutException:
            # WebDriverWait signals a missing element with TimeoutException,
            # not NoSuchElementException.
            print("Email input field not found.")
            return False

        email_input.send_keys(self.email)
        email_input.send_keys(Keys.RETURN)
        return True

    def _click_next_button(self, max_attempts=3):
        """Try to click the 'Next' button, retrying on failure.

        Args:
            max_attempts: How many times to look for and click the button.

        Returns:
            bool: True if a click succeeded, False once all attempts failed.
        """
        for attempt in range(1, max_attempts + 1):
            try:
                next_button = WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                    EC.element_to_be_clickable((By.XPATH, "//div[@role='button' and text()='Next']"))
                )
                self.driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                next_button.click()
                return True
            except WebDriverException:
                print(f"Retrying to click 'Next'... Attempt {attempt}")
                time.sleep(2)
        return False

    def _enter_password(self):
        """Type the password into the password field and press RETURN.

        Returns:
            bool: True when the password was submitted, False when the field
            was not found or typing failed (the reason is printed).
        """
        try:
            password_input = WebDriverWait(self.driver, self.WAIT_TIMEOUT).until(
                EC.presence_of_element_located((By.XPATH, "//input[@autocomplete='current-password']"))
            )
            password_input.send_keys(self.password)
            password_input.send_keys(Keys.RETURN)
            return True
        except TimeoutException:
            print("Password input field not found.")
        except Exception as e:
            print(f"Error entering password: {e}")
        return False

if __name__ == "__main__":
    import os
    from getpass import getpass

    # Credentials must not be stored in the notebook: take them from the
    # environment when set, otherwise ask interactively (password is not echoed).
    email = os.environ.get("TWITTER_EMAIL") or input("Twitter email: ")
    password = os.environ.get("TWITTER_PASSWORD") or getpass("Twitter password: ")

    scraper = TwitterScraper(email=email, password=password)
    scraper.login()

Setting up WebDriver...
WebDriver setup complete.
Entering email...
Clicking 'Next'...
Retrying to click 'Next'... Attempt 1
Retrying to click 'Next'... Attempt 2
Retrying to click 'Next'... Attempt 3
Entering password...
Login successful!


In [65]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Selenium exceptions handled explicitly below
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Recording the start time
start_time = time.time()

# Initializing lists to store data
quotes_data = []

# Initializing Chrome WebDriver
driver = webdriver.Chrome()

try:
    # Opening the website
    driver.get("http://quotes.toscrape.com/js/")

    # Creating a WebDriverWait object with a timeout of 10 seconds
    wait = WebDriverWait(driver, 10)

    # Iterating through all pages
    while True:
        # The /js/ variant of the site is requested, so wait for the quotes
        # to be present; a page that never shows any is treated as empty.
        try:
            quotes = wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "quote"))
            )
        except TimeoutException:
            quotes = []

        # Extracting data for each quote on the current page
        for quote in quotes:
            quote_text = quote.find_element(By.CLASS_NAME, "text").text
            author = quote.find_element(By.CLASS_NAME, "author").text
            tags = "|".join(tag.text for tag in quote.find_elements(By.CLASS_NAME, "tag"))

            # Appending data to the list
            quotes_data.append([author, quote_text, tags])

        # Checking if there is a next page
        try:
            next_page = driver.find_element(By.XPATH, "//li[@class='next']/a")
        except NoSuchElementException:
            print("Next page button not found. Exiting.")
            break

        # get_attribute may return None when the link has no class attribute
        if 'disabled' in (next_page.get_attribute("class") or ""):
            print("Reached the last page.")
            break

        # Scrolling to the next page element
        print("Scrolling to the next page.")
        ActionChains(driver).move_to_element(next_page).perform()

        # Clicking the "Next" link
        print("Clicking the next page.")
        next_page.click()
finally:
    # Closing the WebDriver, even when scraping fails part-way through
    driver.quit()

# Recording the end time
end_time = time.time()

# Calculating the total run time
total_run_time = end_time - start_time
print(f"Total run time: {total_run_time} seconds")

# Saving the data to a CSV file
with open("quotes.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)

    # Writing header row
    csv_writer.writerow(["Author", "Quote", "Tags"])

    # Writing quote data
    csv_writer.writerows(quotes_data)

print("Quotes have been scraped and saved to quotes.csv.")


Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Next page button not found. Exiting.
Total run time: 10.602749109268188 seconds
Quotes have been scraped and saved to quotes.csv.


In [66]:
import pandas as pd

# Read the scraped quotes back from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="quotes.csv")

# Print the first five rows as a quick sanity check
print(df.head(n=5))

            Author                                              Quote  \
0  Albert Einstein  “The world as we have created it is a process ...   
1     J.K. Rowling  “It is our choices, Harry, that show what we t...   
2  Albert Einstein  “There are only two ways to live your life. On...   
3      Jane Austen  “The person, be it gentleman or lady, who has ...   
4   Marilyn Monroe  “Imperfection is beauty, madness is genius and...   

                                       Tags  
0       change|deep-thoughts|thinking|world  
1                         abilities|choices  
2  inspirational|life|live|miracle|miracles  
3             aliteracy|books|classic|humor  
4                 be-yourself|inspirational  


In [67]:
import random

def introduce_errors(name, num_errors=1, rng=None):
    """Return a copy of ``name`` with random typos applied.

    Each of the ``num_errors`` rounds picks one edit at random: insert a
    lowercase letter, remove a character, or swap two adjacent characters.
    A round is a no-op when the chosen edit is not applicable (remove/swap
    need at least two characters), and later rounds may undo earlier ones,
    so the result is not guaranteed to differ from the input.

    Args:
        name: The string to corrupt.
        num_errors: Number of edit rounds to attempt.
        rng: Optional ``random.Random`` instance for reproducible output;
            the module-level ``random`` functions are used when omitted.

    Returns:
        str: The (possibly) altered string.
    """
    rng = random if rng is None else rng
    chars = list(name)
    for _ in range(num_errors):
        error_type = rng.choice(["add", "remove", "swap"])
        if error_type == "add":
            # randint is inclusive, so len(chars) allows appending at the end
            # and makes insertion work on an empty string as well.
            index = rng.randint(0, len(chars))
            chars.insert(index, rng.choice("abcdefghijklmnopqrstuvwxyz"))
        elif error_type == "remove" and len(chars) > 1:
            index = rng.randint(0, len(chars) - 1)
            chars.pop(index)
        elif error_type == "swap" and len(chars) > 1:
            index = rng.randint(0, len(chars) - 2)
            chars[index], chars[index + 1] = chars[index + 1], chars[index]
    return ''.join(chars)

# Example usage:
# Seed the global generator so the printed example is the same on every run.
random.seed(42)
authors = ["Albert Einstein", "J.K. Rowling", "Mark Twain", "C.S. Lewis"]
authors_with_errors = [introduce_errors(name, num_errors=2) for name in authors]
print(authors_with_errors)

['Albert Einstein', 'j.K. Rowling', 'Mark Taiwn', 'C.bS. Lweis']


In [68]:
def manual_levenshtein_distance(name1, name2):
    """Return the Levenshtein (edit) distance between ``name1`` and ``name2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn ``name1`` into ``name2``.
    Only the previous and current rows of the dynamic-programming table are
    kept while it is filled.
    """
    # Row for the empty prefix of name1: j insertions build name2[:j]
    previous_row = list(range(len(name2) + 1))

    for row_idx, char1 in enumerate(name1, start=1):
        # First column: row_idx deletions reduce name1[:row_idx] to ""
        current_row = [row_idx]
        for col_idx, char2 in enumerate(name2, start=1):
            deletion = previous_row[col_idx] + 1
            insertion = current_row[col_idx - 1] + 1
            substitution = previous_row[col_idx - 1] + (0 if char1 == char2 else 1)
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    return previous_row[-1]

# Demo: compare a correctly spelled name with a misspelled variant
correct_name, misspelled_name = "Albert Einstein", "Alberton Einstain"
distance = manual_levenshtein_distance(correct_name, misspelled_name)
print(f"Distance between '{correct_name}' and '{misspelled_name}': {distance}")

Distance between 'Albert Einstein' and 'Alberton Einstain': 3


In [69]:
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Manual Levenshtein distance function
def levenshtein_distance(name1, name2):
    """Return the Levenshtein (edit) distance between two strings.

    Fills a full (len(name1) + 1) x (len(name2) + 1) table in which cell
    [r][c] is the distance between name1[:r] and name2[:c].
    """
    # NOTE(review): same algorithm as manual_levenshtein_distance defined in
    # an earlier cell; kept separate so this cell runs on its own.
    rows, cols = len(name1) + 1, len(name2) + 1

    # Borders hold the cost of converting to/from the empty string
    table = [
        [r if c == 0 else (c if r == 0 else 0) for c in range(cols)]
        for r in range(rows)
    ]

    for r in range(1, rows):
        for c in range(1, cols):
            if name1[r - 1] == name2[c - 1]:
                substitution = table[r - 1][c - 1]
            else:
                substitution = table[r - 1][c - 1] + 1
            deletion = table[r - 1][c] + 1
            insertion = table[r][c - 1] + 1
            table[r][c] = min(deletion, insertion, substitution)

    return table[-1][-1]

# Function to check if a name is similar within a threshold
def is_similar_name(correct_name, author_name, max_distance=3):
    """Return True when the two names are at most ``max_distance`` edits apart."""
    edit_distance = levenshtein_distance(correct_name, author_name)
    return edit_distance <= max_distance

# Selenium exceptions handled explicitly below
from selenium.common.exceptions import NoSuchElementException, TimeoutException

# Recording the start time
start_time = time.time()

# Initializing lists to store data
quotes_data = []

# Correct name and threshold for similarity
correct_name = "Albert Einstein"
max_distance = 3

# Initializing Chrome WebDriver
driver = webdriver.Chrome()

try:
    # Opening the website
    driver.get("http://quotes.toscrape.com/js/")

    # Creating a WebDriverWait object with a timeout of 10 seconds
    wait = WebDriverWait(driver, 10)

    # Iterating through all pages
    while True:
        # The /js/ variant of the site is requested, so wait for the quotes
        # to be present; a page that never shows any is treated as empty.
        try:
            quotes = wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "quote"))
            )
        except TimeoutException:
            quotes = []

        # Extracting data for each quote on the current page
        for quote in quotes:
            try:
                quote_text = quote.find_element(By.CLASS_NAME, "text").text
                author = quote.find_element(By.CLASS_NAME, "author").text
                tags = "|".join(tag.text for tag in quote.find_elements(By.CLASS_NAME, "tag"))

                # Keep the quote only if the author's name is close to the target name
                if is_similar_name(correct_name, author, max_distance):
                    quotes_data.append([author, quote_text, tags])
            except Exception as e:
                print(f"Error extracting quote: {e}")

        # Checking if there is a next page
        try:
            next_page = driver.find_element(By.XPATH, "//li[@class='next']/a")
        except NoSuchElementException:
            print("Next page button not found. Exiting.")
            break

        # Scrolling to the next page element
        print("Scrolling to the next page.")
        ActionChains(driver).move_to_element(next_page).perform()

        # Clicking the "Next" link
        print("Clicking the next page.")
        next_page.click()
finally:
    # Closing the WebDriver, even when scraping fails part-way through
    driver.quit()

# Recording the end time
end_time = time.time()

# Calculating the total run time
total_run_time = end_time - start_time
print(f"Total run time: {total_run_time} seconds")

# Saving the data to a CSV file
with open("quotes_filtered.csv", "w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)

    # Writing header row
    csv_writer.writerow(["Author", "Quote", "Tags"])

    # Writing quote data
    csv_writer.writerows(quotes_data)

print("Quotes have been scraped and saved to quotes_filtered.csv.")

Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Scrolling to the next page.
Clicking the next page.
Next page button not found. Exiting.
Total run time: 10.36624002456665 seconds
Quotes have been scraped and saved to quotes_filtered.csv.


In [70]:
# Read the filtered quotes back from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="quotes_filtered.csv")

# Print the first five rows as a quick sanity check
print(df.head(n=5))

            Author                                              Quote  \
0  Albert Einstein  “The world as we have created it is a process ...   
1  Albert Einstein  “There are only two ways to live your life. On...   
2  Albert Einstein  “Try not to become a man of success. Rather be...   
3  Albert Einstein  “If you can't explain it to a six year old, yo...   
4  Albert Einstein  “If you want your children to be intelligent, ...   

                                       Tags  
0       change|deep-thoughts|thinking|world  
1  inspirational|life|live|miracle|miracles  
2                   adulthood|success|value  
3                     simplicity|understand  
4                      children|fairy-tales  


In [72]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Standard-library helpers for building a correctly encoded query string
from urllib.parse import quote, urlencode

# List of name variations to search
name_variations = ["D. Trump", "Donald T.", "Donald Trump"]

# Initialize a dictionary to store results
news_data = {name: [] for name in name_variations}

# Initialize Selenium WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # Iterate through each name variation
    for name in name_variations:
        # Construct the search URL with the name variation and last hour filter.
        # urlencode escapes every reserved character, not only spaces;
        # quote_via=quote keeps spaces as %20 rather than '+'.
        search_query = f"{name} when:1h"
        query_string = urlencode(
            {"q": search_query, "hl": "en-US", "gl": "US", "ceid": "US:en"},
            quote_via=quote,
        )
        url = f"https://news.google.com/search?{query_string}"
        driver.get(url)
        time.sleep(5)  # Wait for the page to load

        # Locate all articles
        articles = driver.find_elements(By.XPATH, "//article")
        print(f"Found {len(articles)} articles for '{name}'")

        # Extract the first 5 links
        for article in articles[:5]:
            try:
                # Extract the link
                link_element = article.find_element(By.XPATH, ".//a[@href]")
                link = link_element.get_attribute("href")

                # Append the link to the corresponding name variation
                news_data[name].append(link)
            except Exception as e:
                print(f"Error processing article for '{name}': {e}")

finally:
    # Close the WebDriver
    driver.quit()

# Save results to a DataFrame
results = [
    {"Name Variation": name, "Link": link}
    for name, links in news_data.items()
    for link in links
]

# Explicit columns keep the CSV header even when no links were found, so the
# file can still be read back and indexed by "Link".
df = pd.DataFrame(results, columns=["Name Variation", "Link"])

# Save to CSV
df.to_csv("filtered_news_last_hour.csv", index=False)
print("\nFiltered news saved to 'filtered_news_last_hour.csv'.")

# Display the DataFrame
print("\nExtracted Links:")
print(df)

Found 28 articles for 'D. Trump'
Found 35 articles for 'Donald T.'
Found 96 articles for 'Donald Trump'

Filtered news saved to 'filtered_news_last_hour.csv'.

Extracted Links:
   Name Variation                                               Link
0        D. Trump  https://news.google.com/read/CBMihgFBVV95cUxQM...
1        D. Trump  https://news.google.com/read/CBMiuAFBVV95cUxOb...
2        D. Trump  https://news.google.com/read/CBMiuwFBVV95cUxNR...
3        D. Trump  https://news.google.com/read/CBMikwJBVV95cUxQW...
4        D. Trump  https://news.google.com/read/CBMikwFBVV95cUxQV...
5       Donald T.  https://news.google.com/read/CBMiX0FVX3lxTE9MM...
6       Donald T.  https://news.google.com/read/CBMirAFBVV95cUxOM...
7       Donald T.  https://news.google.com/read/CBMikwJBVV95cUxQW...
8       Donald T.  https://news.google.com/read/CBMipwFBVV95cUxQU...
9       Donald T.  https://news.google.com/read/CBMiZ0FVX3lxTE1TZ...
10   Donald Trump  https://news.google.com/read/CBMirAFBVV95cUxO

In [73]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

# Names used below that this cell's own import list does not provide
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Load the CSV file with Google News links
df = pd.read_csv("filtered_news_last_hour.csv")
article_links = df["Link"].tolist()

# Initialize Selenium WebDriver
options = Options()
options.add_argument("--headless")  # Run browser in headless mode
options.add_argument("--disable-gpu")  # Disable GPU rendering for better performance
options.add_argument("--no-sandbox")
prefs = {"profile.managed_default_content_settings.images": 2}  # Disable image loading to speed up scraping
options.add_experimental_option("prefs", prefs)
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Initialize a list to store scraped data
scraped_data = []

try:
    # Allow up to 30 seconds per page load. This must be configured before
    # navigating so that it also applies to the very first driver.get call.
    driver.set_page_load_timeout(30)

    for idx, link in enumerate(article_links, start=1):
        try:
            # Open the article link
            driver.get(link)

            # Wait for the article's main content to load
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.XPATH, "//h1"))
                )
            except TimeoutException:
                print(f"No <h1> tag found for link {idx}/{len(article_links)}.")

            # Scrape the article title
            try:
                # Primary: Look for <h1> tag
                title = driver.find_element(By.XPATH, "//h1").text
            except NoSuchElementException:
                title = ""

            if not title:
                # Fallback: the document title. WebElement.text only returns
                # rendered text, so it is empty for <title>; driver.title
                # reads the page title directly.
                title = driver.title or "N/A"

            # Append the scraped data
            scraped_data.append({"Link": link, "Title": title})
            print(f"{idx}/{len(article_links)}: Scraped - Title: {title}")

        except Exception as e:
            print(f"Error processing link {idx}/{len(article_links)}: {link}, Error: {e}")

finally:
    # Close the WebDriver
    driver.quit()
    print("WebDriver closed.")

# Save the scraped data to a new CSV file
output_file = "scraped_articles_titles_only.csv"
# Explicit columns keep the header even when nothing could be scraped
scraped_df = pd.DataFrame(scraped_data, columns=["Link", "Title"])
scraped_df.to_csv(output_file, index=False)
print(f"Scraped data saved to '{output_file}'.")

# Display the DataFrame
print("\nScraped Data Preview:")
print(scraped_df.head())

1/15: Scraped - Title: Trump’s Executive Orders: Reversing Biden’s Policies and Attacking the ‘Deep State’
2/15: Scraped - Title: Trump said he'd quickly end Russia's war on Ukraine. But it's proving tough.
3/15: Scraped - Title: Trump to visit disaster zones in North Carolina and California on first trip of second term
4/15: Scraped - Title: Donald Trump news LIVE: Post US exit, ordered by Trump, WHO to cut costs, ‘reset’ priorities, report says
5/15: Scraped - Title: Trump vows ‘big discussion’ on FEMA ahead of disaster tour
6/15: Scraped - Title: Project 1897: The imperial presidency
7/15: Scraped - Title: How will Trump try to fix inflation? Experts say he'll focus on these areas.
8/15: Scraped - Title: Donald Trump news LIVE: Post US exit, ordered by Trump, WHO to cut costs, ‘reset’ priorities, report says
9/15: Scraped - Title: Trump won’t deliver on maximum tariff pledges, says his former commerce secretary—merely making the threat is enough
10/15: Scraped - Title: This site can