In [None]:
# Web Harvesting Unit 2 - parsing XML/JSON, regular expressions, and web scraping

# Import necessary libraries
import requests
import json
import xml.etree.ElementTree as ET
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Parse a small in-memory XML document and print the fields of every <item>.

xml_data = """<data>
    <item>
        <title>Web Scraping Basics</title>
        <author>John Doe</author>
        <date>2025-02-22</date>
    </item>
    <item>
        <title>Advanced Web Scraping</title>
        <author>Jane Smith</author>
        <date>2025-02-23</date>
    </item>
</data>"""
root = ET.fromstring(xml_data)

# Child tags are read in this fixed order for each direct <item> child of <data>.
item_fields = ('title', 'author', 'date')
for item in root.findall('item'):
    title, author, date = [item.find(tag).text for tag in item_fields]
    print(f"Title: {title}, Author: {author}, Date: {date}")

In [None]:
# Decode a JSON string and print one summary line per article.
json_data = '{"articles": [{"title": "Data Extraction", "author": "Alice", "date": "2025-02-21"}, {"title": "Cleaning Data", "author": "Bob", "date": "2025-02-22"}]}'
data = json.loads(json_data)

# Build every output line first, then emit them in a single print call.
article_lines = [
    f"Title: {article['title']}, Author: {article['author']}, Date: {article['date']}"
    for article in data['articles']
]
print("\n".join(article_lines))



# **Regular Expressions for Data Extraction**

In [None]:
# Locate the first 4-digit number in a sentence
import re

text = "The year is 2024."
pattern = r"\d{4}"  # four consecutive digits

# search() scans the whole string and yields None when nothing matches.
match = re.compile(pattern).search(text)
if match is not None:
    print("Found:", match.group())

In [None]:
# Collect every phone-number-shaped substring from a sentence
text = "My numbers are 123-456-7890 and 987-654-3210."
pattern = r"\d{3}-\d{3}-\d{4}"  # 3 digits, 3 digits, 4 digits, separated by dashes

# The pattern has no capture groups, so each match's full text is the result.
matches = [found.group() for found in re.finditer(pattern, text)]
print(matches)  # Output: ['123-456-7890', '987-654-3210']


In [None]:
# Substitute every language name in a sentence with a single generic word
text = "I love Python and Java!"
pattern = r"Python|Java"  # alternation: either word
replacement = "Programming"

language_names = re.compile(pattern)
new_text = language_names.sub(replacement, text)
print(new_text)

In [None]:

# Extracting emails and URLs from free text
text = "Contact us at support@example.com or visit https://example.com"

# Inside a character class '|' is a literal pipe, not alternation, so the
# top-level-domain class must be [A-Za-z]; writing [A-Z|a-z] would also accept
# addresses such as "user@example.c|m".
email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# Scheme (http or https) followed by everything up to the next whitespace.
url_pattern = re.compile(r'https?://\S+')

emails = email_pattern.findall(text)
urls = url_pattern.findall(text)
print(f"Extracted Emails: {emails}")
print(f"Extracted URLs: {urls}")

In [None]:
# Build a term-document matrix (one row per document, one column per term)

corpus = [
    "Web scraping is useful for data gathering.",
    "Data cleaning is crucial after web scraping.",
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# Pull the vocabulary and the dense count array out before printing them.
feature_names = vectorizer.get_feature_names_out()
dense_counts = X.toarray()
print("Feature Names:", feature_names)
print("TDM Matrix:\n", dense_counts)

In [None]:
# Lower-case the text column and drop everything except a-z and spaces
data = {"Text": ["Web scraping is FUN!!", "Data cleaning is important!!!"]}
df = pd.DataFrame(data)

lowercased = df['Text'].str.lower()
df['Cleaned_Text'] = lowercased.str.replace(r'[^a-z ]', '', regex=True)
print(df)

Practical Implementation using a website

In [63]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [64]:
# URL of the website to scrape
url = "https://ticker.finology.in/"

# requests applies no timeout unless one is given, so an unresponsive server
# would block this cell indefinitely; 30 seconds bounds the wait.
r = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx response instead of parsing an error page downstream.
r.raise_for_status()

In [65]:

# Parse the response body (r.text) into a BeautifulSoup tree using the "lxml" parser
soup = BeautifulSoup(r.text, "lxml")

In [67]:

# Finding the table with stock data.
# A multi-word class_ string is compared against the element's full class
# attribute, so this lookup depends on the page keeping exactly this class list.
table = soup.find("table", class_="table table-sm table-hover screenertable")

# soup.find returns None when nothing matches; stop here with a clear message
# rather than failing later with an AttributeError on None.
if table is None:
    raise ValueError(
        "Stock table not found: no <table> with class "
        "'table table-sm table-hover screenertable' in the fetched page"
    )

In [68]:
# Extracting table headers: collect every <th> element found inside the table
headers = table.find_all("th")

In [69]:

# Column titles are the text content of each header cell, in document order
titles = [header_cell.text for header_cell in headers]

In [70]:

# Creating an empty DataFrame (no rows yet) whose columns are the extracted header titles
df = pd.DataFrame(columns=titles)

In [71]:

# Finding all table rows: every <tr> inside the table; the first one is
# treated as the header row by the cells that iterate over rows[1:]
rows = table.find_all("tr")

In [72]:
# Extracting data from each row.
# `rows` and `titles` come from the earlier cells; rows[0] is the header row.
records = []
for tr in rows[1:]:  # Skipping the header row
    row = [td.text for td in tr.find_all("td")]  # Raw text of each cell
    print(row)
    records.append(row)

# Build the DataFrame once from the collected rows. Appending with
# df.loc[len(df)] reallocates on every row and duplicates the data if the
# cell is executed twice; constructing it here is idempotent.
df = pd.DataFrame(records, columns=titles)

['\nRedington\n', '250.00', '263.80']
['\nDenta Water And Inf\n', '356.30', '377.30']
['\nBenares Hotels\n', '11975.20', '12499.95']
['\nLKP Finance\n', '323.95', '339.80']
['\nDeccan Cements\n', '861.00', '899.00']
['\nUPL\n', '646.15', '657.75']
['\nCamlin Fine Sciences\n', '155.05', '157.00']
['\nColab Platforms\n', '60.26', '60.26']
['\nNarayana Hrudayalay\n', '1406.00', '1429.40']
['\nOsiajee Texfab\n', '112.85', '113.50']
['\nFischer Medical\n', '801.00', '840.80']
['\nTechNVision Ventures\n', '4624.70', '5111.50']
['\nA-1\n', '440.00', '461.90']
['\nAK Spintex\n', '864.00', '874.00']
['\nShree Rama News\n', '30.98', '31.25']


In [73]:
# Collect the cell text of every data row, collapsing each whitespace run
# (spaces, tabs, newlines) to one space and trimming both ends.
whitespace_run = re.compile(r'\s+')
cleaned_data = [
    [whitespace_run.sub(' ', td.text).strip() for td in tr.find_all("td")]
    for tr in rows[1:]  # rows[0] is the header row
]

In [74]:
# Converting to DataFrame: one row per entry of cleaned_data, with the
# extracted header titles as column names (replaces the previous `df`)
df = pd.DataFrame(cleaned_data, columns=titles)

In [75]:
# Display the stock table built from cleaned_data
df

Unnamed: 0,Company,PriceRs.,Day HighRs.
0,Redington,250.0,263.8
1,Denta Water And Inf,356.3,377.3
2,Benares Hotels,11975.2,12499.95
3,LKP Finance,323.95,339.8
4,Deccan Cements,861.0,899.0
5,UPL,646.15,657.75
6,Camlin Fine Sciences,155.05,157.0
7,Colab Platforms,60.26,60.26
8,Narayana Hrudayalay,1406.0,1429.4
9,Osiajee Texfab,112.85,113.5


In [76]:
# Remove leading and trailing newline characters from the company names.
# The vectorised .str accessor replaces a per-element apply + regex and
# leaves missing values untouched instead of raising on them.
df["Company"] = df["Company"].str.strip("\n")

In [77]:
# Display the table again after the Company column was stripped of newlines
df

Unnamed: 0,Company,PriceRs.,Day HighRs.
0,Redington,250.0,263.8
1,Denta Water And Inf,356.3,377.3
2,Benares Hotels,11975.2,12499.95
3,LKP Finance,323.95,339.8
4,Deccan Cements,861.0,899.0
5,UPL,646.15,657.75
6,Camlin Fine Sciences,155.05,157.0
7,Colab Platforms,60.26,60.26
8,Narayana Hrudayalay,1406.0,1429.4
9,Osiajee Texfab,112.85,113.5


In [None]:
# Saving the scraped data to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file
df.to_csv("Stock_data.csv", index=False)

In [None]:
import requests
import pandas as pd
import re
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Fetch the quotes page and pair each quote with its author
url = "https://quotes.toscrape.com/"

# requests applies no timeout unless one is given; bound the wait and fail
# on a 4xx/5xx response rather than parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
quotes = soup.find_all('span', class_='text')
authors = soup.find_all('small', class_='author')

# The two lists are matched by position; zip stops at the shorter one.
data = [
    {"Quote": quote.text, "Author": author.text}
    for quote, author in zip(quotes, authors)
]

In [None]:
# Build a DataFrame from the list of {"Quote", "Author"} dicts (one row per
# dict) and display it as the cell's output
df = pd.DataFrame(data)
df

In [None]:
# Applying Regular Expressions on Extracted Data:
# lower-case each quote, then delete every character that is not an ASCII
# letter or a space. The vectorised .str accessor replaces a per-element
# apply(lambda) and passes missing values through instead of raising.
df['Processed_Quote'] = (
    df['Quote']
    .str.lower()
    .str.replace(r'[^a-zA-Z ]', '', regex=True)
)


In [None]:
# Display the Processed_Quote column
df['Processed_Quote']

In [None]:
# Term-document matrix for the processed quotes: one row per quote,
# one column per vocabulary term, cell values are term counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Processed_Quote'])

vocabulary = vectorizer.get_feature_names_out()
tdm = pd.DataFrame(X.toarray(), columns=vocabulary)
tdm