# 🕸️ Advanced Web Scraping Project – IBM Data Analyst Capstone

This project explores deeper web scraping skills:
- Scraping programming language popularity data
- Parsing structured HTML tables
- Cleaning, analyzing, and saving scraped data
- Visualizing top programming languages

---


# 🕸️ Web Scraping Project – IBM Data Analyst Capstone

This project demonstrates how to:
- Download webpages
- Scrape hyperlinks and images
- Extract tabular data

---

# **Case Study: Web Scraping**

# Objectives
After completing this hands-on lab work, we will be able to:

<ul>
<li>Extract information from a given web site.</li>
<li>Write the scraped data into a csv file.</li>
</ul>

## Extract information from the given web site
You will extract the data from the website below: <br>


In [None]:
# Page that hosts the table of programming languages to be scraped.
url = (
    "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud"
    "/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"
)

The data you need to scrape is the **name of the programming language** and **average annual salary**.<br> It is a good idea to open the URL in your web browser and study the contents of the web page before you start to scrape.


Import the required libraries


In [None]:
# Third-party libraries used throughout the lab.
from bs4 import BeautifulSoup  # parses the downloaded HTML (web scraping)
import requests  # downloads the webpage
import pandas as pd  # holds the scraped table and writes the CSV

# NOTE: the HTML preview (`soup.prettify()`) is deliberately not printed in
# this cell. `soup` is only created after the page has been downloaded and
# parsed, so referencing it here raises NameError on a top-to-bottom run.

Download the webpage at the url


In [None]:
# IPython shell escape: runs `pip install openpyxl` from inside the notebook.
# NOTE(review): no cell visible in this notebook imports or uses openpyxl
# (only CSV, PNG and PDF files are written) — confirm it is still needed.
!pip install openpyxl

In [None]:
# Download the page. The Response object keeps its own name so the status
# code can still be inspected after the body text has been extracted
# (previously the Response was overwritten by its text and `response` was
# never defined, so the status print raised NameError).
response = requests.get(url)
print(response.status_code)  # Confirm successful download (200 OK)

# get the contents of the webpage in text format and store in a variable called data
data = response.text

Create a soup object


In [None]:
# Parse the downloaded HTML text with the "html.parser" backend.
soup = BeautifulSoup(markup=data, features="html.parser")

# Sanity check: show only the first 500 characters of the pretty-printed HTML.
html_preview = soup.prettify()[:500]
print(html_preview)

Scrape the `Language name` and `annual average salary`.


In [None]:
# Print the href attribute of every anchor (<a>) tag found in the document.
anchors = soup.find_all('a')
for anchor in anchors:
    href = anchor.get('href')
    print(href)

In [None]:
# Locate the first <table> element in the parsed page.
table = soup.find(name='table')

In [None]:
# Extract column headers from the first row; header cells may be <th> or <td>.
header_row = table.find("tr")
headers = []
if header_row is not None:
    headers = [cell.text.strip() for cell in header_row.find_all(["th", "td"])]

# Extract the data rows: every <tr> after the header row becomes one list of
# stripped cell texts.
rows = [
    [col.text.strip() for col in row.find_all("td")]
    for row in table.find_all("tr")[1:]
]

# Fall back to generic column names when no header cells were found.
# The `if rows` guard keeps an empty table from raising IndexError on rows[0].
if not headers:
    headers = [f"Column_{i}" for i in range(len(rows[0]) if rows else 0)]

# Convert to a pandas DataFrame
df_languages = pd.DataFrame(rows, columns=headers)

# `tables` was printed without ever being defined (NameError); collect every
# <table> on the page here so the count can be reported.
tables = soup.find_all("table")
print(f"Number of tables found: {len(tables)}")

Save the scraped data into a file named *popular-languages.csv*


In [None]:
# Write the DataFrame to disk without the pandas index column.
csv_filename = "popular-languages.csv"
df_languages.to_csv(csv_filename, index=False)

# Report what was scraped and where it was stored, one message per line.
for message in (
    f"Popular Programming Languages {df_languages}.",
    f"Scraped data successfully saved in {csv_filename}.",
    'Data exported successfully as CSV.',
):
    print(message)

<strong>FULL CODE SCRIPT BELOW</strong>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL containing the data to scrape
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"


# Get the webpage content
response = requests.get(url)
data = response.text

# Create a BeautifulSoup object to parse the HTML
soup = BeautifulSoup(data, "html.parser")

# Collect every <table> so the count can be reported at the end (the name
# `tables` was previously used without being defined, raising NameError).
tables = soup.find_all("table")

# The data of interest lives in the first table on the page.
table = soup.find("table")

# Extract column headers from the first row; header cells may be <th> or <td>.
header_row = table.find("tr")
headers = []
if header_row is not None:
    headers = [cell.text.strip() for cell in header_row.find_all(["th", "td"])]

# Extract the data rows: every <tr> after the header row becomes one list of
# stripped cell texts.
rows = [
    [col.text.strip() for col in row.find_all("td")]
    for row in table.find_all("tr")[1:]
]

# Fall back to generic column names when no header cells were found.
# The `if rows` guard keeps an empty table from raising IndexError on rows[0].
if not headers:
    headers = [f"Column_{i}" for i in range(len(rows[0]) if rows else 0)]

# Convert to a pandas DataFrame
df_languages = pd.DataFrame(rows, columns=headers)

# Save the scraped data into a CSV file (without the pandas index column)
csv_filename = "popular-languages.csv"
df_languages.to_csv(csv_filename, index=False)

# Display the extracted data
print(f"Popular Programming Languages {df_languages}.")

print(f"Scraped data successfully saved in {csv_filename}.")


print(soup.prettify()[:500])  # Preview first 500 characters of parsed HTML

print(f"Number of tables found: {len(tables)}")

print('Data exported successfully as CSV.')

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# STEP 1: URL to scrape
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/Programming_Languages.html"

# STEP 2: Scrape and parse HTML
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
# Every <table> on the page, kept so the count can be reported at the end
# (`tables` was previously used without being defined, raising NameError).
tables = soup.find_all("table")
table = soup.find("table")

# STEP 3: Extract headers and rows
# The first <tr> supplies the column names; its cells may be <th> or <td>.
headers = [th.text.strip() for th in table.find("tr").find_all(["th", "td"])]
rows = [
    [td.text.strip() for td in tr.find_all("td")]
    for tr in table.find_all("tr")[1:]
]

df = pd.DataFrame(rows, columns=headers)

# STEP 4: Clean salary column
# Strip "$" and "," before converting to float. The pattern is a raw string:
# "\$" in a normal string literal is an invalid escape sequence and triggers
# a warning on modern Python versions.
df["Average Annual Salary"] = (
    df["Average Annual Salary"].replace(r'[\$,]', '', regex=True).astype(float)
)

# STEP 5: Sort data (highest salary first)
df_sorted = df.sort_values(by="Average Annual Salary", ascending=False)

# STEP 6: Create chart
plt.figure(figsize=(12, 6))
# One colour per bar, sampled evenly across the plasma colormap.
colors = plt.cm.plasma(np.linspace(0, 1, len(df_sorted)))
bars = plt.bar(df_sorted["Language"], df_sorted["Average Annual Salary"], color=colors)

# Add value labels, centred horizontally and placed 2000 units above each bar
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, height + 2000, f"${int(height):,}",
             ha='center', va='bottom', fontsize=9, fontweight='bold')

plt.title("Popular Programming Languages by Average Annual Salary", fontsize=14, weight='bold')
plt.xlabel("Programming Language")
plt.ylabel("Average Annual Salary (USD)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# STEP 7: Save chart (before plt.show())
plt.savefig("popular_languages_salary_chart.png", dpi=300)
plt.show()


print(soup.prettify()[:500])  # Preview first 500 characters of parsed HTML

print(f"Number of tables found: {len(tables)}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Each language paired with the person or organisation listed as its creator.
data = {
    "Language": [
        "Python", "Java", "R", "Javascript", "Swift",
        "C++", "C#", "PHP", "SQL", "Go",
    ],
    "Created By": [
        "Guido van Rossum",
        "James Gosling",
        "Robert Gentleman, Ross Ihaka",
        "Netscape",
        "Apple",
        "Bjarne Stroustrup",
        "Microsoft",
        "Rasmus Lerdorf",
        "Donald D. Chamberlin, Raymond F. Boyce.",
        "Robert Griesemer, Ken Thompson, Rob Pike.",
    ],
}
df = pd.DataFrame(data)

# All bars share the same height (1.0); they only anchor the creator labels.
bar_height = np.ones(len(df))

# Draw one bar per language, coloured by evenly spaced samples of Set2.
plt.figure(figsize=(12, 6))
colors = plt.cm.Set2(np.linspace(0, 1, len(df)))
bars = plt.bar(df["Language"], bar_height, color=colors)

# Write each creator vertically, centred just above its bar.
for rect, creator in zip(bars, df["Created By"]):
    x_center = rect.get_x() + rect.get_width()/2
    y_label = rect.get_height() + 0.05
    plt.text(x_center, y_label, creator,
             ha='center', va='bottom', fontsize=9, rotation=90)

# The bar height carries no meaning, so the y-axis ticks are hidden.
plt.yticks([])
plt.ylabel("Language Representation")
plt.xticks(rotation=45, ha='right')

# The title is drawn as text below the axes (axes-fraction coordinates).
title_style = {"fontsize": 14, "fontweight": 'bold', "ha": 'center'}
plt.text(
    0.5, -0.15, "Popular Programming Languages and Their Creators",
    transform=plt.gca().transAxes, **title_style
)

plt.tight_layout()

# Save the figure into the current working directory, then display it.
file_name = "popular_languages_creators_chart.png"
file_path = os.path.join(os.getcwd(), file_name)
plt.savefig(file_path, dpi=300)
plt.show()

print("✅ Chart saved at:", file_path)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

# Languages and the people or organisations listed as their authors.
data = {
    "Language": [
        "Python", "Java", "R", "Javascript", "Swift",
        "C++", "C#", "PHP", "SQL", "Go",
    ],
    "Created By": [
        "Guido van Rossum",
        "James Gosling",
        "Robert Gentleman, Ross Ihaka",
        "Netscape",
        "Apple",
        "Bjarne Stroustrup",
        "Microsoft",
        "Rasmus Lerdorf",
        "Donald D. Chamberlin, Raymond F. Boyce.",
        "Robert Griesemer, Ken Thompson, Rob Pike.",
    ],
}
df = pd.DataFrame(data)

# Uniform symbolic height of 1.0 for every bar.
bar_height = np.ones(len(df))

# One bar per language, coloured by evenly spaced samples of Set2.
plt.figure(figsize=(12, 6))
colors = plt.cm.Set2(np.linspace(0, 1, len(df)))
bars = plt.bar(df["Language"], bar_height, color=colors)

# Label every bar with its author, rotated 90 degrees and centred on the bar.
for rect, author in zip(bars, df["Created By"]):
    label_x = rect.get_x() + rect.get_width()/2
    label_y = rect.get_height() + 0.05
    plt.text(label_x, label_y, author,
             ha='center', va='bottom', fontsize=9, rotation=90)

# Hide the y-axis ticks because the bar height is symbolic.
plt.yticks([])
plt.ylabel("Language Representation")
plt.xticks(rotation=45, ha='right')

# Title drawn as text below the axes at y = -0.25 in axes-fraction coordinates.
title_kwargs = dict(fontsize=14, fontweight='bold', ha='center')
plt.text(
    0.5, -0.25,
    "Popular Programming Languages and Their Authors",
    transform=plt.gca().transAxes, **title_kwargs
)

# NOTE: a variant that saved with bbox_inches='tight' and pad_inches=1.0 was
# previously left disabled here; the plain savefig call below is the active one.

plt.tight_layout()

# Save the chart into the current working directory, then display it.
file_name = "popular_languages_creators_chart.png"
file_path = os.path.join(os.getcwd(), file_name)
plt.savefig(file_path, dpi=300)
plt.show()

print("✅ Chart saved at:", file_path)


In [None]:
import os

# Build an absolute path from the notebook's file name. A relative name is
# resolved against the current working directory, so this is the notebook's
# real location only when the notebook sits in that directory.
notebook_name = "Web-Scraping-Lab.ipynb"
file_path = os.path.abspath(notebook_name)
print("The notebook is located at:", file_path)

In [None]:
import nbconvert
import nbformat
import pdfkit

# Source notebook and destination PDF. Raw strings keep the Windows
# backslashes literal instead of treating them as escape sequences.
input_file_path = r"C:\Users\Ede\Desktop\IBM_Coursera_Data_Analyst_Projects\CapStoneProjects\module1\Web-Scraping-Lab.ipynb"
output_pdf_path = r"C:\Users\Ede\Desktop\IBM_Coursera_Data_Analyst_Projects\CapStoneProjects\module1\Web-Scraping-Lab.pdf"

# Read the notebook file as a version-4 notebook.
with open(input_file_path, 'r', encoding='utf-8') as notebook_file:
    notebook_content = nbformat.read(notebook_file, as_version=4)

# Render the notebook to HTML, keeping the code cells' input in the output.
html_exporter = nbconvert.HTMLExporter()
html_exporter.exclude_input = False
body, resources = html_exporter.from_notebook_node(notebook_content)

# Turn the rendered HTML string into a PDF file at the destination path.
pdfkit.from_string(body, output_pdf_path)

# Report where the PDF was written.
print(f"Notebook successfully converted to PDF: {output_pdf_path}")


# Congratulations to us for having successfully completed the above lab!
# Authors: 
<h4>Kelechukwu Innocent Ede and Ramesh Sannareddy</h4>

# Other Contributors:
<ul>
<li>Rav Ahuja</li>
</ul>