<a href="https://colab.research.google.com/github/JuanPicUNT/JuanPic_DTSC3020_Fall2025/blob/main/Assignment_6_WebScraping%3Cjap0706%3E_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 6 (4 points) — Web Scraping

In this assignment you will complete **two questions**. The **deadline is posted on Canvas**.


## Assignment Guide (Read Me First)

- This notebook provides an **Install Required Libraries** cell and a **Common Imports & Polite Headers** cell. Run them first.
- Each question includes a **skeleton**. The skeleton is **not** a solution; it is a lightweight scaffold you may reuse.
- Under each skeleton you will find a **“Write your answer here”** code cell. Implement your scraping, cleaning, and saving logic there.
- When your code is complete, run the **Runner** cell to print a Top‑15 preview and save the CSV.
- Expected outputs:
  - **Q1:** `data_q1.csv` + Top‑15 sorted by the specified numeric column.
  - **Q2:** `data_q2.csv` + Top‑15 sorted by `points`.


In [None]:
1) #Install Required Libraries
!pip -q install requests beautifulsoup4 lxml pandas
print("Dependencies installed.")


### 2) Common Imports & Polite Headers

In [None]:
# Common Imports & Polite Headers
import re, sys, pandas as pd, requests
from bs4 import BeautifulSoup
# Request headers shared by the scraping helpers: a desktop-browser
# User-Agent string sent with each request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0 Safari/537.36"
    ),
}
def fetch_html(url: str, timeout: int = 20) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Value forwarded to ``requests.get`` as its ``timeout``.

    Returns:
        The decoded body of the response (``response.text``).

    Raises:
        requests.HTTPError: Raised by ``raise_for_status`` when the server
            answers with an error status.
    """
    response = requests.get(url, timeout=timeout, headers=HEADERS)
    # Fail loudly on an error status instead of returning an error page.
    response.raise_for_status()
    return response.text
def flatten_headers(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the column labels of ``df`` with flat, stripped strings.

    Multi-level headers are collapsed by joining the levels of each column
    with a single space, skipping any level whose text is ``"nan"``.
    Single-level headers are converted to ``str`` and stripped.

    The frame is modified in place and the same object is returned.
    """
    columns = df.columns
    if not isinstance(columns, pd.MultiIndex):
        df.columns = [str(label).strip() for label in columns]
        return df

    flat_labels = []
    for levels in columns.values:
        parts = [str(level) for level in levels if str(level) != "nan"]
        flat_labels.append(" ".join(parts).strip())
    df.columns = flat_labels
    return df
print("Common helpers loaded.")


## Question 1 — IBAN Country Codes (table)
**URL:** https://www.iban.com/country-codes  
**Extract at least:** `Country`, `Alpha-2`, `Alpha-3`, `Numeric` (≥4 cols; you may add more)  
**Clean:** trim spaces; `Alpha-2/Alpha-3` → **UPPERCASE**; `Numeric` → **int** (nullable OK)  
**Output:** write **`data_q1.csv`** and **print a Top-15** sorted by `Numeric` (desc, no charts)  
**Deliverables:** notebook + `data_q1.csv` + short `README.md` (URL, steps, 1 limitation)

**Tip:** You can use `pandas.read_html(io.StringIO(html))` to read tables (recent pandas versions deprecate passing the raw HTML string directly) and then pick one with ≥3 columns.


In [None]:
# --- Q1 Skeleton (fill the TODOs) ---
def q1_read_table(html: str) -> pd.DataFrame:
    """Skeleton: pick the first table in ``html`` that has three or more columns.

    TODO: parse the markup with ``pd.read_html(html)``, choose a reasonable
    table, then flatten its headers before returning it.

    Raises:
        NotImplementedError: Always, until the TODO is implemented.
    """
    raise NotImplementedError("TODO: implement q1_read_table")

def q1_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Skeleton: tidy the country-code table.

    TODO: strip whitespace, upper-case Alpha-2/Alpha-3, cast Numeric to a
    nullable int, and drop invalid rows.

    Raises:
        NotImplementedError: Always, until the TODO is implemented.
    """
    raise NotImplementedError("TODO: implement q1_clean")

def q1_sort_top(df: pd.DataFrame, top: int = 15) -> pd.DataFrame:
    """Skeleton: return the ``top`` rows with the largest Numeric values.

    TODO: sort descending by Numeric and keep the first ``top`` rows.

    Raises:
        NotImplementedError: Always, until the TODO is implemented.
    """
    raise NotImplementedError("TODO: implement q1_sort_top")


NameError: name 'pd' is not defined

In [None]:
# Q1 — Write your answer here
import pandas as pd

URL = "https://www.iban.com/country-codes"

def q1_read_table(html: str) -> pd.DataFrame:
    """Return the first table with >= 3 columns from the HTML.

    Args:
        html: Raw HTML markup. For backward compatibility a URL or file path
            is also accepted and handed to pandas unchanged.

    Returns:
        The first parsed table that has at least three columns, with its
        headers exactly as pandas parsed them.

    Raises:
        ValueError: If no parsed table has at least three columns. pandas
            itself also raises ``ValueError`` when it finds no tables.
    """
    from io import StringIO

    # Recent pandas releases deprecate passing literal markup as a plain
    # string, so wrap it in a buffer. Markup always contains "<", whereas
    # URLs and file paths do not, which keeps URL callers working.
    source = StringIO(html) if "<" in html else html

    # Treat only empty cells as missing. With pandas' default NA markers the
    # literal text "NA" (Namibia's Alpha-2 code) is parsed as NaN and the row
    # is then lost when rows with missing codes are dropped.
    tables = pd.read_html(source, keep_default_na=False, na_values=[""])

    for table in tables:
        if table.shape[1] >= 3:
            return table
    raise ValueError("No table found with >=3 columns")

def q1_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the country-code table and drop incomplete rows.

    The first four columns are renamed by position to ``Country``,
    ``Alpha2``, ``Alpha3`` and ``Numeric``; any further columns are kept
    unchanged. Text columns are stripped, the two alpha codes are
    upper-cased, and ``Numeric`` is cast to nullable ``Int64`` (values that
    cannot be parsed become missing).

    Args:
        df: Table whose first four columns are country name, Alpha-2 code,
            Alpha-3 code and numeric code, in that order.

    Returns:
        A new DataFrame (the input is not modified) containing only rows in
        which all four fields are present and the text fields are non-blank.

    Raises:
        IndexError: If ``df`` has fewer than four columns.
    """
    df = df.copy()

    # Rename by position so the exact header wording on the site
    # (e.g. "Alpha-2 code") does not matter.
    new_names = ("Country", "Alpha2", "Alpha3", "Numeric")
    df = df.rename(
        columns={df.columns[pos]: name for pos, name in enumerate(new_names)}
    )

    text_cols = ["Country", "Alpha2", "Alpha3"]
    for col in text_cols:
        # Cast before using .str so a column that was parsed as non-text
        # (for example all-NaN floats) does not raise.
        cleaned = df[col].astype("string").str.strip()
        if col != "Country":
            cleaned = cleaned.str.upper()
        df[col] = cleaned.astype(object)

    # Leading zeros are lost here by design: the column becomes an integer.
    df["Numeric"] = pd.to_numeric(df["Numeric"], errors="coerce").astype("Int64")

    df = df.dropna(subset=text_cols + ["Numeric"])

    # A cell that held only whitespace is now "", which dropna does not
    # treat as missing; remove those rows explicitly.
    has_text = df[text_cols].ne("").all(axis=1)
    return df[has_text]

def q1_sort_top(df: pd.DataFrame, top: int = 15) -> pd.DataFrame:
    """Return the first ``top`` rows of ``df`` ordered by ``Numeric``, largest first."""
    ranked = df.sort_values(by='Numeric', ascending=False)
    return ranked.head(top)

if __name__ == "__main__":
    # q1_read_table hands its argument to pd.read_html, which downloads the
    # page itself when given a URL, so a single call is enough. A separate
    # pd.read_html(URL) beforehand would only fetch the page a second time
    # and throw the result away.
    df = q1_read_table(URL)
    df_clean = q1_clean(df)
    top15 = q1_sort_top(df_clean, top=15)
    print(top15)
    # The CSV holds the full cleaned table; only the preview is limited to 15 rows.
    df_clean.to_csv("data_q1.csv", index=False)




                                               Country Alpha2 Alpha3  Numeric
247                                             Zambia     ZM    ZMB      894
246                                              Yemen     YE    YEM      887
192                                              Samoa     WS    WSM      882
244                                  Wallis and Futuna     WF    WLF      876
240                 Venezuela (Bolivarian Republic of)     VE    VEN      862
238                                         Uzbekistan     UZ    UZB      860
237                                            Uruguay     UY    URY      858
35                                        Burkina Faso     BF    BFA      854
243                              Virgin Islands (U.S.)     VI    VIR      850
236                     United States of America (the)     US    USA      840
219                       Tanzania, United Republic of     TZ    TZA      834
108                                        Isle of Man     IM   

## Question 2 — Hacker News (front page)
**URL:** https://news.ycombinator.com/  
**Extract at least:** `rank`, `title`, `link`, `points`, `comments` (user optional)  
**Clean:** cast `points`/`comments`/`rank` → **int** (non-digits → 0), fill missing text fields  
**Output:** write **`data_q2.csv`** and **print a Top-15** sorted by `points` (desc, no charts)  
**Tip:** Each story is a `.athing` row; details (points/comments/user) are in the next `<tr>` with `.subtext`.


In [None]:
# --- Q2 Skeleton (fill the TODOs) ---
def q2_parse_items(html: str) -> pd.DataFrame:
    """Skeleton: turn the front-page markup into one row per story.

    Expected columns: rank, title, link, points, comments, user (optional).

    TODO: implement with BeautifulSoup, reading each '.athing' row together
    with its sibling '.subtext'.

    Raises:
        NotImplementedError: Always, until the TODO is implemented.
    """
    raise NotImplementedError("TODO: implement q2_parse_items")

def q2_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Skeleton: normalise numeric and text fields of the parsed stories.

    TODO: cast points/comments/rank to int (non-digits -> 0) and fill the
    missing text fields.

    Raises:
        NotImplementedError: Always, until the TODO is implemented.
    """
    raise NotImplementedError("TODO: implement q2_clean")

def q2_sort_top(df: pd.DataFrame, top: int = 15) -> pd.DataFrame:
    """Skeleton: return the ``top`` stories with the most points.

    TODO: sort by points descending and keep the first ``top`` rows.

    Raises:
        NotImplementedError: Always, until the TODO is implemented.
    """
    raise NotImplementedError("TODO: implement q2_sort_top")


In [None]:
# Q2 — Write your answer here
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

def q2_parse_items(html: str) -> pd.DataFrame:
    """Parse front page items into a DataFrame, one row per story.

    Each story is a ``tr.athing`` row; its points, user and comment count
    are read from the ``.subtext`` cell of the following ``<tr>``.

    Args:
        html: Markup of the front page.

    Returns:
        DataFrame with columns ``rank``, ``title``, ``link``, ``points``,
        ``comments`` and ``user``. Numeric fields are returned as scraped
        (text such as ``"120"``, or ``0`` when absent); casting is left to
        the cleaning step. The columns are present even when no story rows
        are found.
    """
    columns = ['rank', 'title', 'link', 'points', 'comments', 'user']
    soup = BeautifulSoup(html, 'html.parser')

    records = []
    for row in soup.select('tr.athing'):
        rank_tag = row.select_one('.rank')
        title_tag = row.select_one('.titleline a')

        # The metadata row may be absent (e.g. truncated markup); keep the
        # defaults for that story instead of failing on None.
        subtext_row = row.find_next_sibling('tr')
        subtext = (
            subtext_row.select_one('.subtext') if subtext_row is not None else None
        )

        # Rank is rendered like "1." -- drop the trailing dot.
        rank = rank_tag.text.strip().strip('.') if rank_tag is not None else '0'
        title = title_tag.text.strip() if title_tag is not None else ''
        link = title_tag.get('href', '') if title_tag is not None else ''

        points = 0
        comments = 0
        user = ''

        if subtext is not None:
            points_tag = subtext.select_one('.score')
            user_tag = subtext.select_one('.hnuser')
            anchors = subtext.find_all('a')
            # The comment link, when there is one, is the last anchor.
            comments_tag = anchors[-1] if anchors else None

            points = (
                points_tag.text.replace(' points', '')
                if points_tag is not None
                else '0'
            )
            user = user_tag.text if user_tag is not None else ''
            # Other link texts in that position (no "comment" in them) leave
            # the count at 0.
            if comments_tag is not None and 'comment' in comments_tag.text:
                comments = (
                    comments_tag.text
                    .replace('\xa0comments', '')
                    .replace(' comments', '')
                    .replace(' comment', '')
                )

        records.append({
            'rank': rank,
            'title': title,
            'link': link,
            'points': points,
            'comments': comments,
            'user': user,
        })

    # Passing the column list keeps the schema stable for an empty page, so
    # downstream column lookups do not raise KeyError.
    return pd.DataFrame(records, columns=columns)

def _coerce_int(value) -> int:
    """Best-effort integer from one scraped cell; 0 when it holds no digits."""
    if isinstance(value, float):
        # Real floats (including NaN from missing cells) must not go through
        # str(): "12.0" would lose its dot and turn into 120.
        finite = value == value and value not in (float('inf'), float('-inf'))
        return int(value) if finite else 0
    digits = re.sub(r'\D', '', str(value))
    return int(digits) if digits else 0


def q2_clean(df: pd.DataFrame) -> pd.DataFrame:
    """Clean numeric fields and fill missing values.

    ``rank``, ``points`` and ``comments`` are converted to ``int``: the
    digits found in each cell are kept and cells without digits become 0.
    ``title``, ``link`` and ``user`` have missing values replaced by ``''``
    and are cast to ``str``.

    Args:
        df: Parsed stories with the six columns named above.

    Returns:
        A cleaned copy; the input DataFrame is left unmodified.
    """
    df = df.copy()

    for col in ['rank', 'points', 'comments']:
        df[col] = df[col].map(_coerce_int).astype(int)

    for col in ['title', 'link', 'user']:
        df[col] = df[col].fillna('').astype(str)

    return df

def q2_sort_top(df: pd.DataFrame, top: int = 15) -> pd.DataFrame:
    """Return the first ``top`` rows of ``df`` ordered by ``points``, highest first."""
    ranked = df.sort_values(by='points', ascending=False)
    return ranked.head(top)

# --- Run everything together ---
if __name__ == '__main__':
    url = "https://news.ycombinator.com/"
    # Use the notebook's shared helper rather than a bare requests.get: it
    # sends the common User-Agent header, applies a timeout, and raises on an
    # HTTP error status so an error page is never parsed as the front page.
    html = fetch_html(url)

    df_raw = q2_parse_items(html)
    df_clean = q2_clean(df_raw)
    df_top15 = q2_sort_top(df_clean, top=15)

    # The CSV holds every parsed story; only the printed preview is Top-15.
    df_clean.to_csv('data_q2.csv', index=False)
    print(df_top15[['rank', 'title', 'link', 'points', 'comments']])


