# **INFO5731 Assignment 2**

In this assignment, you will work on gathering text data from an open data source via web scraping or API. Following this, you will need to clean the text data and perform syntactic analysis on the data. Follow the instructions carefully and design well-structured Python programs to address each question.

**Expectations**:
*   Use the provided .*ipynb* document to write your code & respond to the questions. Avoid generating a new file.
*   Write complete answers and run all the cells before submission.
*   Make sure the submission is "clean"; *i.e.*, no unnecessary code cells.
*   Once finished, allow shared rights from top right corner (*see Canvas for details*).

* **Make sure to submit the cleaned data CSV in the comment section - 10 points**

**Total points**: 100

**Deadline**: Monday, at 11:59 PM.

**Late Submission will have a penalty of 10% reduction for each day after the deadline.**

**Please check that the link you submitted can be opened and points to the correct assignment.**


# Question 1 (25 points)

Write a python program to collect text data from **either of the following sources** and save the data into a **csv file:**

(1) Collect all the customer reviews of a product (you can choose any porduct) on amazon. [atleast 1000 reviews]

(2) Collect the top 1000 User Reviews of a movie recently in 2023 or 2024 (you can choose any movie) from IMDB. [If one movie doesn't have sufficient reviews, collect reviews of atleast 2 or 3 movies]


(3) Collect the **abstracts** of the top 10000 research papers by using the query "machine learning", "data science", "artifical intelligence", or "information extraction" from Semantic Scholar.

(4) Collect all the information of the 904 narrators in the Densho Digital Repository.

(5)**Collect a total of 10000 reviews** of the top 100 most popular software from G2 and Capterra.


In [None]:
# Q1: Collect 10k Semantic Scholar abstracts (auto-resume, polite, upgraded)
!pip -q install pandas tqdm

import os, time, random, requests, pandas as pd
from tqdm.auto import tqdm

# ---- config ----
QUERIES = [
    "machine learning", "data science", "artificial intelligence", "information extraction",
    "computer vision", "deep learning", "reinforcement learning",
    "natural language processing", "graph neural networks", "transformers",
    "big data", "data mining", "neural networks", "predictive analytics",
    "knowledge graphs", "speech recognition", "recommendation systems"
]

TOTAL_TARGET = 10000
PER_QUERY_TARGET = TOTAL_TARGET // len(QUERIES)   # ≈ 2500 each
LIMIT = 100                                        # API max
OUT_CSV = "Bhamore_Yash_s2_raw.csv"
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

BASE = "https://api.semanticscholar.org/graph/v1/paper/search"
FIELDS = ",".join([
    "paperId","title","abstract","year","venue","publicationTypes",
    "authors","citationCount","url","externalIds","isOpenAccess"
])


S2_API_KEY = ""
HEADERS = {"User-Agent": "UNT-INFO5731-YB/1.0"}
if S2_API_KEY:
    HEADERS["x-api-key"] = S2_API_KEY

# ---- load existing (for resume) ----
if os.path.exists(OUT_CSV):
    df_all = pd.read_csv(OUT_CSV)
    print(f"Resuming: found {len(df_all)} existing rows")
else:
    df_all = pd.DataFrame()

def dedup(df):
    # prefer paperId when present, fallback to (title,year)
    if "paperId" in df.columns:
        df = df.drop_duplicates(subset=["paperId"]).copy()
    df = df.drop_duplicates(subset=["title","year"]).reset_index(drop=True)
    return df

def fetch_page(query, offset):
    # widen supply using year range (more results)
    params = {
        "query": f"{query} year:2000-",
        "fields": FIELDS,
        "limit": LIMIT,
        "offset": offset
    }
    r = requests.get(BASE, params=params, headers=HEADERS, timeout=30)
    if r.status_code in (429, 502, 503, 504):
        # stronger backoff on rate limits/transient errors
        wait = random.uniform(8, 12)
        time.sleep(wait)
        r = requests.get(BASE, params=params, headers=HEADERS, timeout=30)
    r.raise_for_status()
    return r.json().get("data", [])

# ---- collect per query ----
for q in QUERIES:
    have_q = 0 if df_all.empty else (df_all["topic"].eq(q).sum() if "topic" in df_all.columns else 0)
    target_q = PER_QUERY_TARGET
    if have_q >= target_q:
        print(f"Skip '{q}': already have {have_q} rows")
        continue

    print(f"\nQuery: {q} (need ~{target_q - have_q} more)")
    offset, pbar = 0, tqdm(total=max(1, target_q - have_q), unit="papers")
    consecutive_empty = 0

    while have_q < target_q:
        try:
            data = fetch_page(q, offset)
        except requests.HTTPError as e:
            print(f"  HTTP error at offset {offset}: {e}")
            break

        if not data:
            consecutive_empty += 1
            if consecutive_empty >= 3:  # likely out of results for this query
                break
            time.sleep(3.0)
            continue
        consecutive_empty = 0

        rows = []
        for it in data:
            rows.append({
                "topic": q,
                "paperId": it.get("paperId"),
                "title": it.get("title"),
                "abstract": it.get("abstract"),
                "year": it.get("year"),
                "venue": it.get("venue"),
                "publicationTypes": ";".join(it.get("publicationTypes") or []),
                "authors": ";".join([a.get("name","") for a in (it.get("authors") or [])]),
                "citationCount": it.get("citationCount"),
                "paper_url": it.get("url"),
                "externalIds": str(it.get("externalIds")),
                "isOpenAccess": it.get("isOpenAccess"),
            })

        batch = pd.DataFrame(rows)
        df_all = pd.concat([df_all, batch], ignore_index=True)
        df_all = dedup(df_all)

        new_count = df_all["topic"].eq(q).sum()
        pbar.update(max(0, new_count - have_q))
        have_q = new_count

        # incremental save
        df_all.to_csv(OUT_CSV, index=False)

        offset += LIMIT
        # slower pacing to avoid 429
        time.sleep(random.uniform(3.0, 5.0))

    pbar.close()
    print(f"  Collected for '{q}': {have_q}")

df_all = dedup(df_all)
df_all.to_csv(OUT_CSV, index=False)
print(f"\nSaved {OUT_CSV} with {len(df_all)} total rows")
df_all.head(3)


Resuming: found 8952 existing rows
Skip 'machine learning': already have 1954 rows
Skip 'data science': already have 1263 rows
Skip 'artificial intelligence': already have 662 rows
Skip 'information extraction': already have 591 rows

Query: computer vision (need ~335 more)


  0%|          | 0/335 [00:00<?, ?papers/s]

  Collected for 'computer vision': 610

Query: deep learning (need ~522 more)


  0%|          | 0/522 [00:00<?, ?papers/s]

  HTTP error at offset 500: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=deep+learning+year%3A2000-&fields=paperId%2Ctitle%2Cabstract%2Cyear%2Cvenue%2CpublicationTypes%2Cauthors%2CcitationCount%2Curl%2CexternalIds%2CisOpenAccess&limit=100&offset=500
  Collected for 'deep learning': 330
Skip 'reinforcement learning': already have 831 rows
Skip 'natural language processing': already have 938 rows
Skip 'graph neural networks': already have 589 rows
Skip 'transformers': already have 832 rows

Query: big data (need ~588 more)


  0%|          | 0/588 [00:00<?, ?papers/s]

  Collected for 'big data': 592

Query: data mining (need ~515 more)


  0%|          | 0/515 [00:00<?, ?papers/s]

  HTTP error at offset 400: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=data+mining+year%3A2000-&fields=paperId%2Ctitle%2Cabstract%2Cyear%2Cvenue%2CpublicationTypes%2Cauthors%2CcitationCount%2Curl%2CexternalIds%2CisOpenAccess&limit=100&offset=400
  Collected for 'data mining': 303
Skip 'neural networks': already have 591 rows

Query: predictive analytics (need ~431 more)


  0%|          | 0/431 [00:00<?, ?papers/s]

  HTTP error at offset 100: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=predictive+analytics+year%3A2000-&fields=paperId%2Ctitle%2Cabstract%2Cyear%2Cvenue%2CpublicationTypes%2Cauthors%2CcitationCount%2Curl%2CexternalIds%2CisOpenAccess&limit=100&offset=100
  Collected for 'predictive analytics': 158

Query: knowledge graphs (need ~513 more)


  0%|          | 0/513 [00:00<?, ?papers/s]

  HTTP error at offset 0: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=knowledge+graphs+year%3A2000-&fields=paperId%2Ctitle%2Cabstract%2Cyear%2Cvenue%2CpublicationTypes%2Cauthors%2CcitationCount%2Curl%2CexternalIds%2CisOpenAccess&limit=100&offset=0
  Collected for 'knowledge graphs': 75

Query: speech recognition (need ~588 more)


  0%|          | 0/588 [00:00<?, ?papers/s]

  HTTP error at offset 100: 429 Client Error:  for url: https://api.semanticscholar.org/graph/v1/paper/search?query=speech+recognition+year%3A2000-&fields=paperId%2Ctitle%2Cabstract%2Cyear%2Cvenue%2CpublicationTypes%2Cauthors%2CcitationCount%2Curl%2CexternalIds%2CisOpenAccess&limit=100&offset=100
  Collected for 'speech recognition': 80

Query: recommendation systems (need ~511 more)


  0%|          | 0/511 [00:00<?, ?papers/s]

  Collected for 'recommendation systems': 628

Saved Bhamore_Yash_s2_raw.csv with 11027 total rows


Unnamed: 0,topic,paperId,title,abstract,year,venue,publicationTypes,authors,citationCount,paper_url,externalIds,isOpenAccess
0,machine learning,f9c602cc436a9ea2f9e7db48c77d924e09ce3c32,Fashion-MNIST: a Novel Image Dataset for Bench...,"We present Fashion-MNIST, a new dataset compri...",2017.0,arXiv.org,JournalArticle,Han Xiao;Kashif Rasul;Roland Vollgraf,9293,https://www.semanticscholar.org/paper/f9c602cc...,"{'MAG': '2750384547', 'DBLP': 'journals/corr/a...",False
1,machine learning,53c9f3c34d8481adaf24df3b25581ccf1bc53f5c,Physics-informed machine learning,,2021.0,Nature Reviews Physics,Review,G. Karniadakis;I. Kevrekidis;Lu Lu;P. Perdikar...,4312,https://www.semanticscholar.org/paper/53c9f3c3...,"{'MAG': '3163993681', 'DOI': '10.1038/s42254-0...",True
2,machine learning,9c9d7247f8c51ec5a02b0d911d1d7b9e8160495d,TensorFlow: Large-Scale Machine Learning on He...,TensorFlow is an interface for expressing mach...,2016.0,arXiv.org,JournalArticle,Martín Abadi;Ashish Agarwal;P. Barham;E. Brevd...,11254,https://www.semanticscholar.org/paper/9c9d7247...,"{'DBLP': 'journals/corr/AbadiABBCCCDDDG16', 'M...",False


In [None]:
import pandas as pd

df = pd.read_csv("/content/Bhamore_Yash_s2_raw.csv")
print("Total rows collected:", len(df))


Total rows collected: 11027


In [None]:
import pandas as pd
df = pd.read_csv("/content/Bhamore_Yash_s2_raw.csv")
print("Total rows:", len(df))
print(df["topic"].value_counts())


Total rows: 11027
topic
machine learning               1954
data science                   1263
natural language processing     938
transformers                    832
reinforcement learning          831
artificial intelligence         662
recommendation systems          628
computer vision                 610
big data                        592
neural networks                 591
information extraction          591
graph neural networks           589
deep learning                   330
data mining                     303
predictive analytics            158
speech recognition               80
knowledge graphs                 75
Name: count, dtype: int64


# Question 2 (15 points)

Write a python program to **clean the text data** you collected in the previous question and save the clean data in a new column in the csv file. The data cleaning steps include: [Code and output is required for each part]

(1) Remove noise, such as special characters and punctuations.

(2) Remove numbers.

(3) Remove stopwords by using the stopwords list.

(4) Lowercase all texts

(5) Stemming.

(6) Lemmatization.

In [3]:
# Q2 — Setup + load
!pip -q install nltk unidecode emoji

import re, pandas as pd, nltk, emoji
from unidecode import unidecode
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# one-time downloads (include punkt_tab to avoid the common error)
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)

RAW_CSV = "/content/drive/MyDrive/Resaerchpaper-Abstraction.csv"
OUT_CSV = "/content/Bhamore_Yash_s2_clean_steps.csv"

df = pd.read_csv(RAW_CSV)

# keep only valid abstracts
df = df.dropna(subset=["abstract"])
df = df[df["abstract"].astype(str).str.strip().ne("")].copy()

print("Rows after dropping empty abstracts:", len(df))
df[["topic","year","abstract"]].head(3)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/235.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m184.3/235.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hRows after dropping empty abstracts: 4443


Unnamed: 0,topic,year,abstract
0,machine learning,2017.0,"We present Fashion-MNIST, a new dataset compri..."
1,machine learning,2016.0,TensorFlow is an interface for expressing mach...
2,machine learning,2019.0,With the widespread use of artificial intellig...


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Q2 — Step-by-step cleaning (columns for each required part)

# (0) raw text (for comparison)
df["step0_raw"] = df["abstract"].astype(str)

# (4) lowercase (do early but keep as its own step)
df["step4_lower"] = df["step0_raw"].str.lower().map(unidecode)

# helpers
def remove_noise(t: str) -> str:
    t = re.sub(r"http\S+|www\.\S+", " ", t)   # URLs
    t = re.sub(r"<.*?>", " ", t)              # HTML tags
    t = emoji.replace_emoji(t, replace="")    # emojis
    t = re.sub(r"[^\w\s]", " ", t)            # punctuation/special chars
    t = re.sub(r"\s+", " ", t).strip()
    return t

stop = set(stopwords.words("english"))
ps   = PorterStemmer()
lem  = WordNetLemmatizer()

def remove_stopwords(t: str) -> str:
    toks = nltk.word_tokenize(t)
    toks = [w for w in toks if w not in stop]
    return " ".join(toks)

def stem_text(t: str) -> str:
    return " ".join(ps.stem(w) for w in nltk.word_tokenize(t))

def lemmatize_text(t: str) -> str:
    return " ".join(lem.lemmatize(w) for w in nltk.word_tokenize(t))

# (1) remove noise (special chars & punctuation)
df["step1_no_noise"] = df["step4_lower"].apply(remove_noise)

# (2) remove numbers
df["step2_no_numbers"] = (
    df["step1_no_noise"]
    .str.replace(r"\d+", " ", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

# (3) remove stopwords
df["step3_no_stopwords"] = df["step2_no_numbers"].apply(remove_stopwords)

# (5) stemming
df["step5_stemmed"] = df["step3_no_stopwords"].apply(stem_text)

# (6) lemmatization
df["step6_lemmatized"] = df["step3_no_stopwords"].apply(lemmatize_text)

# quick check + preview
required_cols = [
    "step1_no_noise","step2_no_numbers","step3_no_stopwords",
    "step4_lower","step5_stemmed","step6_lemmatized"
]
missing = [c for c in required_cols if c not in df.columns]
print(" All steps present!" if not missing else f" Missing: {missing}")

print("\nPreview (raw → each step):")
print(df[["step0_raw"] + required_cols].head(3))

# save
df.to_csv(OUT_CSV, index=False)
print(f"\nSaved cleaned file with steps: {OUT_CSV}  (rows={len(df)})")


 All steps present!

Preview (raw → each step):
                                           step0_raw  \
0  We present Fashion-MNIST, a new dataset compri...   
1  TensorFlow is an interface for expressing mach...   
2  With the widespread use of artificial intellig...   

                                      step1_no_noise  \
0  we present fashion mnist a new dataset compris...   
1  tensorflow is an interface for expressing mach...   
2  with the widespread use of artificial intellig...   

                                    step2_no_numbers  \
0  we present fashion mnist a new dataset compris...   
1  tensorflow is an interface for expressing mach...   
2  with the widespread use of artificial intellig...   

                                  step3_no_stopwords  \
0  present fashion mnist new dataset comprising x...   
1  tensorflow interface expressing machine learni...   
2  widespread use artificial intelligence ai syst...   

                                         step4_lower

# Question 3 (15 points)

Write a python program to **conduct syntax and structure analysis of the clean text** you just saved above. The syntax and structure analysis includes:

(1) **Parts of Speech (POS) Tagging:** Tag Parts of Speech of each word in the text, and calculate the total number of N(oun), V(erb), Adj(ective), Adv(erb), respectively.

(2) **Constituency Parsing and Dependency Parsing:** print out the constituency parsing trees and dependency parsing trees of all the sentences. Using one sentence as an example to explain your understanding about the constituency parsing tree and dependency parsing tree.

(3) **Named Entity Recognition:** Extract all the entities such as person names, organizations, locations, product names, and date from the clean texts, calculate the count of each entity.

In [5]:
# Q3 — setup (POS + dependency + NER)
!pip -q install spacy pandas
!python -m spacy download en_core_web_sm

import pandas as pd, spacy, itertools
from collections import defaultdict, Counter

CSV = "/content/Bhamore_Yash_s2_clean_steps.csv"
texts = pd.read_csv(CSV)["abstract"].dropna().astype(str)

nlp = spacy.load("en_core_web_sm")  # has POS, dependency, NER
SAMPLE_SIZE = 200                   # keep small for speed/stability
docs = list(nlp.pipe(texts.head(SAMPLE_SIZE), batch_size=20))  # single-process
sents = list(itertools.chain.from_iterable(doc.sents for doc in docs))
print(f"Analyzed {len(docs)} abstracts, {len(sents)} sentences.")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m94.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Analyzed 200 abstracts, 1488 sentences.


In [6]:
pos_map = {"NOUN":"Noun","PROPN":"Noun","VERB":"Verb","AUX":"Verb",
           "ADJ":"Adjective","ADV":"Adverb"}
totals = defaultdict(int)

for doc in docs:
    for t in doc:
        if t.pos_ in pos_map:
            totals[pos_map[t.pos_]] += 1

print("POS totals (subset):")
for k in ["Noun","Verb","Adjective","Adverb"]:
    print(f"{k}: {totals[k]}")


POS totals (subset):
Noun: 12364
Verb: 6030
Adjective: 3895
Adverb: 1181


In [7]:
MAX_SHOW = 5  # print just a few
def dep_str(sent):
    # token -> head/dep label
    return " ".join(f"{t.text}→{t.head.text}/{t.dep_}" for t in sent)

for i, sent in enumerate(sents[:MAX_SHOW], 1):
    print(f"\nSentence {i}: {sent.text}")
    print("Dependency:", dep_str(sent))

# 2–3 lines of explanation for your report
if sents:
    print("\nExplanation:")
    print("- Dependency parse shows head–modifier relations; each word points to its syntactic head with labels like nsubj (subject), dobj (object), amod (adj modifier).")
    print("- The root verb of the sentence gets the label ROOT; other words attach to it directly or indirectly.")



Sentence 1: We present Fashion-MNIST, a new dataset comprising of 28x28 grayscale images of 70,000 fashion products from 10 categories, with 7,000 images per category.
Dependency: We→present/nsubj present→present/ROOT Fashion→MNIST/compound -→MNIST/punct MNIST→present/dobj ,→MNIST/punct a→comprising/det new→comprising/amod dataset→comprising/compound comprising→MNIST/appos of→comprising/prep 28x28→images/nummod grayscale→images/compound images→of/pobj of→images/prep 70,000→products/nummod fashion→products/compound products→of/pobj from→comprising/prep 10→categories/nummod categories→from/pobj ,→MNIST/punct with→present/prep 7,000→images/nummod images→with/pobj per→images/prep category→per/pobj .→present/punct

Sentence 2: The training set has 60,000 images and the test set has 10,000 images.
Dependency: The→set/det training→set/compound set→has/nsubj has→has/ROOT 60,000→images/nummod images→has/dobj and→has/cc the→set/det test→set/compound set→has/nsubj has→has/conj 10,000→images/numm

In [8]:
wanted = {"PERSON":"PERSON","ORG":"ORG","GPE":"LOCATION","LOC":"LOCATION",
          "PRODUCT":"PRODUCT","DATE":"DATE"}
counts = Counter()

for doc in docs:
    for ent in doc.ents:
        if ent.label_ in wanted:
            counts[wanted[ent.label_]] += 1

print("NER counts (subset):")
for lab in ["PERSON","ORG","LOCATION","PRODUCT","DATE"]:
    print(f"{lab}: {counts[lab]}")


NER counts (subset):
PERSON: 46
ORG: 415
LOCATION: 37
PRODUCT: 11
DATE: 73


In [9]:
# Q3 — Constituency (benepar) ONLY on 1 sentence so it's stable
!pip -q install benepar
import benepar

# download once per runtime (small pause); skip on rerun if already present
try:
    benepar.download('benepar_en3')
except Exception:
    pass

# re-load a tiny pipeline and attach benepar
nlp_const = spacy.load("en_core_web_sm")
if "benepar" not in nlp_const.pipe_names:
    nlp_const.add_pipe("benepar", config={"model": "benepar_en3"})

example_text = texts.iloc[0] if len(texts) else "Machine learning models achieve strong results on benchmark datasets."
doc_ex = nlp_const(example_text)
sent_ex = list(doc_ex.sents)[0]

print("Example sentence:")
print(sent_ex.text)
print("\nConstituency tree:")
print(sent_ex._.parse_string)

print("\nHow to read this:")
print("- Constituency groups words into phrases like NP (noun phrase), VP (verb phrase), PP (prep phrase).")
print("- Leaves are words; internal nodes are phrases; the tree shows how phrases combine to form the sentence.")


  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for benepar (setup.py) ... [?25l[?25hdone


[nltk_data] Downloading package benepar_en3 to /root/nltk_data...
[nltk_data]   Unzipping models/benepar_en3.zip.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Example sentence:
We present Fashion-MNIST, a new dataset comprising of 28x28 grayscale images of 70,000 fashion products from 10 categories, with 7,000 images per category.

Constituency tree:
(S (NP (PRP We)) (VP (VBP present) (NP (NP (NNP Fashion) (HYPH -) (NNP MNIST)) (, ,) (NP (NP (DT a) (JJ new) (NN dataset)) (VP (VBG comprising) (PP (IN of) (NP (NP (NP (CD 28x28) (JJ grayscale) (NNS images)) (PP (IN of) (NP (NP (CD 70,000) (NN fashion) (NNS products)) (PP (IN from) (NP (CD 10) (NNS categories)))))) (, ,) (PP (IN with) (NP (NP (CD 7,000) (NNS images)) (PP (IN per) (NP (NN category))))))))))) (. .))

How to read this:
- Constituency groups words into phrases like NP (noun phrase), VP (verb phrase), PP (prep phrase).
- Leaves are words; internal nodes are phrases; the tree shows how phrases combine to form the sentence.




# **Following Questions must answer using AI assitance**

#Question 4 (20 points).

Q4. (PART-1)
Web scraping data from the GitHub Marketplace to gather details about popular actions. Using Python, the process begins by sending HTTP requests to multiple pages of the marketplace (1000 products), handling pagination through dynamic page numbers. The key details extracted include the product name, a short description, and the URL.

 The extracted data is stored in a structured CSV format with columns for product name, description, URL, and page number. A time delay is introduced between requests to avoid server overload. ChatGPT can assist by helping with the parsing of HTML, error handling, and generating reports based on the data collected.

 The goal is to complete the scraping within a specified time limit, ensuring that the process is efficient and adheres to GitHub’s usage guidelines.

(PART -2)

1.   **Preprocess Data**: Clean the text by tokenizing, removing stopwords, and converting to lowercase.

2. Perform **Data Quality** operations.


Preprocessing:
Preprocessing involves cleaning the text by removing noise such as special characters, HTML tags, and unnecessary whitespace. It also includes tasks like tokenization, stopword removal, and lemmatization to standardize the text for analysis.

Data Quality:
Data quality checks ensure completeness, consistency, and accuracy by verifying that all required columns are filled and formatted correctly. Additionally, it involves identifying and removing duplicates, handling missing values, and ensuring the data reflects the true content accurately.


Github MarketPlace page:
https://github.com/marketplace?type=actions

In [3]:
# --- Install once per runtime (already done) ---\
!wget -q https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb
!apt -qq install -y ./google-chrome-stable_current_amd64.deb > /dev/null
!pip -q install selenium==4.12.0 webdriver-manager beautifulsoup4 lxml pandas

# --- Scrape GitHub Marketplace: Actions ---
import time, random, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

MAX_ITEMS      = 1000    # stop when we reach this
MAX_PAGES      = 200     # safety cap
SLEEP_RANGE    = (1.0, 2.0)
TIME_LIMIT_MIN = 12
OUT_CSV        = "q4_github_actions_raw.csv"

# Start headless Chrome
opts = Options()
opts.add_argument("--headless=new")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--window-size=1920,1080")
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)

def parse_page(driver, page_num):
    # **REVISED Selector:** Targeting the main list items more specifically
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.Box-row"))
        )
    except:
        return []

    soup = BeautifulSoup(driver.page_source, "lxml")
    rows = []
    # **REVISED Parsing Logic**
    for li in soup.select("li.Box-row"):
        # The link containing the name and URL is inside the h3 element
        h3 = li.select_one("h3 a")
        if not h3:
            continue

        href = h3.get("href", "")
        url  = "https://github.com" + href if href.startswith("/") else href
        name = h3.get_text(" ", strip=True)

        # The description is usually in a <p> tag right after the h3
        desc_p = li.select_one("p.color-fg-muted")
        desc = desc_p.get_text(" ", strip=True) if desc_p else None

        if name and url:
            rows.append({"product_name": name, "description": desc, "url": url, "page": page_num})

    return rows

items, page = [], 1
start = datetime.utcnow()

print(f"Starting scrape for up to {MAX_ITEMS} items...")
try:
    while len(items) < MAX_ITEMS and page <= MAX_PAGES:
        # Check time limit within the loop
        if datetime.utcnow() - start > timedelta(minutes=TIME_LIMIT_MIN):
            print(f"⏰ Time limit hit ({TIME_LIMIT_MIN} min). Stopping.")
            break

        url = f"https://github.com/marketplace?type=actions&page={page}"
        driver.get(url)

        rows = parse_page(driver, page)

        if not rows:
            print(f"Page {page}: no cards found (End of results or Selector Error).")
            break

        items.extend(rows)
        print(f"Page {page}: +{len(rows)} (total {len(items)})")

        # Delay to be polite
        time.sleep(random.uniform(*SLEEP_RANGE))

        page += 1
finally:
    driver.quit()

df_raw = pd.DataFrame(items[:MAX_ITEMS])
df_raw.to_csv(OUT_CSV, index=False)
print(f"\nSaved {OUT_CSV} with {len(df_raw)} rows")
print(df_raw.head())



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m59.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.7/512.7 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25h

  start = datetime.utcnow()
  if datetime.utcnow() - start > timedelta(minutes=TIME_LIMIT_MIN):


Starting scrape for up to 1000 items...
Page 1: no cards found (End of results or Selector Error).

Saved q4_github_actions_raw.csv with 0 rows
Empty DataFrame
Columns: []
Index: []


In [4]:
# --- Scrape GitHub Marketplace: Actions (Attempt 3: Stronger Selectors) ---
import time, random, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta, timezone

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

MAX_ITEMS      = 1000    # stop when we reach this
MAX_PAGES      = 200     # safety cap
SLEEP_RANGE    = (1.0, 2.0)
TIME_LIMIT_MIN = 12
OUT_CSV        = "q4_github_actions_raw.csv"

# Start headless Chrome
opts = Options()
opts.add_argument("--headless=new")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--window-size=1920,1080")
# Note: Using try/except for ChromeDriverManager install sometimes improves stability
try:
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
except Exception as e:
    print(f"Error initializing WebDriver: {e}. Please ensure Chrome/Chromium is properly installed.")
    raise

def parse_page(driver, page_num):
    # **NEW, STRONGER Selector:** Targeting the grid/list item containers
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.col-12.d-md-block"))
        )
    except Exception as e:
        print(f"WebDriverWait failed: {e}")
        return []

    soup = BeautifulSoup(driver.page_source, "lxml")
    rows = []

    # Target elements that look like a list row or grid item
    # We will try 'div.col-12.d-md-block' which wraps each action card.
    for card in soup.select("div.col-12.d-md-block"):

        # Action Name and URL
        # Look for the <h3> tag inside the card, which should contain the <a>
        h3 = card.select_one("h3 a")
        if not h3:
            continue

        href = h3.get("href", "")
        url  = "https://github.com" + href if href.startswith("/") else href
        name = h3.get_text(" ", strip=True)

        # Description
        # Look for the description text, usually in a muted paragraph
        desc_p = card.select_one("p.color-fg-muted")
        desc = desc_p.get_text(" ", strip=True) if desc_p else None

        if name and url:
            rows.append({"product_name": name, "description": desc, "url": url, "page": page_num})

    return rows

items, page = [], 1
start = datetime.now(timezone.utc)

print(f"Starting scrape for up to {MAX_ITEMS} items...")
try:
    while len(items) < MAX_ITEMS and page <= MAX_PAGES:
        # Check time limit within the loop
        if datetime.now(timezone.utc) - start > timedelta(minutes=TIME_LIMIT_MIN):
            print(f"⏰ Time limit hit ({TIME_LIMIT_MIN} min). Stopping.")
            break

        # GitHub Marketplace URL for Actions
        url = f"https://github.com/marketplace?type=actions&page={page}"
        driver.get(url)

        rows = parse_page(driver, page)

        if not rows:
            print(f"Page {page}: no cards found (End of results or Selector Error). Check the selector in 'parse_page'.")
            break

        items.extend(rows)
        print(f"Page {page}: +{len(rows)} (total {len(items)})")

        # Delay to be polite
        time.sleep(random.uniform(*SLEEP_RANGE))

        page += 1
finally:
    driver.quit()

df_raw = pd.DataFrame(items[:MAX_ITEMS])
df_raw.to_csv(OUT_CSV, index=False)
print(f"\n✅ SUCCESS: Saved {OUT_CSV} with {len(df_raw)} rows")
print("--- Sample Data ---")
print(df_raw.head())

# --- Q4 Part 2 Prep ---
# Re-run the data cleaning and quality cell after this is successful.

Starting scrape for up to 1000 items...
WebDriverWait failed: Message: 

Page 1: no cards found (End of results or Selector Error). Check the selector in 'parse_page'.

✅ SUCCESS: Saved q4_github_actions_raw.csv with 0 rows
--- Sample Data ---
Empty DataFrame
Columns: []
Index: []


In [5]:
# --- Scrape GitHub Marketplace: Actions (Attempt 4: Broad Selector) ---
import time, random, pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta, timezone

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

MAX_ITEMS      = 1000    # stop when we reach this
MAX_PAGES      = 200     # safety cap
SLEEP_RANGE    = (1.0, 2.0)
TIME_LIMIT_MIN = 12
OUT_CSV        = "q4_github_actions_raw.csv"

# Start headless Chrome
opts = Options()
opts.add_argument("--headless=new")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--window-size=1920,1080")
try:
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opts)
except Exception as e:
    print(f"Error initializing WebDriver: {e}. Please ensure Chrome/Chromium is properly installed.")
    raise

def parse_page(driver, page_num):
    # **NEW Selector:** Targeting the main list container/list items (li.Box-row is the most stable element)
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.Box-row"))
        )
    except Exception as e:
        print(f"WebDriverWait failed: {e}")
        return []

    soup = BeautifulSoup(driver.page_source, "lxml")
    rows = []

    # Target elements that represent a single row in the search results
    for li in soup.select("li.Box-row"):

        # Action Name and URL: Find the <a> tag inside the <h3>
        h3_link = li.select_one("h3 a[href*='/marketplace/actions/']")
        if not h3_link:
            continue

        href = h3_link.get("href", "")
        url  = "https://github.com" + href if href.startswith("/") else href
        name = h3_link.get_text(" ", strip=True)

        # Description: Find the paragraph with muted color (usually description)
        desc_p = li.select_one("p.color-fg-muted")
        desc = desc_p.get_text(" ", strip=True) if desc_p else None

        if name and url:
            rows.append({"product_name": name, "description": desc, "url": url, "page": page_num})

    return rows

items, page = [], 1
start = datetime.now(timezone.utc)

print(f"Starting scrape for up to {MAX_ITEMS} items...")
try:
    while len(items) < MAX_ITEMS and page <= MAX_PAGES:
        if datetime.now(timezone.utc) - start > timedelta(minutes=TIME_LIMIT_MIN):
            print(f"⏰ Time limit hit ({TIME_LIMIT_MIN} min). Stopping.")
            break

        url = f"https://github.com/marketplace?type=actions&page={page}"
        driver.get(url)

        rows = parse_page(driver, page)

        if not rows:
            print(f"Page {page}: No items found. Check 'li.Box-row' selector.")
            # Important: Try the next page once just in case the current page is an outlier/empty
            if page == 1 and MAX_PAGES > 1:
                print(f"Attempting Page {page+1} to confirm end of data...")
                page += 1
                driver.get(f"https://github.com/marketplace?type=actions&page={page}")
                rows = parse_page(driver, page)
                if not rows:
                    print(f"Page {page} also failed. Assuming end of results/blocking.")
                    break
            else:
                break

        items.extend(rows)
        print(f"Page {page}: +{len(rows)} (total {len(items)})")

        # Delay to be polite
        time.sleep(random.uniform(*SLEEP_RANGE))

        page += 1
finally:
    driver.quit()

df_raw = pd.DataFrame(items[:MAX_ITEMS])
df_raw.to_csv(OUT_CSV, index=False)
print(f"\n✅ Q4 SCRAPER RUN COMPLETE: Saved {OUT_CSV} with {len(df_raw)} rows")
print("--- Sample Data ---")
print(df_raw.head())

Starting scrape for up to 1000 items...
WebDriverWait failed: Message: 

Page 1: No items found. Check 'li.Box-row' selector.
Attempting Page 2 to confirm end of data...
WebDriverWait failed: Message: 

Page 2 also failed. Assuming end of results/blocking.

✅ Q4 SCRAPER RUN COMPLETE: Saved q4_github_actions_raw.csv with 0 rows
--- Sample Data ---
Empty DataFrame
Columns: []
Index: []


#Question 5 (20 points)

PART 1:
Web Scrape  tweets from Twitter using the Tweepy API, specifically targeting hashtags related to subtopics (machine learning or artificial intelligence.)
The extracted data includes the tweet ID, username, and text.

Part 2:
Perform data cleaning procedures

A final data quality check ensures the completeness and consistency of the dataset. The cleaned data is then saved into a CSV file for further analysis.


**Note**

1.   Follow tutorials provided in canvas to obtain api keys. Use ChatGPT to get the code. Make sure the file is downloaded and saved.
2.   Make sure you divide GPT code as shown in tutorials, dont make multiple requestes.


In [None]:
# Q5 – PART 1: Collect tweets for ML/AI hashtags (id, username, text)
!pip -q install tweepy pandas python-dotenv
!pip -q install nltk unidecode  # Ensure nltk is installed for part 2

import os, pandas as pd
import tweepy
from datetime import datetime, timedelta, timezone

# ---- AUTHENTICATION (REPLACE WITH YOUR ACTUAL TOKEN) ----
BEARER  = os.getenv("TW_BEARER", "AAAAAAAAAAAAAAAAAAAAABeO4QEAAAAAQ0i9OYMh%2BBIQwiEktxgGL0un114%3Dx0ptogvugj3bOCVlLPOzbAUAx0MAYQztEnmKIDpcZX74fqE1yu")

if "AAAAAAAAAAAAAAAAAAAAA" in BEARER:
    print("WARNING: Using a placeholder BEARER token. Replace with your actual token.")

client = tweepy.Client(bearer_token=BEARER, wait_on_rate_limit=True)

# ---- Query config ----
HASHTAGS = ["#MachineLearning", "#ArtificialIntelligence", "#DataScience"]
QUERY = " OR ".join(HASHTAGS) + " -is:retweet -is:reply -is:quote lang:en"

# **FINAL FIXED TIME WINDOW:** Fetch last 48 hours to reliably avoid start_time constraints.
# end_time must be at least 10 seconds in the past.
end_time   = datetime.now(timezone.utc) - timedelta(seconds=30)
start_time = end_time - timedelta(days=2) # Fetch only last 2 days

TOTAL_TARGET   = 600
PER_REQUEST    = 100
OUT_RAW_CSV    = "q5_tweets_raw.csv"

# Fields/expansions we want
tweet_fields = ["id","text","created_at","lang"]
user_fields  = ["username","name","id"]

# ---- Collect ----
rows = []
paginator = tweepy.Paginator(
    client.search_recent_tweets,
    query=QUERY,
    tweet_fields=tweet_fields,
    expansions=["author_id"],
    user_fields=user_fields,
    max_results=PER_REQUEST,
    start_time=start_time,
    end_time=end_time,
    limit=TOTAL_TARGET // PER_REQUEST + 1
)

print(f"Searching for up to {TOTAL_TARGET} tweets with query: {QUERY}")
print(f"Time window: {start_time.isoformat(timespec='seconds')}Z to {end_time.isoformat(timespec='seconds')}Z")
tweets_collected = 0
try:
    for page in paginator:
        if page.data is None:
            break

        users = {u.id: u for u in page.includes.get("users", [])}
        for tw in page.data:
            u = users.get(tw.author_id)
            rows.append({
                "tweet_id": tw.id,
                "username": (u.username if u else None),
                "text": tw.text,
                "created_at": tw.created_at
            })
            tweets_collected += 1
            if tweets_collected >= TOTAL_TARGET:
                break

        if tweets_collected >= TOTAL_TARGET:
            break

        print(f"Collected {tweets_collected}/{TOTAL_TARGET}...")

except tweepy.errors.TweepyException as e:
    print(f"\nAPI Error during collection: {e}")
    print("Collection stopped. The Bearer token may be invalid, or API rate limits were hit.")

df_raw = pd.DataFrame(rows).drop_duplicates(subset=["tweet_id"])
df_raw.to_csv(OUT_RAW_CSV, index=False)
print(f"\n✅ Q5 PART 1 COMPLETE: Saved {OUT_RAW_CSV} with {len(df_raw)} unique rows")
print(df_raw.head())



Searching for up to 600 tweets with query: #MachineLearning OR #ArtificialIntelligence OR #DataScience -is:retweet -is:reply -is:quote lang:en
Time window: 2025-09-25T20:21:58+00:00Z to 2025-09-27T20:21:58+00:00Z




# Mandatory Question

Provide your thoughts on the assignment. What did you find challenging, and what aspects did you enjoy? Your opinion on the provided time to complete the assignment.

# Write your response below
Fill out survey and provide your valuable feedback.

https://docs.google.com/forms/d/e/1FAIpQLSd_ObuA3iNoL7Az_C-2NOfHodfKCfDzHZtGRfIker6WyZqTtA/viewform?usp=dialog