In [None]:
! mkdir -p .data/2023-mining-qa-examples.ipynb && \
    cd .data/2023-mining-qa-examples.ipynb && \
    gdown --id "1jO5wp8OJ-IiP_OrfjHwGd3jVHHH_zh9U"

In [None]:
# Decompress val.jsonl.zst into val.jsonl inside the data directory.
# Any existing val.jsonl is deleted first, presumably so that unzstd does not
# balk at overwriting its output when this cell is re-run.
! cd .data/2023-mining-qa-examples.ipynb && \
    rm -rf val.jsonl && \
    unzstd val.jsonl.zst

In [None]:
# prompt: ingest val.jsonl line by line

import json

# Group every document's text by the Pile subset it belongs to.
#   piles:   {pile_set_name: [text, text, ...]}
#   c_lines: number of JSON records ingested
piles = {}
c_lines = 0

# JSON Lines files are UTF-8; state it explicitly instead of relying on the
# platform's default text encoding, which is not UTF-8 everywhere.
with open('.data/2023-mining-qa-examples.ipynb/val.jsonl', 'r', encoding='utf-8') as f:
  for line in f:
    # Tolerate blank lines (e.g. a trailing newline) rather than crashing in
    # json.loads on an empty document.
    if not line.strip():
      continue
    data = json.loads(line)
    # setdefault creates the per-subset list the first time a name is seen.
    piles.setdefault(data["meta"]["pile_set_name"], []).append(data["text"])
    c_lines += 1

print(piles.keys())


In [None]:
from transformers import pipeline
import torch

# Zero-shot classifier backed by the BART-large MNLI checkpoint. The task is
# named explicitly rather than being inferred from the model's hub metadata.
# Run on the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) so the notebook still runs, just slowly, without a GPU.
device = 0 if torch.cuda.is_available() else -1
pipe = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli", device=device)

# Smoke test: classify one sentence against a handful of candidate labels.
pipe("I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
)

In [None]:
# Print the first StackExchange document to see what the raw text looks like.
print(piles["StackExchange"][0])

In [None]:
# Classify the first 100 characters of the first OpenWebText2 document against
# a wider set of candidate labels.
pipe(
    piles["OpenWebText2"][0][:100],
    candidate_labels=[
        "urgent",
        "not urgent",
        "phone",
        "tablet",
        "computer",
        "question & answer",
        "news",
    ],
)

In [None]:
# Total number of lines counted while ingesting val.jsonl.
print(c_lines)

In [None]:
# prompt: graph the lengths of the strings for the `OpenWebText2` array in `piles`

import matplotlib.pyplot as plt

# Character count of every OpenWebText2 document.
lengths = list(map(len, piles["OpenWebText2"]))

# 100-bin histogram of those lengths, drawn through the explicit Axes API.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(lengths, bins=100)
ax.set_xlabel("String Length")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of String Lengths in OpenWebText2")
plt.show()


In [None]:
# Character count of every StackExchange document.
lengths = [len(s) for s in piles["StackExchange"]]

# 100-bin histogram of those lengths. The title names the subset actually
# plotted (it previously still read "OpenWebText2" from the copied cell).
plt.figure(figsize=(10, 6))
plt.hist(lengths, bins=100)
plt.xlabel("String Length")
plt.ylabel("Frequency")
plt.title("Distribution of String Lengths in StackExchange")
plt.show()

In [None]:
# prompt: list the sizes of each array in `piles`

# One line per subset: its name followed by how many documents it holds.
for pile, texts in piles.items():
  print(pile, len(texts))


In [None]:
# prompt: print the first 100 characters of each string in piles["PhilPapers"]
# NOTE: the prompt above is stale. The code below reads the "PubMed Abstracts"
# subset, and only its first 10 documents, not every string in "PhilPapers".

# keys: DM Mathematics, Pile-CC, Enron Emails, StackExchange, OpenWebText2, Github?

# Print a 100-character preview of each document, followed by a blank line.
for s in piles["PubMed Abstracts"][:10]:
  print(s[:100], '\n')


In [None]:
%%time
for text in piles["Pile-CC"][:10]:
  score = pipe(text[:200], candidate_labels=["question & answer"], multi_label=True
               )["scores"][0]
  print(score)

In [None]:
# Print Pile-CC document 1, then the first 200 characters of documents 4 and 6.
# NOTE(review): document 1 is printed unsliced while the other two are cut to
# 200 characters — confirm that is intended.
print(piles["Pile-CC"][1], '\n', piles["Pile-CC"][4][:200], '\n', piles["Pile-CC"][6][:200])

In [None]:
# Pass the first 10 NIH ExPorter documents to the pipeline as one list and
# score each against the single label "question & answer".
pipe(
    piles["NIH ExPorter"][:10],
    candidate_labels=["question & answer"],
    multi_label=True,
)

In [None]:
%%time
# prompt: batch process all of the text in `piles` through `pipe`

scores = []
for pile in ["DM Mathematics", "Enron Emails", "StackExchange", "Github", "OpenWebText2", "Pile-CC"]:
  print("doing ", pile)
  cp = piles[pile]
  for i in range(len(cp)):
    cp[i] = cp[i][:100]
  ress = pipe(cp, candidate_labels=["question & answer"], multi_label=True)
  for res in ress:
    scores.append((res["scores"][0], res["sequence"], pile))

# Sort the scores in descending order
scores.sort(key=lambda x: x[0], reverse=True)

# Print sample of the top 10 highest-scoring texts
for score, text, pile in scores[:10]:
  print(f"Score: {score:.2f}\nPile: {pile}\nText:{text[:1000]}\n==========================================")


In [None]:
# Number of (score, text, pile) entries collected in `scores`.
len(scores)

In [None]:
# Show the 10 highest-scoring texts that do not come from DM Mathematics.
scores.sort(key=lambda x: x[0], reverse=True)

# Filter before slicing. Slicing the top 10 first and skipping DM Mathematics
# afterwards prints fewer than 10 rows (possibly none) whenever that subset
# occupies the top of the ranking.
non_math = [entry for entry in scores if entry[2] != "DM Mathematics"]

for score, text, pile in non_math[:10]:
  print(f"Score: {score:.2f}\nPile: {pile}\nText:{text[:1000]}\n==========================================")