In [None]:
# ✅ Step 1: Install the transformers library
!pip install transformers

# ✅ Step 2: Import required modules
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# ✅ Step 3: Load GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()  # Set to evaluation mode

# ✅ Step 4: Provide a prompt
prompt = "In the future, AI will change the way we"

# ✅ Step 5: Tokenize the input
inputs = tokenizer.encode(prompt, return_tensors="pt")

# ✅ Step 6: Generate text
outputs = model.generate(
    inputs,
    max_length=100,
    temperature=0.8,         # creativity
    top_k=50,                # diversity
    top_p=0.95,              # nucleus sampling
    do_sample=True,          # enables random sampling
    num_return_sequences=1   # generate 1 output
)

# ✅ Step 7: Decode and print the generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Text:\n")
print(generated_text)




Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text:

In the future, AI will change the way we think and act. It will make people smarter. And it will help us achieve what we're trying to do today.

This post is part of the series 'The Future of Computing, Part 3', which I'm presenting this week at the Computers Summit in San Francisco. I've already talked about the importance of research in building smart, autonomous systems, but I wanted to address one more of the bigger challenges facing AI today.




In [2]:
# 🚀 PDF QA (Improved - LangChain + FAISS + HuggingFace, No API Key)

!pip install -q PyPDF2 langchain faiss-cpu spacy transformers langchain-community
!python -m spacy download en_core_web_sm

from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
from langchain_community.vectorstores import FAISS
from transformers import pipeline
from google.colab import files
import os, tempfile

# 📤 Upload PDF
uploaded = files.upload()
pdf_path = next(iter(uploaded))
file_path = os.path.join(tempfile.gettempdir(), pdf_path)
with open(file_path, 'wb') as f: f.write(uploaded[pdf_path])

# 📄 Extract & chunk text
reader = PdfReader(file_path)
text = "".join([page.extract_text() or "" for page in reader.pages])
print("📄 Extracted text preview:\n", text[:1000])  # Preview text
chunks = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200).split_text(text)

# 🧠 Embed and store
embedder = SpacyEmbeddings(model_name="en_core_web_sm")
db = FAISS.from_texts(chunks, embedder)
retriever = db.as_retriever()

# ✅ Use improved QA model
qa = pipeline("question-answering", model="deepset/roberta-base-squad2")

# 🤖 Ask questions with context preview
def ask(q):
    docs = retriever.get_relevant_documents(q)

    print("\n📚 Top Retrieved Chunks:")
    for i, doc in enumerate(docs[:2]):
        print(f"\n--- Chunk {i+1} ---\n{doc.page_content[:500]}\n")

    context = " ".join([doc.page_content for doc in docs[:2]])[:1000]
    result = qa(question=q, context=context)
    print(f"\n❓ Question: {q}\n💬 Answer: {result['answer']}")

# 🔁 Interactive Q&A loop
while True:
    q = input("Ask a question (or type 'exit'): ")
    if q.lower() == 'exit':
        print("👋 Exiting. Thanks!")
        break
    ask(q)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.2/45.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('e

Saving attention.pdf to attention.pdf
📄 Extracted text preview:
 Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and figures in this paper solely for use in journalistic or
scholarly works.
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.comNoam Shazeer∗
Google Brain
noam@google.comNiki Parmar∗
Google Research
nikip@google.comJakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.comAidan N. Gomez∗ †
University of Toronto
aidan@cs.toronto.eduŁukasz Kaiser∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗ ‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanism

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Device set to use cpu
  docs = retriever.get_relevant_documents(q)



📚 Top Retrieved Chunks:

--- Chunk 1 ---
the input sequence centered around the respective output position. This would increase the maximum
path length to O(n/r). We plan to investigate this approach further in future work.
A single convolutional layer with kernel width k < n does not connect all pairs of input and output
positions. Doing so requires a stack of O(n/k)convolutional layers in the case of contiguous kernels,
orO(logk(n))in the case of dilated convolutions [ 18], increasing the length of the longest paths
between any two p


--- Chunk 2 ---
order of the sequence, we must inject some information about the relative or absolute position of the
tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the
bottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel
as the embeddings, so that the two can be summed. There are many choices of positional encodings,
learned and fixed [9].
In this work, we us

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipython-input-2-2015379259.py", line 48, in <cell line: 0>
    q = input("Ask a question (or type 'exit'): ")
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 1177, in raw_input
    return self._input_request(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 1219, in _input_request
    raise KeyboardInterrupt("Interrupted by user") from None
KeyboardInterrupt: Interrupted by user

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
          ^^^

TypeError: object of type 'NoneType' has no len()

In [4]:
import os
os.environ["GOOGLE_API_KEY"] = "gemini-api-key"

In [5]:
import os
os.environ["GOOGLE_API_KEY"] = "gemini-api-key"

In [6]:
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain import FAISS

ModuleNotFoundError: No module named 'langchain_google_genai'

In [7]:
loader = PyPDFLoader("./Apps.pdf")
pages = loader.load_and_split()

ValueError: File path ./Apps.pdf is not a valid file or url

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = FAISS.from_documents(pages, embeddings)

In [None]:
query = "What is Gista?"
docs = db.similarity_search(query)

In [None]:
content = "\n".join([x.page_content for x in docs])
qa_prompt = "Use the following pieces of context to answer the user's question. If you don't know the answer, just say that you don't know, don't try to make up an answer.----------------"
input_text = qa_prompt+"\nContext:"+content+"\nUser question:\n"+query
llm = ChatGoogleGenerativeAI(model="gemini-pro")
result = llm.invoke(input_text)
result.content

In [None]:
!pip install summarytools langchain-google-genai crewai -q

In [None]:
import pandas as pd # load example dataset
from summarytools import dfSummary
titanic = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')
df_summary_info = dfSummary(titanic)
df_summary_info

In [None]:
print("Original Text:")
print(df_summary_info.to_string())

import re

def remove_img_texts(html_text: str) -> str:
    """
    Removes any text between <img ... </img> tags (including the tags)
    from the given text.
    """
    # This pattern matches an <img> tag with any attributes, followed by any content until a </img> tag.
    pattern = re.compile(r'<img[^>]*>.*?</img>', re.DOTALL | re.IGNORECASE)
    cleaned_text = pattern.sub('', html_text)
    return cleaned_text

# Example usage:
result = remove_img_texts(df_summary_info.to_string())
print("\nCleaned Text:")
print(result)

# Optionally, save the cleaned text to a text file.
with open("df_summary_text.txt", "w") as f:
    f.write(result)

# TODO: apply custom instructions on each row of text. instruct-fine tune, error rate analysis with custom dashboards or just version control of prompts and results like latitude

In [None]:
api_key = input("Please enter your API key: ")
os.environ["GEMINI_API_KEY"] = api_key
print("API key has been set.")

In [None]:
import os
import pandas as pd
from crewai import Agent, Task, Crew, Process
from langchain_google_genai import ChatGoogleGenerativeAI
from summarytools import dfSummary
from crewai import Agent, Task, Crew, LLM

clean_summary = remove_img_texts(df_summary_info.to_string())
llm = LLM(model="gemini/gemini-2.0-flash")

# -----------------------------------------------------------------------------
# 2) Load the Titanic dataset and generate a text summary using summarytools.
#
# The dfSummary() output (a pandas Styler) is converted to plain text with .to_string().
# -----------------------------------------------------------------------------
titanic = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
df_summary_info = dfSummary(titanic).to_string()

# -----------------------------------------------------------------------------
# 3) Create AI team agents in a CrewAI style.
#
# Each agent is provided with a role, goal, and backstory. They use Gemini Pro via llm.
# -----------------------------------------------------------------------------
data_quality_agent = Agent(
    role="Data Quality Analyst",
    goal="Identify columns with data quality issues such as those with more than 20% missing values.",
    backstory="An experienced data analyst who excels at detecting data quality issues and anomalies in datasets.",
    verbose=True,
    allow_delegation=False,
    llm=llm
)

statistical_agent = Agent(
    role="Statistical Analyst",
    goal="Examine numeric columns for potential outliers where maximum values greatly exceed the mean.",
    backstory="A meticulous statistician with expertise in numeric data distributions and anomaly detection.",
    verbose=True,
    allow_delegation=False,
    llm=llm
)

insight_agent = Agent(
    role="Predictive Modeling Advisor",
    goal="Recommend which columns appear most informative for predictive modeling.",
    backstory="A seasoned machine learning consultant skilled in feature engineering and predictive analysis.",
    verbose=True,
    allow_delegation=True,
    llm=llm
)

# -----------------------------------------------------------------------------
# 4) Create tasks for each agent by embedding the dataset summary (as text).
#
# Note: We now include an 'expected_output' field as required by the Task model.
# -----------------------------------------------------------------------------
task1 = Task(
    description=f"""Data Quality Task:
Examine the dataset summary below and identify columns with significant data quality issues or concerns.
Focus primarily on those columns with more than 20% missing values, and consider columns with unusual uniqueness metrics.
Provide detailed insights including the percentage of missing data and any recommendations for cleaning.
Dataset Summary:
{clean_summary}""",
    expected_output="A detailed list of column names with significant missing data issues along with insights and recommendations for data cleaning.",
    agent=data_quality_agent
)


task2 = Task(
    description=f"""Statistical Analysis Task:
Review the dataset summary below and identify any numeric columns that exhibit potential outlier behavior.
Specifically, look for columns where the maximum value far exceeds the mean (e.g., max > 3 × mean) and any unusual spread in the data.
Include a brief explanation of why each identified column may be problematic. Provide any recommendations for further investigation.
Dataset Summary:
{clean_summary}""",
    expected_output="A detailed list of numeric columns with potential outlier issues, including a brief explanation for each.",
    agent=statistical_agent
)


task3 = Task(
    description=f"""Predictive Modeling Insight Task:
Based on the dataset summary below, recommend which columns might be most informative for building a predictive model.
Dataset Summary:
{clean_summary}""",
    expected_output="A list of column names that are likely to be most informative for predictive modeling.",
    agent=insight_agent
)

# -----------------------------------------------------------------------------
# 5) Assemble the Crew and execute the tasks sequentially.
#
# The Crew (ManagerAgent) runs the tasks one by one.
# -----------------------------------------------------------------------------
crew = Crew(
    agents=[data_quality_agent, statistical_agent, insight_agent],
    tasks=[task1, task2, task3],
    verbose=True,
    process=Process.sequential,
)

crew_output = crew.kickoff()

print("\nCrew Output:")
print(crew_output)
