In [112]:
#!pip install sentence-transformers
# !apt install libomp-dev
# !pip install faiss
!pip install torch



In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np

In [3]:
path = "/content/drive/MyDrive/Colab Notebooks/ds_projects/articles.csv"

In [4]:
articles = pd.read_csv(path)

In [5]:
articles.head(2)

Unnamed: 0,author,claps,reading_time,link,title,text
0,Justin Lee,8.3K,11,https://medium.com/swlh/chatbots-were-the-next...,Chatbots were the next big thing: what happene...,"Oh, how the headlines blared:\nChatbots were T..."
1,Conor Dewey,1.4K,7,https://towardsdatascience.com/python-for-data...,Python for Data Science: 8 Concepts You May Ha...,If you’ve ever found yourself looking up the s...


In [6]:
articles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337 entries, 0 to 336
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author        337 non-null    object
 1   claps         337 non-null    object
 2   reading_time  337 non-null    int64 
 3   link          337 non-null    object
 4   title         337 non-null    object
 5   text          337 non-null    object
dtypes: int64(1), object(5)
memory usage: 15.9+ KB


# Embedding Model Initialization

In [123]:
embedding_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [36]:
sentence = ['This is related to sports', 'that is nice kick with the ball', 'i like the movie john wick super action',
            'i love to take wicket with swing ball']
test_sentence = ["I want to watch movie"]
embedding = model.encode(sentence)
query_embedding = model.encode(test_sentence)

In [37]:
embedding.shape

(4, 384)

In [38]:
query_embedding.shape

(1, 384)

In [44]:
# similarity_scores = np.dot(query_embedding[0], embedding.T)
# indices = similarity_scores.argsort()[::-1]

# Index Documents

In [56]:
sentences = []

for index, row in articles.iterrows():
  sentences.append(row["text"])

indexed_embeddings = model.encode(sentences)

# Document Retriever

In [124]:
def get_top_k_text(df, top_k, query):
  query_embedding = embedding_model.encode([query])
  similarity_scores = np.dot(query_embedding[0], indexed_embeddings.T)
  indices = similarity_scores.argsort()[::-1]
  top_indices = indices[0:top_k]
  return [articles.iloc[index]["text"] for index in top_indices], [articles.iloc[index]["title"] for index in top_indices]

In [125]:
articles.head(10)["title"]

0    Chatbots were the next big thing: what happene...
1    Python for Data Science: 8 Concepts You May Ha...
2    Automated Feature Engineering in Python – Towa...
3    Machine Learning: how to go from Zero to Hero ...
4    Reinforcement Learning from scratch – Insight ...
5    Intuitively Understanding Convolutions for Dee...
6    An intro to Machine Learning for designers – U...
7    The Big List of DS/ML Interview Resources – To...
8    Must know Information Theory concepts in Deep ...
9    What I learned from interviewing at multiple A...
Name: title, dtype: object

# Example Usage of Retriever

In [62]:
query = "what are the next big things in the world now"
response_text, response_title = get_top_k_text(articles, 5, query)

In [63]:
response_title

['建议的程序员学习LDA算法的步骤 – 蒸汽与魔法',
 'Eleven Reasons To Be Excited About The Future of Technology',
 'The future of work – Oxford University – Medium',
 'Using Artificial Intelligence to Balance Out Customer Value',
 'Who Is Going To Make Money In AI? Part I – Towards Data Science']

In [64]:
query = "what are chatbots"
response_text, response_title = get_top_k_text(articles, 5, query)

In [65]:
response_title

['The Complete Beginner’s Guide To Chatbots – Chatbots Magazine',
 'Chatbots were the next big thing: what happened? – The Startup – Medium',
 'Chatbots were the next big thing: what happened? – The Startup – Medium',
 'Chatbots were the next big thing: what happened? – The Startup – Medium',
 'What Are The Best Intelligent Chatbots or AI Chatbots Available Online?']

In [88]:
response_text

['What are chatbots? Why are they such a big opportunity? How do they work? How can I build one? How can I meet other people interested in chatbots?\nThese are the questions we’re going to answer for you right now.\nReady? Let’s do this.\n(Do you work in ecommerce? Stop reading and click here, we made something for you.)\n(p.s. here is where I believe the future of bots is headed, you will probably disagree with me at first.)\n(p.p.s. My newest guide about conversational commerce is up, I think you’ll find it super interesting.)\nA chatbot is a service, powered by rules and sometimes artificial intelligence, that you interact with via a chat interface. The service could be any number of things, ranging from functional to fun, and it could live in any major chat product (Facebook Messenger, Slack, Telegram, Text Messages, etc.).\nIf you haven’t wrapped your head around it yet, don’t worry. Here’s an example to help you visualize a chatbot.\nIf you wanted to buy shoes from Nordstrom onli

# Question Answer LLM

In [70]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import FlaxBartForQuestionAnswering

In [68]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

In [69]:
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base", forced_bos_token_id=0)

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [72]:
qa_model = FlaxBartForQuestionAnswering.from_pretrained("facebook/bart-base")

Downloading flax_model.msgpack:   0%|          | 0.00/558M [00:00<?, ?B/s]

Some weights of FlaxBartForQuestionAnswering were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: {('qa_outputs', 'bias'), ('qa_outputs', 'kernel')}
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [89]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"

text = "Essentially, we’re swapping something simple for a more-complex alternative.\nSure, there are some concepts that we can only express using language (“show me all the ways of getting to a museum that give me 2000 steps but don’t take longer than 35 minutes”), but most tasks can be carried out more efficiently and intuitively with GUIs than with a conversational UI.\nAiming for a human dimension in business interactions makes sense.\nIf there’s one thing that’s broken about sales and marketing, it’s the lack of humanity: brands hide behind ticket numbers, feedback forms, do-not-reply-emails, automated responses and gated ‘contact us’ forms.\nFacebook’s goal is that their bots should pass the so-called Turing Test, meaning you can’t tell whether you are talking to a bot or a human. But a bot isn’t the same as a human. It never will be.\nA conversation encompasses so much more than just text.\nHumans can read between the lines, leverage contextual information and understand double layers like sarcasm. Bots quickly forget what they’re talking about, meaning it’s a bit like conversing with someone who has little or no short-term memory.\nAs HubSpot team pinpointed:\nPeople aren’t easily fooled, and pretending a bot is a human is guaranteed to diminish returns (not to mention the fact that you’re lying to your users).\nAnd even those rare bots that are powered by state-of-the-art NLP, and excel at processing and producing content, will fall short in comparison.\nAnd here’s the other thing. Conversational UIs are built to replicate the way humans prefer to communicate — with other humans.\nBut is that how humans prefer to interact with machines?\nNot necessarily.\nAt the end of the day, no amount of witty quips or human-like mannerisms will save a bot from conversational failure.\nIn a way, those early-adopters weren’t entirely wrong.\nPeople are yelling at Google Home to play their favorite song, ordering pizza from the Domino’s bot and getting makeup tips from Sephora. But in terms of consumer response and developer involvement, chatbots haven’t lived up to the hype generated circa 2015/16.\nNot even close.\nComputers are good at being computers. Searching for data, crunching numbers, analyzing opinions and condensing that information"
question = "what is broken about sales and marketing"

In [90]:
inputs = tokenizer(question, text, return_tensors="jax")

In [117]:
#inputs

In [102]:
outputs = qa_model(**inputs)

In [93]:
outputs["output_logits"]

FlaxSeq2SeqQuestionAnsweringModelOutput(start_logits=Array([[ 0.12429245,  0.6879718 ,  0.9607391 ,  0.38230354, -0.78611434,
        -0.00594745,  0.0814611 ,  1.53239   , -0.0848643 ,  0.11409563,
         0.10934819, -0.18166989, -0.90862495, -0.6875577 ,  0.35753164,
         0.7494087 , -0.9083259 ,  0.4646651 , -0.89260197, -0.69570965,
        -0.91500306,  0.4545213 ,  1.2399535 ,  0.2987823 ,  0.16216023,
        -0.38533884, -1.6747088 , -0.95684725, -0.6829569 , -0.6108008 ,
        -0.47563598,  0.67845863, -0.47935197, -1.0877166 , -0.5026999 ,
        -0.14632261,  1.8426483 ,  0.85395294,  1.1931043 , -0.8417158 ,
        -0.05845101, -1.3279247 , -0.45386702,  0.29421744,  0.40680638,
         0.671847  ,  0.16606909, -0.00363436,  0.6094322 , -1.1942548 ,
         0.1466212 ,  0.88439363,  0.3453309 ,  0.09063312, -1.323919  ,
         1.8873427 , -0.39013198,  1.2988083 , -0.48016122, -0.6339157 ,
        -0.13209447,  1.2536767 ,  1.237092  ,  1.115227  ,  1.3420014 

In [121]:
def using_bart_models(ouputs, tokenizer):

  start_scores = outputs.start_logits
  end_scores = outputs.end_logits

  # start_logits = outputs.start_logits.squeeze()
  # end_logits = outputs.end_logits.squeeze()

  # start_index = int(start_logits.argmax())
  # end_index = int(end_logits.argmax())

  # tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
  # answer_tokens = tokens[start_index : end_index + 1]
  # answer_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))

  start_logits = outputs.start_logits.squeeze()
  end_logits = outputs.end_logits.squeeze()

  start_index = int(start_logits.argmax())
  end_index = int(end_logits.argmax())

  tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
  answer_tokens = tokens[start_index + 1 : end_index + 1]  # Exclude the <s> token
  answer_text = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))

# Question Answer Model - DistillBert

In [114]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the pre-trained model and tokenizer
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

def answer_question(question, text):
    # Tokenize input question and text
    inputs = tokenizer(question, text, return_tensors='pt', truncation=True)

    # Perform question-answering
    start_positions, end_positions = model(**inputs).values()

    # Find the predicted answer span
    answer_start = torch.argmax(start_positions, dim=1).item()
    answer_end = torch.argmax(end_positions, dim=1).item()

    # Decode the answer tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    answer = tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end+1])

    return answer

# Example usage
question = "What is the capital of France?"
text = "France is a country located in Western Europe. Its capital is Paris."
answer = answer_question(question, text)
print(answer)

paris


 # Combine Everything together -> Embedding, Retriever and QA

In [153]:
query = "what are chatbots"
response_text, response_title = get_top_k_text(articles, 5, query)
top_k_document_text = " ".join(response_text)
answer = answer_question(query, top_k_document_text)
print(answer)

a service, powered by rules and sometimes artificial intelligence, that you interact with via a chat interface. the service could be any number of things, ranging from functional to fun, and it could live in any major chat product ( facebook messenger, slack, telegram, text messages, etc. ). if you haven ’ t wrapped your head around it yet, don ’ t worry. here ’ s an example to help you visualize a chatbot. if you wanted to buy shoes from nordstrom online, you would go to their website, look around until you find the shoes you wanted, and then you would purchase them. if nordstrom makes a bot, which i ’ m sure they will, you would simply be able to message nordstrom on facebook. it would ask you what you ’ re looking for and you would simply... tell it. instead of browsing a website, you will have a conversation with the nordstrom bot, mirroring the type of experience you would get when you go into the retail store. watch this video from facebook ’ s recent f8 conference ( where they m

In [154]:
top_k_document_text

"What are chatbots? Why are they such a big opportunity? How do they work? How can I build one? How can I meet other people interested in chatbots?\nThese are the questions we’re going to answer for you right now.\nReady? Let’s do this.\n(Do you work in ecommerce? Stop reading and click here, we made something for you.)\n(p.s. here is where I believe the future of bots is headed, you will probably disagree with me at first.)\n(p.p.s. My newest guide about conversational commerce is up, I think you’ll find it super interesting.)\nA chatbot is a service, powered by rules and sometimes artificial intelligence, that you interact with via a chat interface. The service could be any number of things, ranging from functional to fun, and it could live in any major chat product (Facebook Messenger, Slack, Telegram, Text Messages, etc.).\nIf you haven’t wrapped your head around it yet, don’t worry. Here’s an example to help you visualize a chatbot.\nIf you wanted to buy shoes from Nordstrom onlin

In [143]:
articles.tail(20)["title"]

317    Traffic Sign Recognition with TensorFlow – Wal...
318    Cheat Sheets for AI, Neural Networks, Machine ...
319    Understanding Activation Functions in Neural N...
320    Romance Novels, Generated by Artificial Intell...
321    37 Reasons why your Neural Network is not work...
322               Picking a GPU for Deep Learning – Slav
323    Text Classification using Neural Networks – Ma...
324    You requested someone with a degree in this? *...
325    A Guide For Time Series Prediction Using Recur...
326    Neural Network Architectures – Towards Data Sc...
327    In defense of skepticism about deep learning –...
328    How to easily Detect Objects with Deep Learnin...
329    A “weird” introduction to Deep Learning – Towa...
330      The New Neural Internet is Coming – Hacker Noon
331    Stochastic Weight Averaging — a New Way to Get...
332    You can build a neural network in JavaScript e...
333    Artificial Intelligence, AI in 2018 and beyond...
334    Spiking Neural Networks,

In [135]:
query = "How to easily Detect Objects with Deep Learning"
response_text, response_title = get_top_k_text(articles, 5, query)
top_k_document_text = " ".join(response_text)
answer = answer_question(query, top_k_document_text)
print(answer)

[CLS]


In [155]:
query = "What is AI ?"
response_text, response_title = get_top_k_text(articles, 5, query)
top_k_document_text = " ".join(response_text)
answer = answer_question(query, top_k_document_text)
print(answer)

artificial intelligence


In [142]:
top_k_document_text

"Artificial Intelligence (AI) is the mantra of the current era. The phrase is intoned by technologists, academicians, journalists and venture capitalists alike. As with many phrases that cross over from technical academic fields into general circulation, there is significant misunderstanding accompanying the use of the phrase. But this is not the classical case of the public not understanding the scientists — here the scientists are often as befuddled as the public. The idea that our era is somehow seeing the emergence of an intelligence in silicon that rivals our own entertains all of us — enthralling us and frightening us in equal measure. And, unfortunately, it distracts us.\nThere is a different narrative that one can tell about the current era. Consider the following story, which involves humans, computers, data and life-or-death decisions, but where the focus is something other than intelligence-in-silicon fantasies. When my spouse was pregnant 14 years ago, we had an ultrasound.

In [157]:
query = "What is Machine learning?"
response_text, response_title = get_top_k_text(articles, 5, query)
top_k_document_text = " ".join(response_text)
answer = answer_question(query, top_k_document_text)
print(answer)

[CLS]
