In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/my-data/sus.json


Instead of using the .json file as used in this notebook, custom .json file can also be added by adding the file to my-data in this notebook or by changing path as required.

In [2]:
#Loading the contents of the file
import json

# Read the JSON file
with open('/kaggle/input/my-data/sus.json', 'r') as json_file:
    project_data = json.load(json_file)

text = []
for i in project_data.keys():
    text.append(project_data[i])


In [3]:
# Accumulating all sentences from all paragraphs into a list for further processing 

import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')


sentences = []
for t in text:
    sentences.append(sent_tokenize(str(t)))


sent = []
for i in range(len(sentences)):
    for j in range(len(sentences[i])):
        sent.append(sentences[i][j])




[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Installing dependencies
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ done
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=ddc3001cd1afe0a53ebb001abbe7e15c61b4b89d0117a966afa93a03e8219687
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.2.2


In [5]:
# In this notebook, I am using sentence bert model for generating embeddings 
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [6]:
# Generating embeddings using sentence-bert model
embeddings = sbert_model.encode(sent)


Batches:   0%|          | 0/2917 [00:00<?, ?it/s]

In [7]:
print(len(sent[0]))
print(len(embeddings[5]))

148
384


In [8]:
# !python -m pip -qq install --no-index --find-links /kaggle/input/faiss-163/ faiss-cpu==1.6.3 
# Instaling FAISS for retrieval of most relevant sentences according to the query
!pip install faiss-gpu
import faiss                   # make faiss available

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [9]:
# Doing further processing using FAISS
index = faiss.IndexFlatL2(384)   # build the index, d=size of vectors 
index.add(embeddings)                  # add vectors to the index
print(index.ntotal)

93327


In [10]:
# Sample query for faiss index 
query = "Beyoncé along with a variety of other celebrities"
query_embed = sbert_model.encode(query)
len(query_embed)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

384

In [11]:
# Reshaping according to further processing 

query_embed = query_embed.reshape(1,384)
query_embed.shape


(1, 384)

In [12]:
# Generating indexes related to most relevant sentence using faiss
k = 4                          # we want 4 similar vectors
D, I = index.search(query_embed, k)     # actual search
# print(I)

In [13]:
print(I.shape)
I = I .reshape(I.shape[1])
print(I.shape)

(1, 4)
(4,)


In [14]:
# Installing and loading facebook-bert for summarizing the sentences generated using faiss
!pip install transformers

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pre-trained model and tokenizer for summarization
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [15]:
# Helper function for doing summary 
def doing_summary(text):
    # Tokenize and generate summary
    input_ids = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(input_ids, max_length=400, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)

    # Decode and print the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

In [16]:
# Helper function for generating summarized response according to the provided query from the text
def summarize_answer(sentences,k,query):
    query_embed = sbert_model.encode(query)
    query_embed = query_embed.reshape(1,384)
    D, I = index.search(query_embed, k)
    I = I .reshape(I.shape[1])
    ans = ""
    for i in I:
        print(i)
        ans += sentences[i]
#         print(i)
    ans = doing_summary(ans)

    return ans

In [17]:
# type(sent[0])
q = "Tell me about Beyoncé"  ## Change this for a custom query 
num_sentences = 4  
summary = summarize_answer(sent,num_sentences,q)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

307
0
25
18


In [18]:
print(summary)

Beyoncé Giselle Knowles-Carter (born September 4, 1981) is an American singer, songwriter, record producer and actress. Her name is a tribute to her mother's maiden name. She has received numerous awards.
