In [2]:
!pip install sentencepiece
!pip install rouge
!pip install transformers
!pip install nltk
#!pip install --upgrade google-protobuf

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m113.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[

# Importing necessary packages

In [3]:
import numpy as np
import nltk
nltk.download('punkt')

import torch, time
from transformers import PegasusForConditionalGeneration
from transformers import PegasusForConditionalGeneration, AutoTokenizer
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BartTokenizer, BartForConditionalGeneration
from rouge import Rouge

from transformers import BertTokenizer, BertModel

from scipy.spatial.distance import cosine


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Defining methods for text summarization, generating word embedding and fetching nearby similar sentences for a question

In [None]:
# methods to get summary of given input text and rouge metric scores

In [4]:
# Inputs
#input_text = "The quick brown fox jumps over the lazy dog. This is a test sentence."

def get_summary(input_text, model_name):
    print("USING MODEL:", model_name)
    start_time = time.time()
    print("Creating tokenizer object of ", model_name,"and tokens...")
    if(model_name=="google/pegasus-xsum"):
        #print("using peg")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name)
    elif(model_name=="t5-small"):
        #print("using T5")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = T5ForConditionalGeneration.from_pretrained(model_name)
    elif(model_name=="facebook/bart-large-cnn"):
        print("using bart")
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name)
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    t1 = time.time()-start_time
    print("   time taken:", t1)

    print("Generating summary ids/tokens...")
    summary_ids = model.generate(input_ids, early_stopping=True)
    t2 = (time.time()-start_time)-t1
    print("   time taken:", t2)

    print("Generating summary in text...")
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    t3 = (time.time()-start_time)-t1-t2
    print("   time taken:", t3)

    end_time = time.time()
    print("Total time taken:",end_time-start_time)
    return summary



def get_rouge_scores(summary, reference):
    rouge = Rouge()
    scores = rouge.get_scores(summary, reference)[0]
    rouge_1 = scores['rouge-1']['f']
    rouge_2 = scores['rouge-2']['f']
    rouge_l = scores['rouge-l']['f']
    return rouge_1, rouge_2, rouge_l



In [None]:
# methods to get sentences (using nltk) given paragraphs and to get sentence embeddings using BERT

In [5]:
def get_sentences(paragraph):
    # Tokenize the paragraph into sentences using NLTK
    sentences = nltk.sent_tokenize(paragraph)
    return sentences

In [6]:
#model_name = 'bert-base-uncased'

def get_sentence_embeddings(paragraph):
    # Tokenize the paragraph into sentences using NLTK
    sentences = nltk.sent_tokenize(paragraph)

    # Load the BERT model and tokenizer
    model = BertModel.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Create the sentence embeddings using BERT
    sentence_embeddings = []
    for sentence in sentences:
        # Tokenize the sentence using BERT tokenizer
        tokenized_sentence = tokenizer.encode(sentence, add_special_tokens=True)

        # Convert the tokens to PyTorch tensors
        input_ids = torch.tensor([tokenized_sentence])

        # Get the BERT model outputs
        with torch.no_grad():
            last_hidden_states = model(input_ids)[0]

        # Get the sentence embedding by taking the average of the last layer of BERT output
        sentence_embedding = np.mean(last_hidden_states.numpy()[0], axis=0)

        # Add the sentence embedding to the list
        sentence_embeddings.append(sentence_embedding.tolist())

    # Return the list of sentence embeddings
    return sentence_embeddings



In [None]:
#Finding similar sentences for given question using cosine similarity score

In [7]:
def find_similar_sentences(question_embedding, sentence_embeddings):
    # Calculate similarity scores between question and sentence embeddings
    similarity_scores = []
    for emb in sentence_embeddings:
        score = 1 - cosine(question_embedding, emb)
        similarity_scores.append(score)

    # Combine the sentences and scores and sort them in descending order
    sentences_indexes = np.arange(0,len(sentence_embeddings))
    sentences_and_scores = list(zip(sentences_indexes, similarity_scores))
    sentences_and_scores.sort(key=lambda x: x[1], reverse=True)

    # Return the sentences and scores
    return sentences_and_scores


# TEXT SUMMARIZATION

In [8]:
input_text = "Commodities, stocks, oil, and recession are all important factors in the world of finance. Commodities, such as gold and oil, are traded on global markets and their prices can be affected by supply and demand, geopolitical events, and economic indicators. Stocks, which represent ownership in a company, are also traded on stock markets and their prices can be influenced by factors such as earnings reports, investor sentiment, and global economic conditions. Oil, which is a key commodity, has been particularly volatile in recent years due to fluctuations in supply and demand as well as geopolitical tensions. Finally, recession, which is a period of economic decline, can have a significant impact on all of these factors as well as on the wider economy. Understanding the interplay between commodities, stocks, oil, and recession is crucial for investors and financial professionals alike."
input_text = "what are Commodities Commodities her energy metals and agricultural products the drive Global production and consumption and the idea of a commodity fungible or standardized that Beyond specific grades of the commodity there's a global market of standardized products and so a specific grade of oil is the same around the world you don't care which Barrel you get her which bar of gold you give because they're they're generally standardized what's interesting about Commodities is that as we go into the energy Market the metals market and the agriculture Market there's very different drivers of the economics energy in metals have very long for Dakshin cycles and environmental sensitivities which means they can have very strong boom-and-bust periods and environmental sensitivities as well so why would we want to put Commodities in to our portfolio making an allocation of these Commodities in our asset allocation mix the first is the Commodities are highly sensitive to the business cycle and they tend to outperform during times of rising rates Rising inflation and heavy economic activity for the stronger the economy as a general rule the stronger the return to commodities second Commodities can be a great hedge relative to inflation because food and energy are an important part of the inflation calculation as measured by a consumer price index such as a c p i in the United States finally we see that Commodities might outperform stocks during times of economic growth and Rising inflation"
# "t5-small", "google/pegasus-xsum"
model_name = "facebook/bart-large-cnn"
summary = get_summary(input_text, model_name)
print("Input text:", input_text)
print("Summary:", summary)

USING MODEL: facebook/bart-large-cnn
Creating tokenizer object of  facebook/bart-large-cnn and tokens...
using bart


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

   time taken: 19.836285829544067
Generating summary ids/tokens...




   time taken: 13.13162088394165
Generating summary in text...
   time taken: 2.3073437213897705
Total time taken: 35.27536702156067
Input text: what are Commodities Commodities her energy metals and agricultural products the drive Global production and consumption and the idea of a commodity fungible or standardized that Beyond specific grades of the commodity there's a global market of standardized products and so a specific grade of oil is the same around the world you don't care which Barrel you get her which bar of gold you give because they're they're generally standardized what's interesting about Commodities is that as we go into the energy Market the metals market and the agriculture Market there's very different drivers of the economics energy in metals have very long for Dakshin cycles and environmental sensitivities which means they can have very strong boom-and-bust periods and environmental sensitivities as well so why would we want to put Commodities in to our portfolio 

In [9]:
#summary = "This is a picture of a fox and a dog."
reference = "Commodities, stocks, oil, and recession are important factors in wrold of finance.They are traded on global markets and are subject to various influences such as supply and demand, geopolitical events, and economic indicators. In recent times oil has been volatile due to supply and demand changes and geopolitical tensions. Recession has an impact on all these factors and there is interplay between commodities, stocks, and oil."
rouge_1, rouge_2, rouge_l = get_rouge_scores(summary, reference)
print(f"ROUGE-1 F1 score: {rouge_1:.3f}")
print(f"ROUGE-2 F1 score: {rouge_2:.3f}")
print(f"ROUGE-L F1 score: {rouge_l:.3f}")

ROUGE-1 F1 score: 0.211
ROUGE-2 F1 score: 0.016
ROUGE-L F1 score: 0.168


In [10]:
summary

'Commodities are highly sensitive to the business cycle and they tend to outperform during times of rising rates Rising inflation and heavy economic activity. Commodities can be a great hedge relative to inflation because food and energy are an important part of the inflation calculation as measured by a consumer price index such as a c p i.'

In [11]:
reference

'Commodities, stocks, oil, and recession are important factors in wrold of finance.They are traded on global markets and are subject to various influences such as supply and demand, geopolitical events, and economic indicators. In recent times oil has been volatile due to supply and demand changes and geopolitical tensions. Recession has an impact on all these factors and there is interplay between commodities, stocks, and oil.'

# GENERATING EMBEDDINGS FOR SENTENCES

In [12]:
# Get Sentences

sentences = get_sentences(summary)
#sentences

In [13]:
# Get sentence embeddings
sentence_embeddings = get_sentence_embeddings(summary)

# Print the embeddings
#print(embeddings)

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [16]:
sentence_embeddings

[[-0.23312994837760925,
  -0.005708656739443541,
  0.580746591091156,
  0.42341476678848267,
  0.14978978037834167,
  0.04855816066265106,
  -0.15099206566810608,
  0.6163118481636047,
  -0.3084328770637512,
  -0.3370990455150604,
  -0.008545972406864166,
  -0.08184397965669632,
  -0.15159547328948975,
  0.6133961081504822,
  0.08454589545726776,
  0.467117577791214,
  0.17903238534927368,
  -0.1234244853258133,
  -0.2789383828639984,
  0.13715589046478271,
  -0.05595879629254341,
  -0.08602480590343475,
  -0.061007335782051086,
  0.45102638006210327,
  0.5662779808044434,
  -0.3846535384654999,
  -0.35676994919776917,
  0.27158185839653015,
  -0.3949229121208191,
  -0.2276182621717453,
  0.29911527037620544,
  0.5408028364181519,
  -0.3749253749847412,
  0.10575610399246216,
  0.5037971138954163,
  -0.048684824258089066,
  -0.4251137673854828,
  -0.49345701932907104,
  -0.14727017283439636,
  0.08600658923387527,
  -0.6055636405944824,
  -0.3337405323982239,
  0.0492965430021286,
  0.

In [15]:
#print(embeddings)
print("Total sentences are:",len(sentence_embeddings), " Each Sentence Embedding Vector dim length is:",len(sentence_embeddings[0]))

Total sentences are: 2  Each Sentence Embedding Vector dim length is: 768


In [None]:
# # Sample paragraph
# paragraph = "Commodities, stocks, oil, and recession are all important factors in the world of finance. Stocks, which represent ownership in a company, are also traded on stock markets and their prices can be influenced by factors such as earnings reports, investor sentiment, and global economic conditions."

# # Get sentence embeddings
# embeddings = get_sentence_embeddings(paragraph)

# # Print the embeddings
# #print(embeddings)

In [17]:
#lines = ['Readme', 'How to write text files in Python']
with open('embeddings.txt', 'w') as f:
    for line in sentence_embeddings:
        f.write(line)
        f.write('\n')

TypeError: ignored

# FETCH SENTENCES MATCHING TO QUESTION

In [None]:
question = "Does recession have impact on commodities?"
question_embedding = get_sentence_embeddings(question)
#len(sentence_embeddings[0]), len(question_embedding[0])

#1 - cosine(question_embedding[0], sentence_embeddings[0])

#len(get_sentence_embeddings(question))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# sentences = nltk.sent_tokenize(paragraph)
# sentences

In [None]:
sentences_and_scores = find_similar_sentences(question_embedding[0], sentence_embeddings)
sentences_and_scores

[(3, 0.778810018021589),
 (0, 0.7217199410624682),
 (1, 0.7071587574243141),
 (2, 0.5786675179475325)]

In [None]:
for i in range(0,len(sentences_and_scores)):
  sentence_index = sentences_and_scores[i][0]
  #print(sentence_index)
  print(sentences[sentence_index], sentences_and_scores[i][1])


Recession, which is a period of economic decline, can have a significant impact on all of these factors. 0.778810018021589
Commodities, stocks, oil, and recession are all important factors in the world of finance. 0.7217199410624682
Commodities are traded on global markets and their prices can be affected by supply and demand, geopolitical events, and economic indicators. 0.7071587574243141
Stocks, which represent ownership in a company, are also traded on stock markets. 0.5786675179475325


In [None]:
sentences[0]

'C'

In [None]:
summary

'Commodities, stocks, oil, and recession are all important factors in the world of finance. Commodities are traded on global markets and their prices can be affected by supply and demand, geopolitical events, and economic indicators. Stocks, which represent ownership in a company, are also traded on stock markets. Recession, which is a period of economic decline, can have a significant impact on all of these factors.'

In [None]:
sentences = nltk.sent_tokenize(summary)
sentences[0]

'Commodities, stocks, oil, and recession are all important factors in the world of finance.'