In [7]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.0.1


In [12]:
import os
import re

import requests
from sentence_transformers import SentenceTransformer, util
from transformers import AutoModelForCausalLM, AutoTokenizer

def fetch_headlines(api_key, keyword, timeout=10):
    """Fetch article titles for ``keyword`` from the NewsAPI ``everything`` endpoint.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.
        keyword: Search phrase, sent as the ``q`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of title strings. Articles whose ``title`` is missing or empty
        are skipped; an absent ``articles`` field yields an empty list.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (previously such responses were silently reported as "no headlines").
        requests.RequestException: On connection problems or timeout.
    """
    # Let requests build the query string so spaces, '&', '#', etc. in the
    # keyword are percent-encoded instead of corrupting the URL.
    response = requests.get(
        "https://newsapi.org/v2/everything",
        params={"q": keyword, "apiKey": api_key},
        timeout=timeout,
    )
    response.raise_for_status()
    articles = response.json().get('articles') or []
    # Downstream text preprocessing needs real strings, so drop empty titles.
    return [article['title'] for article in articles if article.get('title')]

def preprocess_text(text):
    """Normalize ``text`` for comparison.

    Each run of non-word characters is replaced by a single space (letters,
    digits and underscores are kept), then the result is lower-cased and
    leading/trailing whitespace is removed.
    """
    collapsed = re.sub(r'\W+', ' ', text)
    return collapsed.lower().strip()

def load_opt_model():
    """Load the ``facebook/opt-350m`` causal language model and its tokenizer.

    Returns:
        A ``(model, tokenizer)`` tuple, in that order.
    """
    checkpoint = "facebook/opt-350m"
    opt_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    opt_model = AutoModelForCausalLM.from_pretrained(checkpoint)
    return opt_model, opt_tokenizer

def paraphrase_text(text, model, tokenizer, verbose=True):
    """Generate text from ``text`` with a causal LM using 5-beam search.

    Args:
        text: Input string; it is truncated to at most 512 tokens.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Callable that tokenizes ``text`` and exposes ``decode``.
        verbose: When true (the default), print the decoded text.

    Returns:
        The decoded first returned sequence, with special tokens removed.

    NOTE(review): the recorded notebook output shows the returned string is
    the input followed by repeated copies of itself, i.e. the model continues
    the prompt rather than rewording it. If a real paraphrase is required, a
    model trained for paraphrasing is presumably needed -- confirm intent.
    """
    max_total_tokens = 512
    inputs = tokenizer(text, return_tensors="pt", max_length=max_total_tokens, truncation=True)

    # `max_length` in generate() counts the prompt tokens too, so a prompt that
    # already fills the 512-token window would leave nothing to generate.
    # Budget the *new* tokens explicitly: same overall cap for normal inputs,
    # and at least one new token when the prompt is at the limit.
    prompt_length = inputs["input_ids"].shape[-1]
    new_token_budget = max(1, max_total_tokens - prompt_length)

    outputs = model.generate(
        **inputs,
        max_new_tokens=new_token_budget,
        num_beams=5,
        early_stopping=True,
    )
    paraphrased_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if verbose:
        print(f"Paraphrased Text: {paraphrased_text}")
    return paraphrased_text

def load_sentence_model():
    """Return a ``SentenceTransformer`` built from ``'all-MiniLM-L6-v2'``."""
    return SentenceTransformer('all-MiniLM-L6-v2')

def get_similarity(text1, text2, model):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are encoded in a single ``model.encode`` call (as tensors) and
    the similarity of the two resulting vectors is returned as a Python scalar.
    """
    first_vec, second_vec = model.encode([text1, text2], convert_to_tensor=True)
    score = util.pytorch_cos_sim(first_vec, second_vec)
    return score.item()

def is_matching(user_input, headlines, model, tokenizer, sentence_model, threshold=0.8):
    """Report whether ``user_input`` is semantically close to any headline.

    The input is preprocessed and passed through ``paraphrase_text`` once; each
    headline is preprocessed and compared against that text with
    ``get_similarity``.

    Args:
        user_input: The statement to check.
        headlines: Iterable of headline strings; ``None``/empty entries are skipped.
        model: Generation model forwarded to ``paraphrase_text``.
        tokenizer: Tokenizer forwarded to ``paraphrase_text``.
        sentence_model: Embedding model forwarded to ``get_similarity``.
        threshold: A headline matches when its similarity is strictly greater
            than this value (default 0.8, the previously hard-coded cutoff).

    Returns:
        True as soon as one headline exceeds the threshold, otherwise False.
    """
    query_text = paraphrase_text(preprocess_text(user_input), model, tokenizer)

    for headline in headlines:
        # Missing titles cannot be preprocessed (the regex needs a string).
        if not headline:
            continue
        score = get_similarity(query_text, preprocess_text(headline), sentence_model)
        if score > threshold:
            return True
    return False

# Example usage
api_key = 'Give Your Own Key'
keyword = 'Indian Navy Instagram nuke reveal'
user_input = 'Indian Navy accidentally reveals nuke information in an Instagram post.'

model, tokenizer = load_opt_model()
sentence_model = load_sentence_model()
headlines = fetch_headlines(api_key, keyword)
match = is_matching(user_input, headlines, model, tokenizer, sentence_model)

if match:
    print("The user input matches with a news headline.")
else:
    print("The user input does not match any news headline.")


Paraphrased Text: indian navy accidentally reveals nuke information in an instagram post

indian navy accidentally reveals nuke information in an instagram post

indian navy accidentally reveals nuke information in an instagram post

indian navy accidentally reveals nuke information in an instagram post

indian navy accidentally reveals nuke information in an instagram post
The user input matches with a news headline.
