In [None]:
import requests
from transformers import AutoModelForCausalLM, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Fetch headlines from NewsAPI specific to India
def fetch_headlines(api_key):
    """Fetch the current top-headline titles for India from NewsAPI.

    Args:
        api_key: NewsAPI key sent as the ``apiKey`` query parameter.

    Returns:
        A list of headline title strings. Articles whose title is missing
        or empty are skipped.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Pass the query as ``params`` so the key is URL-encoded by requests
    # instead of being interpolated into the URL by hand.
    response = requests.get(
        "https://newsapi.org/v2/top-headlines",
        params={"country": "in", "apiKey": api_key},
        timeout=10,  # never hang indefinitely on a stalled connection
    )
    # Surface auth / rate-limit errors instead of silently treating the
    # error payload (which has no 'articles' key) as "no headlines".
    response.raise_for_status()
    articles = response.json().get('articles') or []
    return [article['title'] for article in articles if article.get('title')]

# Preprocess text
def preprocess_text(text):
    """Return *text* with non-word runs collapsed to a space, lowercased.

    Every maximal run of characters matched by ``\\W`` is replaced with a
    single space (so leading/trailing punctuation leaves a leading/trailing
    space), and the result is then converted to lowercase.
    """
    non_word_runs = r'\W+'
    return re.sub(non_word_runs, ' ', text).lower()

# Load Meta-Llama model and tokenizer
def load_llama_model(model_name="meta-llama/Meta-Llama-3-8B"):
    """Load a causal language model and its matching tokenizer.

    Args:
        model_name: Hugging Face model identifier (or local path) passed to
            both ``from_pretrained`` calls. Defaults to the Meta-Llama-3-8B
            checkpoint, so calling with no arguments behaves as before.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Use one identifier for both loads so the tokenizer can never drift
    # out of sync with the model checkpoint.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model, tokenizer

# Generate paraphrase using Meta-Llama
def paraphrase_text(text, model, tokenizer):
    """Generate a paraphrase of *text* with a causal language model.

    A causal LM only continues its prompt, and ``generate`` returns the
    prompt tokens followed by the continuation. The text is therefore
    wrapped in an explicit paraphrase instruction and only the newly
    generated tokens are decoded, so the prompt is not echoed back as part
    of the "paraphrase".

    Args:
        text: The sentence to paraphrase.
        model: A causal LM exposing ``generate`` (as returned by
            ``load_llama_model``).
        tokenizer: The tokenizer matching *model*.

    Returns:
        The first non-empty line of the model's continuation, stripped of
        surrounding whitespace. Falls back to *text* unchanged when the
        model produces no usable output.
    """
    prompt = f"Paraphrase the following sentence.\nSentence: {text}\nParaphrase:"
    # NOTE: truncation cuts from the end, so an extremely long *text* would
    # lose the trailing "Paraphrase:" cue; inputs here are headline-sized.
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
    inputs = inputs.to(model.device)  # keep tensors on the model's device
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        # Budget only the continuation; max_length would also count the
        # prompt tokens against the limit.
        max_new_tokens=64,
        num_beams=5,
        early_stopping=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # Drop the echoed prompt and keep just what the model added.
    continuation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    for line in continuation.splitlines():
        candidate = line.strip()
        if candidate:
            return candidate
    # Nothing usable was generated; the original text is the safest stand-in.
    return text

# Compute similarity
def get_similarity(text1, text2):
    """Return the TF-IDF cosine similarity between two texts.

    Both texts are vectorized together with a fresh ``TfidfVectorizer`` and
    the cosine similarity of the two resulting rows is returned.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity as a float. ``0.0`` is returned when the vectorizer
        cannot build a vocabulary from the two texts (for example when both
        are empty), since there is nothing to compare.
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # fit_transform raises ValueError on an empty vocabulary; treat
        # that as "no overlap" rather than aborting the caller's loop.
        return 0.0
    # cosine_similarity accepts the sparse matrix directly, so no dense
    # copy is needed; entry [0][1] is text1 versus text2.
    return float(cosine_similarity(tfidf_matrix)[0][1])

# Determine if user input matches any headline
def is_matching(user_input, headlines, model, tokenizer, threshold=0.8):
    """Report whether the user's text matches any of the given headlines.

    The user input is preprocessed and paraphrased once, then compared
    against each preprocessed headline using ``get_similarity``.

    Args:
        user_input: Free-text statement supplied by the user.
        headlines: Iterable of headline strings to compare against.
        model: Language model forwarded to ``paraphrase_text``.
        tokenizer: Tokenizer forwarded to ``paraphrase_text``.
        threshold: Similarity a headline must strictly exceed to count as
            a match. Defaults to 0.8.

    Returns:
        True as soon as one headline scores above *threshold*, otherwise
        False.
    """
    if not headlines:
        # Nothing to compare against; skip the expensive generation step.
        return False

    paraphrased_input = paraphrase_text(preprocess_text(user_input), model, tokenizer)

    for headline in headlines:
        if not headline:
            # A missing or empty title cannot match and would break preprocessing.
            continue
        if get_similarity(paraphrased_input, preprocess_text(headline)) > threshold:
            return True
    return False

# Example usage
import os

# Credentials must not live in source control: read the NewsAPI key from
# the environment and fail with a clear message when it is not configured.
api_key = os.environ.get('NEWSAPI_KEY')
if not api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable to your NewsAPI key.")
user_input = 'The government announced new policies for economic growth.'

# Fetch headlines first: it is the cheap step, so a bad key or a network
# problem surfaces before the multi-gigabyte model is loaded.
headlines = fetch_headlines(api_key)

# Load the model and tokenizer
model, tokenizer = load_llama_model()

# Check for matching headlines
match = is_matching(user_input, headlines, model, tokenizer)

if match:
    print("The user input matches with a news headline.")
else:
    print("The user input does not match any news headline.")


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]