In [None]:
pip install sentence-transformers

In [None]:
pip install faiss-gpu

In [None]:
pip install datasketch

In [3]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
import nltk
# Title-similarity article recommender.
#
# Flow: load a sentence-embedding model and the articles CSV, embed every
# article heading together with the user's query, rank headings by cosine
# similarity to the query, print the top 10, then print the details of one
# article chosen by ID.

# WordNet data is required by WordNetLemmatizer below.
nltk.download('wordnet')

# Load pre-trained BERT model
# NOTE(review): the upstream model card marks this checkpoint as deprecated
# (low-quality embeddings). It is kept as-is so the rankings do not change;
# consider switching to a currently recommended model.
model_name = 'bert-base-nli-mean-tokens'
model = SentenceTransformer(model_name)

# Load dataset
df = pd.read_csv("crime articles.csv")  # Adjust the filename as per your dataset

# Assign unique IDs to each row (0..len(df)-1, matching the default row order)
df['article_id'] = range(len(df))

# Process user input
user_input_text = input("Enter your input text: ")

# Tokenize user input and perform stemming and lemmatization.
# NOTE(review): these tokens are not used by the ranking below, which embeds
# the raw `user_input_text`. They are kept in case a later cell reads them.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()
user_input_tokens = [lemmatizer.lemmatize(stemmer.stem(token)) for token in user_input_text.split()]

# Concatenate titles and user input for BERT embeddings; the query is placed
# last so it can be split off again by position after encoding.
corpus = df['heading'].tolist() + [user_input_text]

# Encode sentences using BERT
embeddings = model.encode(corpus, show_progress_bar=True)

# Split encoded embeddings back into articles and user input
article_embeddings = embeddings[:-1]
user_input_embedding = embeddings[-1]

# Calculate cosine similarities between user input and article titles.
# cosine_similarity returns a (1, n_articles) matrix; take its single row.
similarities = cosine_similarity([user_input_embedding], article_embeddings)[0]

# Add similarity scores to dataframe
df['similarity'] = similarities

# Sort articles based on similarity (most similar first)
sorted_df = df.sort_values(by='similarity', ascending=False)

# Display recommended articles (IDs and names)
recommended_articles = sorted_df[['article_id', 'heading', 'content_summary']].head(10)
print("Recommended Articles:")
print(recommended_articles)

# User selects an article ID. Non-numeric input is treated the same as an
# unknown ID instead of aborting the cell with a ValueError.
try:
    selected_article_id = int(input("Enter the ID of the article you want to see details of: "))
except ValueError:
    selected_article_id = None

# Display details of the selected article
if selected_article_id is None:
    # Empty frame with the same columns, so the variable keeps its type.
    selected_article_details = df.iloc[0:0]
else:
    selected_article_details = df[df['article_id'] == selected_article_id]

if not selected_article_details.empty:
    print("\nDetails of Selected Article (ID {}):".format(selected_article_id))
    selected_article = selected_article_details.iloc[0]
    print("Heading:", selected_article['heading'])
    print("Content Summary:", selected_article['content_summary'])
    # The link column is optional in the dataset; print it only when present
    # rather than raising KeyError.
    if 'article_link' in selected_article.index:
        print("Article Link:", selected_article['article_link'])
else:
    print("Invalid article ID. Please enter a valid ID.")


[nltk_data] Downloading package wordnet to /root/nltk_data...
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Enter your input text: killed


Batches:   0%|          | 0/239 [00:00<?, ?it/s]

Recommended Articles:
      article_id                                            heading  \
1998        1998                      This story has been removed.    
3247        3247               Elderly couple found dead in Howrah    
14            14      Aurangabad: Two destitute murdered near GMCH    
1432        1432     Delhi: Suffocation? Mystery over death of two    
6664        6664                  Contractor found dead in Cuttack    
4572        4572   MMRDA official falls to death, suicide angle ...   
3717        3717   Bhopal: Kolar SHO, SI removed from active dut...   
3911        3911            Pune: Tailor found bludgeoned to death    
739          739     Senior citizen found murdered in north Mysuru    
6251        6251                   Mentally ill boy thrashed, dies    

                                        content_summary  
1998                      This story has been removed.   
3247   Bodies of an elderly couple living at a gover...  
14     Two destitute r