# BoW

In [None]:
#Dataset
#collection of corpus

# Example corpus (simplified)
corpus = [
    "Artificial intelligence is the simulation of human intelligence by machines.",
    "Machine learning is a subset of AI that enables machines to learn from data.",
    "Deep learning is a subfield of machine learning focused on neural networks.",
    "Natural language processing is a branch of AI that deals with text and speech.",
    "Reinforcement learning involves agents taking actions to maximize cumulative reward."
]


In [None]:
#creating a bow for our dataset
import numpy as np
from collections import Counter

# Create a basic vocabulary
vocabulary = list(set(" ".join(corpus).split()))

# Build the bag-of-words representation for the corpus
def build_bow_matrix(corpus, vocabulary):
    bow_matrix = []
    for doc in corpus:
        word_counts = Counter(doc.split())
        bow_vector = [word_counts[word] for word in vocabulary]
        bow_matrix.append(bow_vector)
    return np.array(bow_matrix)

bow_matrix = build_bow_matrix(corpus, vocabulary)
bow_matrix

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 2,
        0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
        0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
        0],
       [0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0]])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert query into BoW representation
def query_to_bow(query, vocabulary):
    word_counts = Counter(query.split())
    return np.array([word_counts[word] for word in vocabulary])

# Retrieve the most similar document to the query
def retrieve(query, bow_matrix, vocabulary, corpus):

    query_bow = query_to_bow(query, vocabulary)
    print(query_bow)
    query_bow = query_bow.reshape(1,-1)
    print(query_bow)

    similarities = cosine_similarity(query_bow, bow_matrix)
    print(similarities)
    similarities = similarities[0]
    print(similarities)
    best_idx = np.argmax(similarities)
    return corpus[best_idx]

# Example query
query = "How do machines learn?"
retrieved_doc = retrieve(query, bow_matrix, vocabulary, corpus)
print(f"Retrieved document: {retrieved_doc}")


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0]]
[[0.         0.26726124 0.         0.         0.        ]]
[0.         0.26726124 0.         0.         0.        ]
Retrieved document: Machine learning is a subset of AI that enables machines to learn from data.


In [None]:
import random

def simple_generator(retrieved_doc):
    # Simulate text generation by returning a sentence from the document
    sentences = retrieved_doc.split(".")
    response = random.choice(sentences).strip()
    return response if response else "Sorry, I don't have enough information."

# Generate a response based on the retrieved document
response = simple_generator(retrieved_doc)
print(f"Generated response: {response}")


Generated response: Machine learning is a subset of AI that enables machines to learn from data


In [None]:
def rag_pipeline(query, bow_matrix, vocabulary, corpus):
    # Step 1: Retrieve the most relevant document
    retrieved_doc = retrieve(query, bow_matrix, vocabulary, corpus)

    # Step 2: Generate a response based on the retrieved document
    response = simple_generator(retrieved_doc)

    return response

# Test the RAG pipeline
query = "Tell me about neural networks."
response = rag_pipeline(query, bow_matrix, vocabulary, corpus)
print(f"Final response: {response}")


[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0]
[[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0]]
[[0.         0.         0.37796447 0.         0.        ]]
[0.         0.         0.37796447 0.         0.        ]
Final response: Sorry, I don't have enough information.


# Word Embeddings


In [None]:
#Install Required modules
!pip install requests beautifulsoup4
!pip install sentence_transformers
!pip install faiss-cpu



In [None]:
#import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

In [None]:
#Funciton to scrap some papers using keywords

def scrape_google_scholar(keyword):
    '''
    inp:
      keyword - str, keyword to find papers

    out:
      papers - list, a list of papers found out for the given keywords
    '''

    search_url = f"https://scholar.google.com/scholar?q={keyword.replace(' ', '+')}"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    response = requests.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    papers = []
    for item in soup.find_all('div', class_='gs_ri'):
        title = item.find('h3').text
        author_info = item.find('div', class_='gs_a').text
        abstract = item.find('div', class_='gs_rs').text
        link = item.find('h3').find('a')['href'] if item.find('h3').find('a') else None
        papers.append({"title": title, "authors": author_info, "abstract": abstract, "link": link})

    return papers

# Example usage
keywords = [
    "Natural Language Processing", "Machine Learning", "Computer Vision", "Satellite Imagery", "Transformer",
    "Attention Mechanisms", "Remote Sensing", "Statistical Machine Learning", "Communications in Media",
    "Artificial Intelligence", "Deep Learning", "Reinforcement Learning", "Neural Networks", "Data Mining",
    "Big Data Analytics", "Data Science", "Image Processing", "Natural Language Generation", "Speech Recognition",
    "Predictive Modeling", "Computer Graphics", "Augmented Reality", "Virtual Reality", "Robotics", "Internet of Things",
    "Blockchain Technology", "Cybersecurity", "Wireless Networks", "5G Technology", "Cloud Computing",
    "Edge Computing", "Quantum Computing", "Software Engineering", "Human-Computer Interaction", "Bioinformatics",
    "Healthcare Analytics", "Social Media Analytics", "Sentiment Analysis", "Financial Technology", "E-commerce",
    "Smart Cities", "Smart Agriculture", "Energy Management", "Environmental Science", "Climate Change",
    "Geographical Information Systems", "Agricultural Robotics", "Sustainable Development", "Supply Chain Management",
    "Logistics Optimization", "Statistical Modeling", "Multivariate Analysis", "Time Series Analysis",
    "Data Visualization", "Statistics", "Mathematical Modeling", "Operations Research", "Game Theory",
    "Combinatorial Optimization", "Fuzzy Logic", "Evolutionary Algorithms", "Genetic Algorithms",
    "Swarm Intelligence", "Artificial Neural Networks", "Support Vector Machines", "Decision Trees",
    "Random Forests", "K-Nearest Neighbors", "Naive Bayes", "Principal Component Analysis", "Linear Regression",
    "Logistic Regression", "Clustering Techniques", "Deep Reinforcement Learning", "Transfer Learning",
    "Zero-Shot Learning", "Multi-Task Learning", "Domain Adaptation", "Active Learning", "Federated Learning",
    "Explainable AI", "Robustness in AI", "Ethics in AI", "Fairness in Machine Learning",
    "Computer Vision Applications", "Facial Recognition", "Object Detection", "Image Segmentation",
    "Video Analysis", "Augmented Reality Applications", "Natural Language Understanding", "Text Classification",
    "Information Retrieval", "Question Answering Systems", "Text Summarization", "Dialogue Systems",
    "Semantic Web", "Knowledge Graphs", "Machine Translation", "Cross-Language Information Retrieval",
    "Human-AI Collaboration", "Smart Manufacturing", "Predictive Maintenance", "Manufacturing Automation",
    "Industrial Internet of Things", "AI in Agriculture", "Precision Farming", "Weather Forecasting",
    "Remote Sensing Applications", "Satellite Data Processing", "GIS Applications", "Environmental Monitoring",
    "Soil Health Monitoring", "Crop Yield Prediction", "Urban Planning", "Disaster Management", "Resilience Engineering",
    "Network Security", "Information Systems Security", "Cyber Threat Intelligence", "Digital Forensics",
    "Secure Software Development", "Cloud Security", "Privacy in Big Data", "Data Governance",
    "Regulatory Compliance", "Healthcare Informatics", "Telemedicine", "Wearable Technology",
    "Fitness Tracking", "Medical Imaging", "Genomic Data Analysis", "Personalized Medicine", "Patient-Centered Care",
    "Public Health Informatics", "Health Data Privacy", "Clinical Decision Support Systems",
    "Health Information Exchange", "Predictive Analytics in Healthcare", "Consumer Behavior Analytics",
    "Brand Management", "Marketing Analytics", "Advertising Technology", "Customer Relationship Management",
    "E-commerce Analytics", "Social Network Analysis", "Influencer Marketing", "Content Marketing",
    "User Experience Design", "Mobile Application Development", "Web Development", "UI/UX Design",
    "Cloud-Native Applications", "Microservices Architecture", "DevOps", "Continuous Integration/Continuous Deployment",
    "API Development", "Software Quality Assurance", "Agile Methodologies", "Scrum", "Kanban",
    "Digital Transformation", "E-Government", "Smart Grids", "Smart Water Management", "Smart Transport",
    "Sustainable Transportation", "Urban Mobility", "Transportation Systems", "Public Transportation Systems",
    "Vehicle Automation", "Autonomous Vehicles", "Traffic Management Systems", "Infrastructure Planning",
    "City Resilience", "Social Innovation", "Civic Technology", "Participatory Design",
    "Environmental Policy", "Sustainable Energy", "Renewable Energy Sources", "Energy Efficiency",
    "Carbon Footprint Reduction", "Climate Adaptation", "Sustainable Urban Development",
    "Sustainable Agriculture", "Food Security", "Water Resource Management", "Waste Management",
    "Circular Economy", "Environmental Education", "Environmental Ethics", "Ecological Footprint",
    "Conservation Biology", "Wildlife Management", "Habitat Restoration", "Biodiversity Monitoring",
    "Ecosystem Services", "Marine Biology", "Oceanography", "Climate Science", "Meteorology",
    "Geology", "Geomorphology", "Hydrology", "Soil Science", "Agronomy", "Plant Pathology",
    "Entomology", "Forestry", "Sustainable Fisheries", "Aquaculture", "Environmental Chemistry",
    "Toxicology", "Phytoremediation", "Bioremediation", "Environmental Impact Assessment",
    "Life Cycle Assessment", "Sustainable Development Goals", "Corporate Social Responsibility",
    "Social Entrepreneurship", "Impact Investing", "Community Development", "Nonprofit Management",
    "Fundraising", "Volunteering", "Social Work", "Human Rights", "Sustainability Education",
    "Environmental Activism", "Youth Leadership", "Community Organizing", "Advocacy", "Social Movements",
    "Cultural Studies", "Media Studies", "Political Science", "International Relations",
    "Conflict Resolution", "Peace Studies", "Global Governance", "Humanitarian Assistance",
    "Development Studies", "Gender Studies", "Indigenous Studies", "Migration Studies",
    "Postcolonial Studies", "Critical Theory", "Sociology", "Psychology", "Economics",
    "Behavioral Economics", "Cognitive Science", "Anthropology", "History", "Philosophy",
    "Comparative Literature", "Linguistics", "Education", "Curriculum Development", "Pedagogy",
    "Educational Technology", "Distance Education", "Learning Analytics", "Higher Education",
    "Adult Education", "Special Education", "Early Childhood Education", "Multicultural Education",
    "STEM Education", "Arts Education", "Music Education", "Physical Education", "Language Acquisition",
    "Second Language Learning", "Language Policy", "Literacy Studies", "Educational Assessment",
    "Teacher Training", "Educational Leadership", "School Reform", "Community Colleges",
    "Vocational Education", "Workforce Development", "Career Development", "Lifelong Learning",
    "Educational Equity", "Inclusive Education", "Diversity in Education", "Environmental Education",
    "Sustainability in Education", "Civic Engagement in Education", "Education Policy",
    "Educational Research", "Qualitative Research", "Quantitative Research", "Mixed Methods Research",
    "Action Research", "Case Studies", "Program Evaluation", "Research Ethics", "Data Analysis",
    "Statistical Analysis", "Experimental Design", "Survey Research", "Field Research", "Ethnography",
    "Grounded Theory", "Content Analysis", "Discourse Analysis", "Narrative Analysis", "Thematic Analysis",
    "Mixed Methods Design", "Research Methodology", "Research Design", "Research Funding",
    "Research Collaboration", "Research Communication", "Knowledge Mobilization", "Public Engagement in Research",
    "Open Science", "Citizen Science", "Research Impact", "Social Media and Research", "Research Metrics",
    "Bibliometrics", "Altmetrics", "Academic Publishing", "Peer Review", "Open Access Publishing",
    "Research Journals", "Conference Proceedings", "Thesis and Dissertation", "Academic Writing",
    "Scientific Writing", "Grant Writing", "Literature Review", "Research Proposal", "Research Presentation",
    "Research Networks", "Academic Conferences", "Research Associations", "Professional Organizations",
    "Research Communities", "Research Trends", "Emerging Research Areas", "Future Research Directions",
    "Research Careers", "Academic Job Market", "Research Ethics Committees", "Research Integrity",
    "Responsible Conduct of Research", "Research Governance", "Intellectual Property in Research",
    "Research Collaboration Tools", "Virtual Research Environments", "Research Data Management",
    "Data Sharing", "Research Dissemination", "Research Funding Agencies", "Research Policy",
    "Research Priorities", "Research Infrastructure", "Interdisciplinary Research", "Collaborative Research",
    "Transdisciplinary Research", "Participatory Research", "Community-Based Research", "Knowledge Translation",
    "Research Partnerships", "International Research", "Cross-Cultural Research", "Comparative Research",
    "Policy Research", "Evidence-Based Policy", "Policy Analysis", "Policy Evaluation", "Policy Advocacy",
    "Health Policy", "Education Policy", "Environmental Policy", "Social Policy", "Economic Policy",
    "International Development", "Global Health", "Public Policy", "Local Governance", "Public Administration",
    "Public Finance", "Public Services", "Service Delivery", "Public Sector Reform", "Good Governance",
]



all_papers = [] #we will store all the papers informations for all the keywords
for keyword in keywords:
    print(f"Scraping papers for keyword: {keyword}")
    papers = scrape_google_scholar(keyword)
    all_papers.extend(papers)
    time.sleep(2)  # Sleep to avoid overwhelming the server




Scraping papers for keyword: Natural Language Processing
Scraping papers for keyword: Machine Learning
Scraping papers for keyword: Computer Vision
Scraping papers for keyword: Satellite Imagery
Scraping papers for keyword: Transformer
Scraping papers for keyword: Attention Mechanisms
Scraping papers for keyword: Remote Sensing
Scraping papers for keyword: Statistical Machine Learning
Scraping papers for keyword: Communications in Media
Scraping papers for keyword: Artificial Intelligence
Scraping papers for keyword: Deep Learning
Scraping papers for keyword: Reinforcement Learning
Scraping papers for keyword: Neural Networks
Scraping papers for keyword: Data Mining
Scraping papers for keyword: Big Data Analytics
Scraping papers for keyword: Data Science
Scraping papers for keyword: Image Processing
Scraping papers for keyword: Natural Language Generation
Scraping papers for keyword: Speech Recognition
Scraping papers for keyword: Predictive Modeling
Scraping papers for keyword: Comput

In [None]:
#creating and saving a dataframe for the papers for furthur use
df = pd.DataFrame(all_papers)
df.to_csv('papers.csv')

490

In [13]:
#use a pretrianed model to create embeddings
#you can also yours

# Load pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for titles and abstracts and save as a column on df dataframe
df['title_embedding'] = df['title'].apply(lambda x: model.encode(x))
df['abstract_embedding'] = df['abstract'].apply(lambda x: model.encode(x))


  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [16]:
# Create a structured list for indexing
#it will help us to retrive important information about papers during response
#Like for resutled papers we can easily find out the information like authors or link or title or abstract
indexed_papers = df[['title', 'authors', 'abstract', 'link', 'title_embedding', 'abstract_embedding']].to_dict(orient='records')

In [34]:
#we will use FAISS to similarity search
#Convert embeddings to a NumPy array for FAISS indexing
title_embeddings = np.array(df['title_embedding'].tolist()).astype('float32')
abstract_embeddings = np.array(df['abstract_embedding'].tolist()).astype('float32')

# Initialize FAISS index for title embeddings
dimension = title_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(dimension)   # L2 distance index for title embeddings
index.add(title_embeddings)             # Add title embeddings to the index

# Function to retrieve similar papers based on title query
def retrieve_similar_papers(query, top_n=7):
    '''
    Inp:
      query - str, query from the prompt by user . could be a tile or texts
      top_n - int, number of similar papers we will return

    Out:

      result - list, distance and paper information form indexed_paper for top_n number of similar papers

    '''
    query_embedding = model.encode(query).astype('float32').reshape(1, -1)  # Reshape for FAISS
    distances, indices = index.search(query_embedding, top_n)  # Perform the search

    if len(indices) == 0 or indices[0][0] == -1:
        return "Sorry, could not find any relevant papers. I am still learning. Someday I will help you, InshaAllah."

    results = []
    for i in range(top_n):
        paper = indexed_papers[indices[0][i]]
        results.append((paper, distances[0][i]))

    if not results:
        return "Sorry, could not find any relevant papers. I am still learning. Someday I will help you, InshaAllah."

    return results


In [32]:
#uncomment and run to understand
'''
query = "Machine learning on classifications"
similar_papers = retrieve_similar_papers(query)

print("Top similar papers:")
for paper, score in similar_papers:
    print(f"Title: {paper['title']}, Authors: {paper['authors']}, Distance: {score:.4f}, Link: {paper['link']}")

'''

'\nquery = "Machine learning on classifications"\nsimilar_papers = retrieve_similar_papers(query)\n\nprint("Top similar papers:")\nfor paper, score in similar_papers:\n    print(f"Title: {paper[\'title\']}, Authors: {paper[\'authors\']}, Distance: {score:.4f}, Link: {paper[\'link\']}")\n\n'

In [35]:
def generate_response(similar_papers):
    '''
    A simple response generator funciton based on our retrived information

    Inp:
      similar_paper:list, output of the retrival function.
    Out:
      response:str, a response based on our retirved information to the user
    '''
    response = "Here are some papers that are similar to your query:\n\n"
    for paper, distance in similar_papers:
        response += f"**Title:** {paper['title']}\n"
        response += f"**Authors:** {paper['authors']}\n"
        response += f"**Abstract:** {paper['abstract']}\n"
        response += f"**Link:** [View Paper]({paper['link']})\n"
        response += f"**Similarity Score:** {distance:.4f}\n\n"
    return response


In [36]:
def rag_pipeline(query):
    '''
    The pipeline or main function regarding our RAG system

    Inp:
      query - str, the query from the prompt/user

    Out:
      response - str, response from generate_response functoin
    '''
    # Retrieve similar papers based on the query
    similar_papers = retrieve_similar_papers(query)

    # Generate a response based on the retrieved papers
    response = generate_response(similar_papers)

    return response


In [40]:
# Example query
query = "crops classifications using satellite imagery" #Enter your query and run
response = rag_pipeline(query)

# Print the generated response
print(response)


Here are some papers that are similar to your query:

**Title:** Using self-organizing maps to identify patterns in satellite imagery
**Authors:** AJ Richardson, C Risien, FA Shillington - Progress in Oceanography, 2003 - Elsevier
**Abstract:** … in our examples are satellite images. Prior to training, the weights are initialized with starting 
(usually random) values. This is equivalent to starting with random images on each of the …
**Link:** [View Paper](https://www.sciencedirect.com/science/article/pii/S007966110300171X)
**Similarity Score:** 38.1279

**Title:** Current and future applications of statistical machine learning algorithms for agricultural machine vision systems
**Authors:** TU Rehman, MS Mahmud, YK Chang, J Jin… - … and electronics in …, 2019 - Elsevier
**Abstract:** … learning have been utilized for agriculture. This paper comprehensively surveyed current 
application of statistical machine learning techniques in machine … statistical machine learning …
**Link:** [Vi