# BoW

In [None]:
#Dataset
#collection of corpus

# Example corpus (simplified)
corpus = [
    "Artificial intelligence is the simulation of human intelligence by machines.",
    "Machine learning is a subset of AI that enables machines to learn from data.",
    "Deep learning is a subfield of machine learning focused on neural networks.",
    "Natural language processing is a branch of AI that deals with text and speech.",
    "Reinforcement learning involves agents taking actions to maximize cumulative reward."
]


In [None]:
#creating a bow for our dataset
import numpy as np
from collections import Counter

# Create a basic vocabulary
vocabulary = list(set(" ".join(corpus).split()))

# Build the bag-of-words representation for the corpus
def build_bow_matrix(corpus, vocabulary):
    bow_matrix = []
    for doc in corpus:
        word_counts = Counter(doc.split())
        bow_vector = [word_counts[word] for word in vocabulary]
        bow_matrix.append(bow_vector)
    return np.array(bow_matrix)

bow_matrix = build_bow_matrix(corpus, vocabulary)
bow_matrix

array([[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 2,
        0],
       [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
        0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 1, 1, 2, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
        0],
       [0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
        1],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0]])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert query into BoW representation
def query_to_bow(query, vocabulary):
    word_counts = Counter(query.split())
    return np.array([word_counts[word] for word in vocabulary])

# Retrieve the most similar document to the query
def retrieve(query, bow_matrix, vocabulary, corpus):

    query_bow = query_to_bow(query, vocabulary)
    print(query_bow)
    query_bow = query_bow.reshape(1,-1)
    print(query_bow)

    similarities = cosine_similarity(query_bow, bow_matrix)
    print(similarities)
    similarities = similarities[0]
    print(similarities)
    best_idx = np.argmax(similarities)
    return corpus[best_idx]

# Example query
query = "How do machines learn?"
retrieved_doc = retrieve(query, bow_matrix, vocabulary, corpus)
print(f"Retrieved document: {retrieved_doc}")


[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0]]
[[0.         0.26726124 0.         0.         0.        ]]
[0.         0.26726124 0.         0.         0.        ]
Retrieved document: Machine learning is a subset of AI that enables machines to learn from data.


In [None]:
import random

def simple_generator(retrieved_doc):
    # Simulate text generation by returning a sentence from the document
    sentences = retrieved_doc.split(".")
    response = random.choice(sentences).strip()
    return response if response else "Sorry, I don't have enough information."

# Generate a response based on the retrieved document
response = simple_generator(retrieved_doc)
print(f"Generated response: {response}")


Generated response: Machine learning is a subset of AI that enables machines to learn from data


In [None]:
def rag_pipeline(query, bow_matrix, vocabulary, corpus):
    # Step 1: Retrieve the most relevant document
    retrieved_doc = retrieve(query, bow_matrix, vocabulary, corpus)

    # Step 2: Generate a response based on the retrieved document
    response = simple_generator(retrieved_doc)

    return response

# Test the RAG pipeline
query = "Tell me about neural networks."
response = rag_pipeline(query, bow_matrix, vocabulary, corpus)
print(f"Final response: {response}")


[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0]
[[0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 1 0 0 0 0]]
[[0.         0.         0.37796447 0.         0.        ]]
[0.         0.         0.37796447 0.         0.        ]
Final response: Sorry, I don't have enough information.


# Word Embeddings


In [26]:
!apt-get install -y tor

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tor is already the newest version (0.4.6.10-1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [27]:
#Install Required modules
!pip install requests beautifulsoup4
!pip install sentence_transformers
!pip install faiss-cpu
!pip install selenium
!pip install webdriver_manager
!pip install stem
!pip install scholarly



In [30]:
#import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

import tkinter as tk
from tkinter import filedialog
import random
import logging


from stem import Signal
from stem.control import Controller
from scholarly import scholarly, ProxyGenerator

In [31]:
import os

# Start Tor in the background
os.system('tor &')

0

In [33]:
!tor &


Oct 06 05:43:20.895 [notice] Tor 0.4.6.10 running on Linux with Libevent 2.1.12-stable, OpenSSL 3.0.2, Zlib 1.2.11, Liblzma 5.2.5, Libzstd 1.4.8 and Glibc 2.35 as libc.
Oct 06 05:43:20.895 [notice] Tor can't help you if you use it wrong! Learn how to be safe at https://support.torproject.org/faq/staying-anonymous/
Oct 06 05:43:20.895 [notice] Read configuration file "/etc/tor/torrc".
Oct 06 05:43:20.897 [notice] Opening Socks listener on 127.0.0.1:9050
Oct 06 05:43:20.897 [warn] Could not bind to 127.0.0.1:9050: Address already in use. Is Tor already running?
Oct 06 05:43:20.898 [warn] Failed to parse/validate config: Failed to bind one of the listener ports.


In [32]:
!service tor status


 * cannot read PID file /run/tor/tor.pid


In [49]:

# Set up the ProxyGenerator to use ScraperAPI
pg = ProxyGenerator()
pg.ScraperAPI("a46748cb98c5a7e9fc1fd815eb8585a0")  # Your ScraperAPI key
scholarly.use_proxy(pg)

# Function to get Google Scholar results using ProxyGenerator
def scrape_google_scholar(keyword, num_results=3):
    try:
        # Search for papers on Google Scholar
        search_query = scholarly.search_pubs(keyword)

        papers = []
        for i in range(num_results):  # Get up to num_results results
            try:
                paper = next(search_query)

                # Access the relevant fields safely
                title = paper.get('bib', {}).get('title', 'N/A')
                author = paper.get('bib', {}).get('author', 'N/A')
                year = paper.get('bib', {}).get('pub_year', 'N/A')
                url = paper.get('pub_url', 'N/A')
                abstract = paper.get('bib', {}).get('abstract', 'N/A')

                paper_info = {
                    'title': title,
                    'author': author,
                    'year': year,
                    'url': url,
                    'abstract': abstract
                }

                papers.append(paper_info)
            except StopIteration:
                break  # No more results

        print(f"Scraped {len(papers)} papers for keyword: '{keyword}'")
        return papers

    except Exception as e:
        print(f"Error processing keyword '{keyword}': {str(e)}")
        return []
#hhh
keywords = [
    "Natural Language Processing", "Machine Learning", "Computer Vision", "Satellite Imagery", "Transformer",
    "Attention Mechanisms", "Remote Sensing", "Statistical Machine Learning", "Communications in Media",
    "Artificial Intelligence", "Deep Learning", "Reinforcement Learning", "Neural Networks", "Data Mining",
    "Big Data Analytics", "Data Science", "Image Processing", "Natural Language Generation", "Speech Recognition",
    "Predictive Modeling", "Computer Graphics", "Augmented Reality", "Virtual Reality", "Robotics", "Internet of Things",
    "Blockchain Technology", "Cybersecurity", "Wireless Networks", "5G Technology", "Cloud Computing",
    "Edge Computing", "Quantum Computing", "Software Engineering", "Human-Computer Interaction", "Bioinformatics",
    "Healthcare Analytics", "Social Media Analytics", "Sentiment Analysis", "Financial Technology", "E-commerce",
    "Smart Cities", "Smart Agriculture", "Energy Management", "Environmental Science", "Climate Change",
    "Geographical Information Systems", "Agricultural Robotics", "Sustainable Development", "Supply Chain Management",
    "Logistics Optimization", "Statistical Modeling", "Multivariate Analysis", "Time Series Analysis",
    "Data Visualization", "Statistics", "Mathematical Modeling", "Operations Research", "Game Theory",
    "Combinatorial Optimization", "Fuzzy Logic", "Evolutionary Algorithms", "Genetic Algorithms",
    "Swarm Intelligence", "Artificial Neural Networks", "Support Vector Machines", "Decision Trees",
    "Random Forests", "K-Nearest Neighbors", "Naive Bayes", "Principal Component Analysis", "Linear Regression",
    "Logistic Regression", "Clustering Techniques", "Deep Reinforcement Learning", "Transfer Learning",
    "Zero-Shot Learning", "Multi-Task Learning", "Domain Adaptation", "Active Learning", "Federated Learning",
    "Explainable AI", "Robustness in AI", "Ethics in AI", "Fairness in Machine Learning",
    "Computer Vision Applications", "Facial Recognition", "Object Detection", "Image Segmentation",
    "Video Analysis", "Augmented Reality Applications", "Natural Language Understanding", "Text Classification",
    "Information Retrieval", "Question Answering Systems", "Text Summarization", "Dialogue Systems",
    "Semantic Web", "Knowledge Graphs", "Machine Translation", "Cross-Language Information Retrieval",
    "Human-AI Collaboration", "Smart Manufacturing", "Predictive Maintenance", "Manufacturing Automation",
    "Industrial Internet of Things", "AI in Agriculture", "Precision Farming", "Weather Forecasting",
    "Remote Sensing Applications", "Satellite Data Processing", "GIS Applications", "Environmental Monitoring",
    "Soil Health Monitoring", "Crop Yield Prediction", "Urban Planning", "Disaster Management", "Resilience Engineering",
    "Network Security", "Information Systems Security", "Cyber Threat Intelligence", "Digital Forensics",
    "Secure Software Development", "Cloud Security", "Privacy in Big Data", "Data Governance",
    "Regulatory Compliance", "Healthcare Informatics", "Telemedicine", "Wearable Technology",
    "Fitness Tracking", "Medical Imaging", "Genomic Data Analysis", "Personalized Medicine", "Patient-Centered Care",
    "Public Health Informatics", "Health Data Privacy", "Clinical Decision Support Systems",
    "Health Information Exchange", "Predictive Analytics in Healthcare", "Consumer Behavior Analytics",
    "Brand Management", "Marketing Analytics", "Advertising Technology", "Customer Relationship Management",
    "E-commerce Analytics", "Social Network Analysis", "Influencer Marketing", "Content Marketing",
    "User Experience Design", "Mobile Application Development", "Web Development", "UI/UX Design",
    "Cloud-Native Applications", "Microservices Architecture", "DevOps", "Continuous Integration/Continuous Deployment",
    "API Development", "Software Quality Assurance", "Agile Methodologies", "Scrum", "Kanban",
    "Digital Transformation", "E-Government", "Smart Grids", "Smart Water Management", "Smart Transport",
    "Sustainable Transportation", "Urban Mobility", "Transportation Systems", "Public Transportation Systems",
    "Vehicle Automation", "Autonomous Vehicles", "Traffic Management Systems", "Infrastructure Planning",
    "City Resilience", "Social Innovation", "Civic Technology", "Participatory Design",
    "Environmental Policy", "Sustainable Energy", "Renewable Energy Sources", "Energy Efficiency",
    "Carbon Footprint Reduction", "Climate Adaptation", "Sustainable Urban Development",
    "Sustainable Agriculture", "Food Security", "Water Resource Management", "Waste Management",
    "Circular Economy", "Environmental Education", "Environmental Ethics", "Ecological Footprint",
    "Conservation Biology", "Wildlife Management", "Habitat Restoration", "Biodiversity Monitoring",
    "Ecosystem Services", "Marine Biology", "Oceanography", "Climate Science", "Meteorology",
    "Geology", "Geomorphology", "Hydrology", "Soil Science", "Agronomy", "Plant Pathology",
    "Entomology", "Forestry", "Sustainable Fisheries", "Aquaculture", "Environmental Chemistry",


    "Toxicology", "Phytoremediation", "Bioremediation", "Environmental Impact Assessment",
    "Life Cycle Assessment", "Sustainable Development Goals", "Corporate Social Responsibility",
    "Social Entrepreneurship", "Impact Investing", "Community Development", "Nonprofit Management",
    "Fundraising", "Volunteering", "Social Work", "Human Rights", "Sustainability Education",
    "Environmental Activism", "Youth Leadership", "Community Organizing", "Advocacy", "Social Movements",
    "Cultural Studies", "Media Studies", "Political Science", "International Relations",
    "Conflict Resolution", "Peace Studies", "Global Governance", "Humanitarian Assistance",
    "Development Studies", "Gender Studies", "Indigenous Studies", "Migration Studies",
    "Postcolonial Studies", "Critical Theory", "Sociology", "Psychology", "Economics",
    "Behavioral Economics", "Cognitive Science", "Anthropology", "History", "Philosophy",
    "Comparative Literature", "Linguistics", "Education", "Curriculum Development", "Pedagogy",
    "Educational Technology", "Distance Education", "Learning Analytics", "Higher Education",
    "Adult Education", "Special Education", "Early Childhood Education", "Multicultural Education",
    "STEM Education", "Arts Education", "Music Education", "Physical Education", "Language Acquisition",
    "Second Language Learning", "Language Policy", "Literacy Studies", "Educational Assessment",
    "Teacher Training", "Educational Leadership", "School Reform", "Community Colleges",
    "Vocational Education", "Workforce Development", "Career Development", "Lifelong Learning",
    "Educational Equity", "Inclusive Education", "Diversity in Education", "Environmental Education",
    "Sustainability in Education", "Civic Engagement in Education", "Education Policy",
    "Educational Research", "Qualitative Research", "Quantitative Research", "Mixed Methods Research",
    "Action Research", "Case Studies", "Program Evaluation", "Research Ethics", "Data Analysis",
    "Statistical Analysis", "Experimental Design", "Survey Research", "Field Research", "Ethnography",
    "Grounded Theory", "Content Analysis", "Discourse Analysis", "Narrative Analysis", "Thematic Analysis",
    "Mixed Methods Design", "Research Methodology", "Research Design", "Research Funding",
    "Research Collaboration", "Research Communication", "Knowledge Mobilization", "Public Engagement in Research",
    "Open Science", "Citizen Science", "Research Impact", "Social Media and Research", "Research Metrics",
    "Bibliometrics", "Altmetrics", "Academic Publishing", "Peer Review", "Open Access Publishing",
    "Research Journals", "Conference Proceedings", "Thesis and Dissertation", "Academic Writing",
    "Scientific Writing", "Grant Writing", "Literature Review", "Research Proposal", "Research Presentation",
    "Research Networks", "Academic Conferences", "Research Associations", "Professional Organizations",
    "Research Communities", "Research Trends", "Emerging Research Areas", "Future Research Directions",
    "Research Careers", "Academic Job Market", "Research Ethics Committees", "Research Integrity",
    "Responsible Conduct of Research", "Research Governance", "Intellectual Property in Research",
    "Research Collaboration Tools", "Virtual Research Environments", "Research Data Management",
    "Data Sharing", "Research Dissemination", "Research Funding Agencies", "Research Policy",
    "Research Priorities", "Research Infrastructure", "Interdisciplinary Research", "Collaborative Research",
    "Transdisciplinary Research", "Participatory Research", "Community-Based Research", "Knowledge Translation",
    "Research Partnerships", "International Research", "Cross-Cultural Research", "Comparative Research",
    "Policy Research", "Evidence-Based Policy", "Policy Analysis", "Policy Evaluation", "Policy Advocacy",
    "Health Policy", "Education Policy", "Environmental Policy", "Social Policy", "Economic Policy",
    "International Development", "Global Health", "Public Policy", "Local Governance", "Public Administration",
    "Public Finance", "Public Services", "Service Delivery", "Public Sector Reform", "Good Governance",


        # Science (Physics)
    "Quantum Mechanics", "Electromagnetism", "Thermodynamics", "Nuclear Physics", "Optics",
    "Solid State Physics", "Plasma Physics", "Astrophysics", "Gravitational Waves", "Cosmology",
    "Particle Physics", "Biophysics", "Molecular Physics", "Fluid Dynamics", "Chaos Theory",
    "Quantum Computing", "Relativity Theory", "Black Holes", "Space-time Continuum", "High Energy Physics",

    # Science (Chemistry)
    "Organic Chemistry", "Inorganic Chemistry", "Biochemistry", "Physical Chemistry", "Analytical Chemistry",
    "Chemical Engineering", "Nanotechnology", "Electrochemistry", "Polymer Chemistry", "Medicinal Chemistry",
    "Quantum Chemistry", "Green Chemistry", "Chemical Synthesis", "Photochemistry", "Surface Chemistry",
    "Reaction Kinetics", "Molecular Dynamics", "Computational Chemistry", "Spectroscopy", "Thermochemistry",

    # Science (Biology)
    "Microbiology", "Molecular Biology", "Genetics", "Evolutionary Biology", "Zoology",
    "Botany", "Biotechnology", "Cell Biology", "Immunology", "Neuroscience",
    "Ecotoxicology", "Virology", "Parasitology", "Microbial Ecology", "Human Genetics",
    "Cancer Biology", "Plant Physiology", "Evolutionary Ecology", "Animal Behavior", "Marine Ecology",

    # Science (Astronomy)
    "Exoplanets", "Space Exploration", "Rocket Science", "Solar System", "Dark Matter",
    "Stellar Evolution", "Galaxies", "Astronomical Instrumentation", "Cosmic Microwave Background", "Radio Astronomy",
    "Spacecraft Engineering", "Planetary Science", "Astrobiology", "Astrochemistry", "Gravitational Lensing",
    "Lunar Missions", "Telescope Technology", "Space Radiation", "Astrodynamics", "Meteorology in Space",

    # Engineering
    "Civil Engineering", "Mechanical Engineering", "Electrical Engineering", "Robotics", "Structural Engineering",
    "Environmental Engineering", "Geotechnical Engineering", "Aerodynamics", "Mechatronics", "Thermal Engineering",
    "Hydraulics", "Power Electronics", "Machine Design", "Material Science", "Control Systems",
    "Manufacturing Systems", "Renewable Energy Systems", "Turbomachinery", "Combustion Engines", "Automation",
    "Fluid Mechanics", "Biomechanics", "Smart Grids", "Hybrid Electric Vehicles", "Nuclear Engineering",
    "Space Systems Engineering", "Embedded Systems", "Instrumentation", "Electromagnetic Compatibility", "Signal Processing",
    "Internet of Things in Engineering", "Smart Materials", "Energy Harvesting", "Sustainable Engineering", "Urban Infrastructure",

    # Communication
    "Communication Studies", "Media Psychology", "Public Relations", "Journalism", "Digital Communication",
    "Broadcast Media", "Political Communication", "Interpersonal Communication", "Corporate Communication", "Advertising",
    "Mass Media", "Persuasive Communication", "Nonverbal Communication", "Health Communication", "Communication Theory",
    "Crisis Communication", "Social Media Marketing", "Media Ethics", "Film Studies", "Television Studies",
    "Radio Broadcasting", "Communication Law", "Communication Research", "Strategic Communication", "Intercultural Communication",
    "Communication Technologies", "Media Literacy", "Online Journalism", "Audience Studies", "Mobile Communication",

    # Arts
    "Art History", "Music Theory", "Creative Writing", "Drama", "Theatre Studies",
    "Performing Arts", "Fine Arts", "Cinematography", "Sculpture", "Photography",
    "Graphic Design", "Fashion Design", "Textile Arts", "Film Production", "Contemporary Art",
    "Dance Studies", "Music Composition", "Orchestra", "Piano Performance", "Ethnomusicology",
    "Literary Studies", "Poetry", "Narrative Structures", "Film Directing", "Cultural Criticism",
    "Art Criticism", "Visual Culture", "Film Editing", "Art Therapy", "Music Education",
    "Sound Design", "Screenwriting", "Digital Art", "Classical Music", "Creative Process",

    # Agriculture
    "Precision Agriculture", "Soil Science", "Crop Science", "Plant Breeding", "Agronomy",
    "Agricultural Economics", "Agroforestry", "Irrigation Systems", "Aquaponics", "Food Technology",
    "Fertilizer Management", "Water Resource Management", "Agricultural Extension", "Pest Control", "Sustainable Agriculture",
    "Soil Fertility", "Plant Pathology", "Organic Farming", "Agricultural Biotechnology", "Agricultural Engineering",
    "Livestock Management", "Agroecology", "Hydroponics", "Farming Systems", "Post-Harvest Management",
    "Agricultural Machinery", "Seed Technology", "Agrometeorology", "Precision Irrigation", "Crop Modelling",
    "Agricultural Policy", "Sustainable Farming", "Farm Management", "Agricultural Supply Chain", "Climate-Resilient Crops",
    "Plant Genetics", "Agri-business", "Fisheries Management", "Horticulture", "Dairy Farming",
    "Animal Husbandry", "Food Processing", "Agricultural Innovation", "Rural Development", "Soil Health"
]



all_papers = [] #we will store all the papers informations for all the keywords
for keyword in keywords:
    print(f"Scraping papers for keyword: {keyword}")
    papers = scrape_google_scholar(keyword)
    all_papers.extend(papers)
    time.sleep(2)  # Sleep to avoid overwhelming the server




Scraping papers for keyword: Natural Language Processing
Scraped 3 papers for keyword: 'Natural Language Processing'
Scraping papers for keyword: Machine Learning
Scraped 3 papers for keyword: 'Machine Learning'
Scraping papers for keyword: Computer Vision
Scraped 3 papers for keyword: 'Computer Vision'
Scraping papers for keyword: Satellite Imagery
Scraped 3 papers for keyword: 'Satellite Imagery'
Scraping papers for keyword: Transformer
Scraped 3 papers for keyword: 'Transformer'
Scraping papers for keyword: Attention Mechanisms
Scraped 3 papers for keyword: 'Attention Mechanisms'
Scraping papers for keyword: Remote Sensing
Scraped 3 papers for keyword: 'Remote Sensing'
Scraping papers for keyword: Statistical Machine Learning
Scraped 3 papers for keyword: 'Statistical Machine Learning'
Scraping papers for keyword: Communications in Media
Scraped 3 papers for keyword: 'Communications in Media'
Scraping papers for keyword: Artificial Intelligence
Scraped 3 papers for keyword: 'Artific

In [50]:
#creating and saving a dataframe for the papers for furthur use
df = pd.DataFrame(all_papers)
# Ensure pandas doesn't truncate long strings in the console
pd.set_option('display.max_colwidth', None)
df.to_csv('papers.csv')

In [54]:
'''df2 = pd.read_csv('/content/papers_2.csv')

new_df = pd.concat([df,df2])
new_df
new_df.to_csv('all_papers.csv')
'''

In [62]:
#df = new_df
#load the all_papers.csv
#df = pd.read_csv('all_papers.csv')


In [63]:
#use a pretrianed model to create embeddings
#you can also yours

# Load pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for titles and abstracts and save as a column on df dataframe
df['title_embedding'] = df['title'].apply(lambda x: model.encode(x))
df['abstract_embedding'] = df['abstract'].apply(lambda x: model.encode(x))



In [64]:
# Create a structured list for indexing
#it will help us to retrive important information about papers during response
#Like for resutled papers we can easily find out the information like authors or link or title or abstract
# Ensure pandas doesn't truncate long strings in the console

# Create indexed paper information
indexed_papers = df[['title', 'author', 'abstract','year', 'url', 'title_embedding', 'abstract_embedding']].to_dict(orient='records')

#instead you also can call indexed_paper.txt as a list
'''
with open('indexed_paper.txt' , 'w') as file:
    for item in indexed_papers:
      file.write(f'{item}\n')
'''



In [65]:
#we will use FAISS to similarity search
#Convert embeddings to a NumPy array for FAISS indexing
title_embeddings = np.array(df['title_embedding'].tolist()).astype('float32')
abstract_embeddings = np.array(df['abstract_embedding'].tolist()).astype('float32')

# Initialize FAISS index for title embeddings
dimension = title_embeddings.shape[1]  # Dimension of embeddings
index = faiss.IndexFlatL2(dimension)   # L2 distance index for title embeddings
index.add(title_embeddings)
faiss.write_index(index, 'faiss_index.index')             # Add title embeddings to the index

# Function to retrieve similar papers based on title query
def retrieve_similar_papers(query, top_n=10):
    '''
    Inp:
      query - str, query from the prompt by user . could be a tile or texts
      top_n - int, number of similar papers we will return

    Out:

      result - list, distance and paper information form indexed_paper for top_n number of similar papers

    '''
    query_embedding = model.encode(query).astype('float32').reshape(1, -1)  # Reshape for FAISS
    distances, indices = index.search(query_embedding, top_n)  # Perform the search

    if len(indices) == 0 or indices[0][0] == -1:
        return "Sorry, could not find any relevant papers. I am still learning. Someday I will help you, InshaAllah."

    results = []
    for i in range(top_n):
        paper = indexed_papers[indices[0][i]]
        results.append((paper, distances[0][i]))

    if not results:
        return "Sorry, could not find any relevant papers. I am still learning. Someday I will help you, InshaAllah."

    return results


In [None]:
#uncomment and run to understand
'''
query = "Machine learning on classifications"
similar_papers = retrieve_similar_papers(query)

print("Top similar papers:")
for paper, score in similar_papers:
    print(f"Title: {paper['title']}, Authors: {paper['authors']}, Distance: {score:.4f}, Link: {paper['link']}")

'''

'\nquery = "Machine learning on classifications"\nsimilar_papers = retrieve_similar_papers(query)\n\nprint("Top similar papers:")\nfor paper, score in similar_papers:\n    print(f"Title: {paper[\'title\']}, Authors: {paper[\'authors\']}, Distance: {score:.4f}, Link: {paper[\'link\']}")\n\n'

In [77]:
def generate_response(similar_papers):
    '''
    A simple response generator function based on our retrieved information

    Inp:
      similar_papers:list, output of the retrieval function.
    Out:
      response:str, a response based on our retrieved information for the user
    '''
    response = "Here are some papers that are similar to your query:\n\n"

    for paper, distance in similar_papers:
        response += f"**Title:** {paper['title']}\n"
        response += f"**Authors:** {paper['author']}\n"
        response += f"**Year:** [View Paper]({paper['year']})\n"
        response += f"**Abstract:** {str(paper['abstract'])}\n"  # Full abstract will be displayed
        response += f"**Link:** [View Paper]({paper['url']})\n"
        response += f"**Similarity Score:** {distance:.4f}\n\n"

    return response


In [78]:
def rag_pipeline(query):
    '''
    The pipeline or main function regarding our RAG system

    Inp:
      query - str, the query from the prompt/user

    Out:
      response - str, response from generate_response functoin
    '''
    # Retrieve similar papers based on the query
    similar_papers = retrieve_similar_papers(query)

    # Generate a response based on the retrieved papers
    response = generate_response(similar_papers)

    return response


In [80]:
# Example query
query = "Statistical classification" #Enter your query and run
response = rag_pipeline(query)

# Print the generated response
print(response)


Here are some papers that are similar to your query:

**Title:** Introduction to statistical machine learning
**Authors:** ['M Sugiyama']
**Year:** [View Paper](2015)
**Abstract:** the field of machine learning in Part 1. Then Part 2 introduces fundamental concepts of  probability and statistics, which form the mathematical basis of statistical machine learning. Part 2
**Link:** [View Paper](https://books.google.com/books?hl=en&lr=&id=4YsdCAAAQBAJ&oi=fnd&pg=PP1&dq=Statistical+Machine+Learning&ots=Kkmte7Qavq&sig=TkmhRZoAGfbS7QJSX5tH9fPFdn8)
**Similarity Score:** 33.7685

**Title:** A recent overview of the state-of-the-art elements of text classification
**Authors:** ['MM Mirończuk', 'J Protasiewicz']
**Year:** [View Paper](2018)
**Abstract:** text classification including data collection, data analysis for labelling, feature construction  and weighing, feature selection and projection, training of a classification  of text classification.
**Link:** [View Paper](https://www.sciencedirec