In [None]:
!pip install sentence-transformers
!pip install pandas
!pip install prettytable
import time
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from prettytable import PrettyTable

Collecting sentence-transformers
  Downloading sentence_transformers-3.2.0-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.0-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.2/255.2 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.2.0


  from tqdm.autonotebook import tqdm, trange


In [None]:

# Pretrained sentence-embedding model and the input file used by this notebook
MODEL_NAME = 'paraphrase-MPNet-base-v2'
PROJECTS_CSV = 'projects.csv'

# Instantiate the SentenceTransformer model by name
model = SentenceTransformer(MODEL_NAME)

# Read the projects table and add the text field that will be embedded.
# Only the abstract is used here; to embed title and abstract together, build
# the column as frame['title'] + ' ' + frame['abstract'] instead.
df = pd.read_csv(PROJECTS_CSV).assign(
    combined_text=lambda frame: frame['abstract']
)



In [None]:

# Generate embeddings for every project and store them in the DataFrame.
# The whole column is passed to model.encode in a single call so the encoder
# can batch the texts internally, instead of being invoked once per row
# through Series.apply (which pays the per-call overhead for every abstract).
start_time = time.time()
project_embeddings = model.encode(df['combined_text'].tolist())
# Store one plain Python list per row (same layout as before) and align it on
# df's own index so the assignment is correct even for a non-default index.
df['embeddings'] = pd.Series(project_embeddings.tolist(), index=df.index)
end_time = time.time()
print(f"Time taken to generate embeddings for projects: {end_time - start_time:.2f} seconds")

Time taken to generate embeddings for projects: 40.69 seconds


## Enter the details about the New Project

In [None]:
# Enter the details about the input proposal
# new_abstract is the free-text abstract of the proposal being checked.

new_abstract = """This project investigates the influence of social media on the mental well-being of adolescents, focusing on the relationship between screen time, online interactions, and feelings of loneliness and anxiety. A mixed-methods approach was employed, combining quantitative surveys and qualitative interviews with participants aged 13 to 18. The survey assessed daily screen time, types of social media usage, and self-reported mental health indicators. Interviews provided deeper insights into personal experiences and perceptions regarding social media's impact. Results revealed a correlation between excessive screen time and increased feelings of anxiety and loneliness, particularly among users who reported negative interactions online. Conversely, participants who engaged in positive online communities reported higher levels of social support and lower anxiety. This study underscores the complex role of social media in adolescent mental health, highlighting the need for balanced usage and promoting healthy online interactions. The findings contribute to ongoing discussions about digital well-being and provide valuable insights for parents, educators, and mental health professionals."""
# The text to embed is the abstract alone; no title is prepended here.
new_combined_text = new_abstract

# Alternative example input, kept for reference but disabled.
# It is written as real comments rather than a bare ''' string literal: a
# string literal left as the last expression of a cell is echoed by the
# notebook as that cell's output.
#
# Enter the details about the input proposal
# new_title = "Calculation of prompt diphoton production cross sections at Tevatron and LHC energies"
# new_abstract = """  A fully differential calculation in perturbative quantum chromodynamics is
# presented for the production of massive photon pairs at hadron colliders. All
# next-to-leading order perturbative contributions from quark-antiquark,
# gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
# all-orders resummation of initial-state gluon radiation valid at
# next-to-next-to-leading logarithmic accuracy. The region of phase space is
# specified in which the calculation is most reliable. Good agreement is
# demonstrated with data from the Fermilab Tevatron, and predictions are made for
# more detailed tests with CDF and DO data. Predictions are shown for
# distributions of diphoton pairs produced at the energy of the Large Hadron
# Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs
# boson are contrasted with those produced from QCD processes at the LHC, showing
# that enhanced sensitivity to the signal can be obtained with judicious
# selection of events."""
# new_combined_text = new_title + ' ' + new_abstract



' \nEnter the details about the input proposal\nnew_title = "Calculation of prompt diphoton production cross sections at Tevatron and LHC energies"\nnew_abstract = """  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are c

In [None]:
# Embed the new proposal text and report how long the encoding took
start_time = time.time()
new_embedding = model.encode(new_combined_text)
end_time = time.time()
encode_seconds = end_time - start_time
print(f"Time taken to generate embedding for the new proposal: {encode_seconds:.2f} seconds")

# Turn the per-project embedding lists into one numpy array (one row per project)
existing_embeddings = np.array(list(df['embeddings']))

# Compare the new proposal against every stored project via cosine similarity
start_time = time.time()
similarity_scores = cosine_similarity([new_embedding], existing_embeddings)
end_time = time.time()
similarity_seconds = end_time - start_time
print(f"Time taken to calculate similarity scores: {similarity_seconds:.2f} seconds")

Time taken to generate embedding for the new proposal: 0.03 seconds
Time taken to calculate similarity scores: 0.01 seconds


In [None]:


# Attach each project's similarity to the new proposal.
# similarity_scores has a single row (one query embedding), so row 0 holds
# one score per project.
df['similarity_score'] = similarity_scores[0]

# Keep the 5 most similar projects, highest score first
top_projects = (
    df[['project_id', 'title', 'category', 'abstract', 'similarity_score']]
    .sort_values(by='similarity_score', ascending=False)
    .head(5)
)

# Build the report table (PrettyTable is imported in the notebook's first cell)
table = PrettyTable()
table.field_names = ["Rank", "Similarity Score", "Title", "Category", "Abstract"]

# Row values must follow the same order as field_names; category comes before
# abstract so that each value lands under its own column header.
for rank, row in enumerate(top_projects.itertuples(), start=1):
    table.add_row([
        rank,
        f"{row.similarity_score:.4f}",  # Format similarity score to 4 decimal places
        row.title,
        row.category,
        row.abstract,
    ])

# Print the table
print(table)

+------+------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------+--------------------------------+
| Rank | Similarity Score |                             Title                             |                                     Category                                    |            Abstract            |
+------+------------------+---------------------------------------------------------------+---------------------------------------------------------------------------------+--------------------------------+
|  1   |      0.4583      |                Novelty and Collective Attention               |     The subject of collective attention is central to an information age where  |   cs.CY cs.IR physics.soc-ph   |
|      |                  |                                                               | millions of people are inundated with daily messages. It is thus of interest to 