In [1]:
from extraction import arxiv_abstract_extractor, arxiv_full_paper_extractor
from processing import text_embeddings, summarization_and_visualization
import pandas as pd


  from tqdm.autonotebook import tqdm, trange


In [2]:
# IPython help lookup: the trailing `?` prints the signature and docstring of
# fetch_arxiv_abstracts so the arguments used in the next cell are documented.
arxiv_abstract_extractor.fetch_arxiv_abstracts?

Signature:
arxiv_abstract_extractor.fetch_arxiv_abstracts(
    category,
    start_year,
    end_year,
    max_results,
)
Docstring:
Fetches metadata and abstracts of research papers from arXiv based on a specified category, date range, and result limit.

Args:
    category (str): The arXiv category to search (e.g., "cs.LG" for Machine Learning).
    start_year (int): The starting year of the date range for the search (e.g., 2020).
    end_year (int): The ending year of the date range for the search (e.g., 2023).
    max_results (int): The maximum number of results to retrieve.

Returns:
    list[dict]: A list of dictionaries, where each dictionary contains metadata for a paper:
        - 'id' (str): The uni

In [3]:
# Query parameters: a single arXiv category code, the year range to search,
# and the cap on the number of papers to retrieve.
categories = "cs.SI"
start_year, end_year = 2023, 2024
max_results = 1000

# Fetch paper metadata and abstracts from arXiv (returns one dict per paper).
data = arxiv_abstract_extractor.fetch_arxiv_abstracts(
    category=categories,
    start_year=start_year,
    end_year=end_year,
    max_results=max_results,
)

# Tabulate the records, report how many papers came back per category,
# and preview the first rows.
df = pd.DataFrame(data)
print(df["category"].value_counts())
df.head()


category
cs.SI    1000
Name: count, dtype: int64


Unnamed: 0,id,title,published_date,authors,category,abstract
0,http://arxiv.org/abs/2301.00312v2,Collision of Environmental Injustice and Sea L...,2023-01-01,"Zhewei Liu, Ali Mostafavi",cs.SI,Global sea-level rise causes increasing threat...
1,http://arxiv.org/abs/2301.00354v1,RiskProp: Account Risk Rating on Ethereum via ...,2023-01-01,"Dan Lin, Jiajing Wu, Qishuang Fu, Zibin Zheng,...",cs.SI,As one of the most popular blockchain platform...
2,http://arxiv.org/abs/2301.01256v1,M-Centrality: identifying key nodes based on g...,2023-01-03,"Ahmed Ibnoulouafi, Mohamed El Haziti, Hocine C...",cs.SI,Identifying influential nodes in a network is ...
3,http://arxiv.org/abs/2301.01404v2,Neighbor Contrastive Learning on Learnable Gra...,2023-01-04,"Xiao Shen, Dewang Sun, Shirui Pan, Xi Zhou, La...",cs.SI,"Recent years, graph contrastive learning (GCL)..."
4,http://arxiv.org/abs/2301.01478v2,Modeling communication asymmetry and content p...,2023-01-04,"Franco Galante, Luca Vassio, Michele Garetto, ...",cs.SI,The increasing popularity of online social net...


In [4]:
# IPython help lookup: show the signature and docstring of reduced_embeddings
# before it is called in the next cell.
text_embeddings.reduced_embeddings?

Signature: text_embeddings.reduced_embeddings(data_frame, data_type)
Docstring:
Generates reduced embeddings from a DataFrame based on the specified data type (either 'paper' or 'abstract').

Parameters:
    data_frame (DataFrame): Input DataFrame containing text data.
    data_type (str): Specifies the type of data ('paper' or 'abstract').

Returns:
    DataFrame: A DataFrame with original data and reduced embeddings.
File:      ~/Desktop/Repo/ResearchExtractorAndSummarizer/processing/text_embeddings.py
Type:      function

In [5]:
# Embed each abstract and attach the reduced embedding coordinates to the
# original rows (they appear as the Umap_1..Umap_3 columns in the preview).
df_with_embeddings = text_embeddings.reduced_embeddings(
    data_frame=df,
    data_type="abstract",
)
df_with_embeddings.head()

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Unnamed: 0,id,title,published_date,authors,category,abstract,Umap_1,Umap_2,Umap_3
0,http://arxiv.org/abs/2301.00312v2,Collision of Environmental Injustice and Sea L...,2023-01-01,"Zhewei Liu, Ali Mostafavi",cs.SI,Global sea-level rise causes increasing threat...,4.377036,6.066005,11.175738
1,http://arxiv.org/abs/2301.00354v1,RiskProp: Account Risk Rating on Ethereum via ...,2023-01-01,"Dan Lin, Jiajing Wu, Qishuang Fu, Zibin Zheng,...",cs.SI,As one of the most popular blockchain platform...,2.159528,6.080323,11.321388
2,http://arxiv.org/abs/2301.01256v1,M-Centrality: identifying key nodes based on g...,2023-01-03,"Ahmed Ibnoulouafi, Mohamed El Haziti, Hocine C...",cs.SI,Identifying influential nodes in a network is ...,5.995402,5.459976,8.999399
3,http://arxiv.org/abs/2301.01404v2,Neighbor Contrastive Learning on Learnable Gra...,2023-01-04,"Xiao Shen, Dewang Sun, Shirui Pan, Xi Zhou, La...",cs.SI,"Recent years, graph contrastive learning (GCL)...",5.551396,6.942538,6.774242
4,http://arxiv.org/abs/2301.01478v2,Modeling communication asymmetry and content p...,2023-01-04,"Franco Galante, Luca Vassio, Michele Garetto, ...",cs.SI,The increasing popularity of online social net...,4.225393,4.132129,11.939844


In [6]:
# IPython help lookup: show the signature and docstring of
# clusters_with_summary before it is called in the next cell.
summarization_and_visualization.clusters_with_summary?

Signature:
summarization_and_visualization.clusters_with_summary(
    df_with_embeddings,
    data_type='abstract',
)
Docstring:
Processes a DataFrame to create topic clusters for research papers, calculate centroids, and generate summaries and titles for each cluster.

Args:
    df_with_embeddings (pd.DataFrame): A DataFrame containing UMAP embeddings and content ('pdf_content' or 'abstract').
    data_type (str): Specifies whether the data includes 'paper' (requires 'pdf_content') or 'abstract' (requires 'abstract').
    
Returns:
    pd.DataFrame: A DataFrame of centroids with cluster information, including summaries and titles.
File:      ~/Desktop/Repo/ResearchExtractorAndSummarizer/processing/summarization_and_visualization.py
Type:      f

In [6]:
# Group the embedded abstracts into topic clusters and summarise each one.
# The result is unpacked into two objects: the per-paper frame (previewed
# later with a Cluster column) and the per-cluster centroid frame.
# NOTE(review): the help text only documents the centroid frame as the return
# value, yet two values are unpacked here — confirm against the module source.
df_with_embeddings_clusters, centroids = (
    summarization_and_visualization.clusters_with_summary(
        df_with_embeddings,
        data_type="abstract",
    )
)

Processing clusters:   0%|          | 0/40 [00:00<?, ?it/s]

In [7]:
# Preview the first rows of the per-cluster table: centroid coordinates,
# paper count, and the generated summary and title for each cluster.
centroids.head()

Unnamed: 0,Cluster,Umap_1,Umap_2,Umap_3,Count,Summary,Title
0,0,5.423016,5.361913,10.296049,29,This research cluster focuses on modeling and ...,Modeling and Controlling Contagion Spread in N...
1,1,5.065401,6.490625,9.762818,11,This research cluster focuses on optimizing wi...,Optimizing Wireless Communication for Next-Gen...
2,2,4.32164,6.215507,10.707474,35,This research cluster focuses on understanding...,Analyzing Socio-Spatial Segregation and Mobili...
3,3,6.241837,4.343227,10.189025,36,This research cluster focuses on advancements ...,Influence Maximization in Social Networks: Alg...
4,4,2.205896,5.933599,11.397612,21,This research cluster focuses on detecting and...,Detection and Analysis of Fraudulent Activitie...


In [8]:
# Preview the per-paper table, which now carries a Cluster label next to the
# embedding coordinates.
df_with_embeddings_clusters.head()

Unnamed: 0,id,title,published_date,authors,category,abstract,Umap_1,Umap_2,Umap_3,Cluster
0,http://arxiv.org/abs/2301.00312v2,Collision of Environmental Injustice and Sea L...,2023-01-01,"Zhewei Liu, Ali Mostafavi",cs.SI,Global sea-level rise causes increasing threat...,4.377036,6.066005,11.175738,2
1,http://arxiv.org/abs/2301.00354v1,RiskProp: Account Risk Rating on Ethereum via ...,2023-01-01,"Dan Lin, Jiajing Wu, Qishuang Fu, Zibin Zheng,...",cs.SI,As one of the most popular blockchain platform...,2.159528,6.080323,11.321388,4
2,http://arxiv.org/abs/2301.01256v1,M-Centrality: identifying key nodes based on g...,2023-01-03,"Ahmed Ibnoulouafi, Mohamed El Haziti, Hocine C...",cs.SI,Identifying influential nodes in a network is ...,5.995402,5.459976,8.999399,15
3,http://arxiv.org/abs/2301.01404v2,Neighbor Contrastive Learning on Learnable Gra...,2023-01-04,"Xiao Shen, Dewang Sun, Shirui Pan, Xi Zhou, La...",cs.SI,"Recent years, graph contrastive learning (GCL)...",5.551396,6.942538,6.774242,22
10,http://arxiv.org/abs/2301.04720v1,An Architecture For Cooperative Mobile Health ...,2023-01-11,"Georgios Drakopoulos, Phivos Mylonas, Spyros S...",cs.SI,Mobile health applications are steadily gainin...,4.879796,6.481934,9.976331,1


In [12]:
# Development aid: re-execute the already-imported module so that edits made to
# summarization_and_visualization.py on disk take effect without restarting
# the kernel.
# NOTE(review): this import sits mid-notebook and the cell's execution count is
# out of order; it is not needed when the notebook is run top to bottom on a
# fresh kernel — consider removing it or moving the import to the first cell.
import importlib
importlib.reload(summarization_and_visualization)

<module 'processing.summarization_and_visualization' from '/Users/muhammadmuhdhar/Desktop/Repo/ResearchExtractorAndSummarizer/processing/summarization_and_visualization.py'>

In [9]:
# IPython help lookup: show the signature and docstring of visualize before it
# is called in the next cell.
summarization_and_visualization.visualize?

Signature: summarization_and_visualization.visualize(df_with_embeddings, centroids_df)
Docstring:
Generates a 3D scatter plot to visualize research documents and their topic clusters based on UMAP embeddings.
The function validates input data, calculates cluster sizes, and creates an interactive Plotly visualization
with options to toggle between document points, topic clusters, and combined views.

Args:
    df_with_embeddings (pd.DataFrame): A DataFrame containing UMAP embeddings for individual research documents
                                       with columns ['Umap_1', 'Umap_2', 'Umap_3'], and cluster labels.
    centroids_df (pd.DataFrame): A DataFrame containing the cluster centroids with additional metadata, 
                                 including the required columns ['Umap_1', 'Umap_2', 'Umap_3', 'Count', 'Title', 'Summary'].

Returns:
    None: D

In [13]:
# Render the interactive 3D scatter plot: the first argument is the per-paper
# frame with cluster labels, the second is the per-cluster centroid frame.
summarization_and_visualization.visualize(df_with_embeddings_clusters, centroids)