In [1]:
from extraction import arxiv_abstract_extractor, arxiv_full_paper_extractor
from processing import text_embeddings, summarization_and_visualization
import pandas as pd


  from tqdm.autonotebook import tqdm, trange


In [2]:
# IPython `?` help: show the signature and docstring of the abstract fetcher.
# NOTE(review): exploratory introspection — consider removing before sharing the notebook.
arxiv_abstract_extractor.fetch_arxiv_abstracts?

[0;31mSignature:[0m
[0marxiv_abstract_extractor[0m[0;34m.[0m[0mfetch_arxiv_abstracts[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mcategory[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mstart_year[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mend_year[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_results[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Fetches metadata and abstracts of research papers from arXiv based on a specified category, date range, and result limit.

Args:
    category (str): The arXiv category to search (e.g., "cs.LG" for Machine Learning).
    start_year (int): The starting year of the date range for the search (e.g., 2020).
    end_year (int): The ending year of the date range for the search (e.g., 2023).
    max_results (int): The maximum number of results to retrieve.

Returns:
    list[dict]: A list of dictionaries, where each dictionary contains metadata for a paper:
        - 'id' (str): The uni

In [3]:
# Query parameters for the arXiv fetch.
categories = 'cs.SI'  # Social and Information Networks
start_year, end_year = 2023, 2024
max_results = 1000

# Pull the abstract metadata for the chosen category and year range.
data = arxiv_abstract_extractor.fetch_arxiv_abstracts(
    category=categories,
    start_year=start_year,
    end_year=end_year,
    max_results=max_results,
)

# One row per returned paper; print the per-category counts, then preview the first rows.
df = pd.DataFrame(data)
category_counts = df['category'].value_counts()
print(category_counts)
df.head()


category
cs.SI    1000
Name: count, dtype: int64


Unnamed: 0,id,title,published_date,authors,category,abstract
0,http://arxiv.org/abs/2301.00312v2,Collision of Environmental Injustice and Sea L...,2023-01-01,"Zhewei Liu, Ali Mostafavi",cs.SI,Global sea-level rise causes increasing threat...
1,http://arxiv.org/abs/2301.00354v1,RiskProp: Account Risk Rating on Ethereum via ...,2023-01-01,"Dan Lin, Jiajing Wu, Qishuang Fu, Zibin Zheng,...",cs.SI,As one of the most popular blockchain platform...
2,http://arxiv.org/abs/2301.01256v1,M-Centrality: identifying key nodes based on g...,2023-01-03,"Ahmed Ibnoulouafi, Mohamed El Haziti, Hocine C...",cs.SI,Identifying influential nodes in a network is ...
3,http://arxiv.org/abs/2301.01404v2,Neighbor Contrastive Learning on Learnable Gra...,2023-01-04,"Xiao Shen, Dewang Sun, Shirui Pan, Xi Zhou, La...",cs.SI,"Recent years, graph contrastive learning (GCL)..."
4,http://arxiv.org/abs/2301.01478v2,Modeling communication asymmetry and content p...,2023-01-04,"Franco Galante, Luca Vassio, Michele Garetto, ...",cs.SI,The increasing popularity of online social net...


In [4]:
# IPython `?` help: show the signature and docstring of the embedding/reduction helper.
# NOTE(review): exploratory introspection — consider removing before sharing the notebook.
text_embeddings.reduced_embeddings?

[0;31mSignature:[0m [0mtext_embeddings[0m[0;34m.[0m[0mreduced_embeddings[0m[0;34m([0m[0mdata_frame[0m[0;34m,[0m [0mdata_type[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Generates reduced embeddings from a DataFrame based on the specified data type (either 'paper' or 'abstract').

Parameters:
    data_frame (DataFrame): Input DataFrame containing text data.
    data_type (str): Specifies the type of data ('paper' or 'abstract').

Returns:
    DataFrame: A DataFrame with original data and reduced embeddings.
[0;31mFile:[0m      ~/Desktop/Repo/ResearchExtractorAndSummarizer/processing/text_embeddings.py
[0;31mType:[0m      function

In [5]:
import importlib

# Re-execute the already-imported text_embeddings module in place; presumably done
# to pick up on-disk edits without restarting the kernel — confirm it is still needed.
# NOTE(review): this import lives outside the notebook's top imports cell.
importlib.reload(text_embeddings)

<module 'processing.text_embeddings' from '/Users/muhammadmuhdhar/Desktop/Repo/ResearchExtractorAndSummarizer/processing/text_embeddings.py'>

In [6]:
# Embed the abstracts and attach the reduced coordinates to the frame
# (the displayed result gains Umap_1 / Umap_2 columns).
df_with_embeddings = text_embeddings.reduced_embeddings(
    df,
    data_type='abstract',
)
df_with_embeddings.head()

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Unnamed: 0,id,title,published_date,authors,category,abstract,Umap_1,Umap_2
0,http://arxiv.org/abs/2301.00312v2,Collision of Environmental Injustice and Sea L...,2023-01-01,"Zhewei Liu, Ali Mostafavi",cs.SI,Global sea-level rise causes increasing threat...,4.754412,8.585053
1,http://arxiv.org/abs/2301.00354v1,RiskProp: Account Risk Rating on Ethereum via ...,2023-01-01,"Dan Lin, Jiajing Wu, Qishuang Fu, Zibin Zheng,...",cs.SI,As one of the most popular blockchain platform...,3.979782,11.852528
2,http://arxiv.org/abs/2301.01256v1,M-Centrality: identifying key nodes based on g...,2023-01-03,"Ahmed Ibnoulouafi, Mohamed El Haziti, Hocine C...",cs.SI,Identifying influential nodes in a network is ...,7.305691,10.024186
3,http://arxiv.org/abs/2301.01404v2,Neighbor Contrastive Learning on Learnable Gra...,2023-01-04,"Xiao Shen, Dewang Sun, Shirui Pan, Xi Zhou, La...",cs.SI,"Recent years, graph contrastive learning (GCL)...",10.28283,9.404253
4,http://arxiv.org/abs/2301.01478v2,Modeling communication asymmetry and content p...,2023-01-04,"Franco Galante, Luca Vassio, Michele Garetto, ...",cs.SI,The increasing popularity of online social net...,3.621909,10.558822


In [8]:
# IPython `?` help: show the signature and docstring of the clustering/summary helper.
# NOTE(review): exploratory introspection — consider removing before sharing the notebook.
summarization_and_visualization.clusters_with_summary?

[0;31mSignature:[0m
[0msummarization_and_visualization[0m[0;34m.[0m[0mclusters_with_summary[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf_with_embeddings[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdata_type[0m[0;34m=[0m[0;34m'abstract'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Processes a DataFrame to create topic clusters for research papers, calculate centroids, and generate summaries and titles for each cluster.

Args:
    df_with_embeddings (pd.DataFrame): A DataFrame containing UMAP embeddings and content ('pdf_content' or 'abstract').
    data_type (str): Specifies whether the data includes 'paper' (requires 'pdf_content') or 'abstract' (requires 'abstract').
    
Returns:
    pd.DataFrame: A DataFrame of centroids with cluster information, including summaries and titles.
[0;31mFile:[0m      ~/Desktop/Repo/ResearchExtractorAndSummarizer/processing/summarization_and_visualization.py
[0;31mType:[0m      f

In [11]:
# Cluster the embedded papers and summarise each cluster. data_type is left at its
# default ('abstract'). The call unpacks two results — the per-paper frame and the
# centroids frame — although the helper's docstring only mentions the centroids.
df_with_embeddings_clusters, centroids = (
    summarization_and_visualization.clusters_with_summary(
        df_with_embeddings=df_with_embeddings,
    )
)

Processing clusters:   0%|          | 0/48 [00:00<?, ?it/s]

In [13]:
# Preview the first rows of the cluster centroids frame returned by clusters_with_summary.
centroids.head()

Unnamed: 0,Cluster,Umap_1,Umap_2,Count,Summary,Title
0,0,0.480939,12.169841,24,This research cluster focuses on the detection...,Detection and Characterization of Social Media...
1,1,5.157413,9.563317,6,This research cluster focuses on modeling and ...,Modeling and Simulation of Epidemic Spread: In...
2,2,5.755319,9.234463,11,This research cluster focuses on optimizing wi...,Optimizing Wireless Communication for the Meta...
3,3,6.153074,9.959497,13,This research cluster focuses on modeling and ...,Modeling and Controlling Information and Epide...
4,4,4.891826,8.361863,44,This research cluster focuses on understanding...,Analyzing Socio-Spatial Dynamics and Inequalit...


In [12]:
# Preview the first rows of the per-paper frame returned by clusters_with_summary.
df_with_embeddings_clusters.head()

Unnamed: 0,id,title,published_date,authors,category,abstract,Umap_1,Umap_2,Cluster
0,http://arxiv.org/abs/2301.00312v2,Collision of Environmental Injustice and Sea L...,2023-01-01,"Zhewei Liu, Ali Mostafavi",cs.SI,Global sea-level rise causes increasing threat...,4.754412,8.585053,4
1,http://arxiv.org/abs/2301.00354v1,RiskProp: Account Risk Rating on Ethereum via ...,2023-01-01,"Dan Lin, Jiajing Wu, Qishuang Fu, Zibin Zheng,...",cs.SI,As one of the most popular blockchain platform...,3.979782,11.852528,11
3,http://arxiv.org/abs/2301.01404v2,Neighbor Contrastive Learning on Learnable Gra...,2023-01-04,"Xiao Shen, Dewang Sun, Shirui Pan, Xi Zhou, La...",cs.SI,"Recent years, graph contrastive learning (GCL)...",10.28283,9.404253,10
6,http://arxiv.org/abs/2301.02737v2,Understanding the (In)Effectiveness of Content...,2023-01-06,"Ian Goldstein, Laura Edelson, Minh-Kha Nguyen,...",cs.SI,Social media networks commonly employ content ...,1.455973,10.876102,45
7,http://arxiv.org/abs/2301.02885v2,SCOREH+: A High-Order Node Proximity Spectral ...,2023-01-07,"Yanhui Zhu, Fang Hu, Lei Hsin Kuo, Jia liu",cs.SI,The research on complex networks has achieved ...,8.801978,7.903253,14


In [71]:
# IPython `?` help: show the signature and docstring of the plotting helper.
# NOTE(review): exploratory introspection — consider removing before sharing the notebook.
summarization_and_visualization.visualize?

[0;31mSignature:[0m [0msummarization_and_visualization[0m[0;34m.[0m[0mvisualize[0m[0;34m([0m[0mdf_with_embeddings[0m[0;34m,[0m [0mcentroids_df[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Generates a 3D scatter plot to visualize research documents and their topic clusters based on UMAP embeddings.
The function validates input data, calculates cluster sizes, and creates an interactive Plotly visualization
with options to toggle between document points, topic clusters, and combined views.

Args:
    df_with_embeddings (pd.DataFrame): A DataFrame containing UMAP embeddings for individual research documents
                                       with columns ['Umap_1', 'Umap_2'], and cluster labels.
    centroids_df (pd.DataFrame): A DataFrame containing the cluster centroids with additional metadata, 
                                 including the required columns ['Umap_1', 'Umap_2', 'Count', 'Title', 'Summary'].

Returns:
    None: Displays an interacti

In [69]:
# Plot the clustered papers together with their cluster centroids.
summarization_and_visualization.visualize(
    df_with_embeddings=df_with_embeddings_clusters,
    centroids_df=centroids,
)