In [1]:
# from umap import UMAP
# from hdbscan import HDBSCAN
# from sentence_transformers import SentenceTransformer
# from sklearn.feature_extraction.text import CountVectorizer

# from bertopic import BERTopic
# from bertopic.representation import KeyBERTInspired
# from bertopic.vectorizers import ClassTfidfTransformer


# # Step 1 - Extract embeddings
# embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# # Step 2 - Reduce dimensionality
# umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# # Step 3 - Cluster reduced embeddings
# hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# # Step 4 - Tokenize topics
# vectorizer_model = CountVectorizer(stop_words="english")

# # Step 5 - Create topic representation
# ctfidf_model = ClassTfidfTransformer()

# # Step 6 - (Optional) Fine-tune topic representations with 
# # a `bertopic.representation` model
# representation_model = KeyBERTInspired()


In [11]:
import pandas as pd
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

# Step 1 - Read Data
data = pd.read_csv('./semantic_scholar_data3.csv')

# Drop rows with NaN or empty abstracts
data = data.dropna(subset=['abstract'])  # Assuming 'abstract' is the column name
data = data.drop(['paperId', 'url'], axis=1)

# Step 2 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 3 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 4 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 5 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 6 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 7 - (Optional) Fine-tune topic representations with 
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# Assuming you have a column named 'abstract' in your DataFrame 'data' containing the text data
documents = data['abstract'].tolist()

# Step 8 - Extract Sentence Embeddings
document_embeddings = embedding_model.encode(documents)

# Step 9 - Reduce Dimensionality
umap_embeddings = umap_model.fit_transform(document_embeddings)

In [12]:
# Step 10 - Cluster the Reduced Embeddings
hdbscan_labels = hdbscan_model.fit_predict(umap_embeddings)

In [13]:
# Step 11 - Tokenize Topics
vectorized_data = vectorizer_model.fit_transform(data['abstract'])

In [40]:
vectorized_data

<36355x2691 sparse matrix of type '<class 'numpy.int64'>'
	with 2204543 stored elements in Compressed Sparse Row format>

In [None]:

# # Convert vectorized_data to a Pandas DataFrame
# vectorized_df = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer_model.get_feature_names_out())

# # Save the DataFrame to a CSV file
# vectorized_df.to_csv('vectorized_data.csv', index=False)

# # Read the saved CSV file to check the columns
# saved_df = pd.read_csv('vectorized_data.csv')

# # Display the columns of the saved DataFrame
# print("Columns of the saved DataFrame:")
# print(saved_df.columns)
# vectorized_df

In [17]:
print("Shape of vectorized_data:", vectorized_data.shape)
print("Shape of hdbscan_labels:", hdbscan_labels.shape)
# Print the column names of your DataFrame 'data'
print(data.columns)



Shape of vectorized_data: (36355, 2691)
Shape of hdbscan_labels: (36355,)
Index(['title', 'abstract', 'year', 'citationCount'], dtype='object')


In [None]:
# # Assuming 'data' is your DataFrame
# column_names = data.columns
# print(column_names)


In [None]:
# hdbscan_labels = hdbscan_labels.astype(float)


In [None]:
#hdbscan_labels = hdbscan_labels.reshape(36355, 1)  # Reshape to (36355, 1)


In [16]:
# Assuming you have a 'hdbscan_labels' column in your DataFrame 'data' containing the cluster labels

# Create topic representation using the ClassTfidfTransformer
ctfidf_data = ctfidf_model.fit_transform(vectorized_data,hdbscan_labels)

ValueError: operands could not be broadcast together with shapes (2691,) (36355,) 

In [None]:
# Step 13 - (Optional) Fine-tune Topic Representations
if representation_model is not None:
    representation_model.fit(ctfidf_data)



In [18]:
# Step 14 - Initialize and Fit BERTopic
topic_model = BERTopic(embedding_model=embedding_model, n_gram_range=(1, 1), top_n_words=10)
topics, _ = topic_model.fit_transform(data['abstract'])



In [19]:
# Step 15 - Get Topics and Assign Topics to Documents
topic_model.get_topics()



{-1: [('power', 0.023917734685240888),
  ('formation', 0.020483617970975136),
  ('star', 0.01723358714099716),
  ('laws', 0.015161058793417862),
  ('data', 0.015080887067841735),
  ('galaxies', 0.013165492885298882),
  ('gas', 0.012693038536754974),
  ('sfr', 0.012290170782585083),
  ('the', 0.012199856212170911),
  ('global', 0.011709690500151656)],
 0: [('tyler', 0.10471346340816305),
  ('fear', 0.07743566179331059),
  ('legitimate', 0.07743566179331059),
  ('believe', 0.07743566179331059),
  ('his', 0.07203556244790628),
  ('punishment', 0.06980897560544204),
  ('obey', 0.06980897560544204),
  ('people', 0.05698247213786389),
  ('because', 0.05698247213786389),
  ('they', 0.052840060635467506)],
 1: [('children', 0.17150066171645623),
  ('care', 0.09469739299520831),
  ('spouses', 0.09469739299520831),
  ('levels', 0.08182859937208864),
  ('report', 0.07729796220440667),
  ('fewer', 0.07002877240438522),
  ('recipient', 0.07002877240438522),
  ('caregiving', 0.07002877240438522),
  

In [20]:
# Step 16 - Assign Topics to Documents
data['topic'] = topics



In [21]:
# You can explore the resulting DataFrame 'data' to see the assigned topics to each document
print(data.head())

                                             title   
0                          Why People Obey the Law  \
1                                  Law and Finance   
2        Power-Law Distributions in Empirical Data   
3  Power laws, Pareto distributions and Zipf's law   
4  The Global Schmidt law in star forming galaxies   

                                            abstract  year  citationCount   
0  People obey the law if they believe it's legit...  2021           2415  \
1  This paper examines legal rules covering prote...  1996          17458   
2  Power-law distributions occur in many situatio...  2007           8614   
3  When the probability of measuring a particular...  2004           5283   
4  Measurements of Hα, H I, and CO distributions ...  1997           3777   

   topic  
0      0  
1     64  
2     67  
3     69  
4     75  


In [52]:
# Save the DataFrame to a CSV file
data.to_csv('.csv', index=False)


In [53]:
data.head()

Unnamed: 0,title,abstract,year,citationCount,topic
0,Why People Obey the Law,People obey the law if they believe it's legit...,2021,2415,0
1,Law and Finance,This paper examines legal rules covering prote...,1996,17458,64
2,Power-Law Distributions in Empirical Data,Power-law distributions occur in many situatio...,2007,8614,67
3,"Power laws, Pareto distributions and Zipf's law",When the probability of measuring a particular...,2004,5283,69
4,The Global Schmidt law in star forming galaxies,"Measurements of Hα, H I, and CO distributions ...",1997,3777,75


In [55]:
df11=pd.read_csv('./Playground_SB/semantic_scholar_data_with_clusters100.csv')

FileNotFoundError: [Errno 2] No such file or directory: './Playground_SB/semantic_scholar_data_with_clusters100.csv'

In [None]:
df11

In [45]:
from sklearn.decomposition import PCA

# Assuming 'data' is your DataFrame with 5 columns
# Extract the numeric columns that you want to reduce to 3 dimensions
numeric_data = data[['year', 'citationCount', 'topic']]

# Initialize PCA with 3 components
pca = PCA(n_components=3)

# Fit and transform the data to reduce dimensions to 3
data_3d = pca.fit_transform(numeric_data)

# Now 'data_3d' contains your data with 3 dimensions
print(data_3d.shape)  # This should be (36355, 3)


(36355, 3)


In [46]:
import numpy as np

# Assuming 'data_3d' is your reduced-dimensional data with 3 dimensions
# Save it to a TSV file
np.savetxt('reduced_embeddings.tsv', data_3d, delimiter='\t', fmt='%.6f')




In [33]:
import numpy as np

# Generate random data as an example
n_samples = 3000  # Replace with your desired number of samples
n_dimensions =3  # Replace with your desired number of dimensions
embeddings = np.random.rand(n_samples, n_dimensions)

# Save the 'embeddings' array to a TSV file
np.savetxt('embeddings.tsv', embeddings, delimiter='\t', fmt='%.6f')


In [None]:
import pandas as pd
import numpy as np

# Load your data from a CSV file
data = pd.read_csv('data_with_topics.csv')

# Shuffle the data randomly
data_shuffled = data.sample(frac=1, random_state=42)

# Select the first 1000 rows as a subset (you can adjust the number as needed)
n_samples = 10
subset_data = data_shuffled.iloc[:n_samples]

# Select the first 3 dimensions (you can adjust the number as needed)
n_dimensions = 3
subset_data = subset_data.iloc[:, :n_dimensions]

# Save the subset data to a TSV file
subset_data.to_csv('subset_embeddings.tsv', sep='\t', index=False, float_format='%.6f')


In [None]:
# import numpy as np
# embeddings = ''
# # Assuming 'embeddings' is a NumPy array with shape (n_samples, n_dimensions)
# np.savetxt('embeddings.tsv', embeddings, delimiter='\t', fmt='%.6f')


In [27]:
import pandas as pd

# Assuming 'data' is your DataFrame with titles, abstracts, topics, and other columns
metadata = data[['title', 'abstract', 'topic']]
metadata.to_csv('metadata.csv', index=False)
