In [1]:
!pip install -q bertopic pandas nltk scikit-learn matplotlib xlwt 

In [6]:
import pandas as pd
# Read the papers table and drop every row that has a missing value in any column
Paper = pd.read_csv('/home/jupyter/WSM/data/papers.csv').dropna()
Paper

Unnamed: 0,Date,Abstract
0,2015-03-10,Garlic is valued more for its flavoring and us...
1,2015-03-10,The present investigation was undertaken to st...
2,2015-01-01,Upland areas of southeastern U.S. tidal creek ...
3,2014-12-10,The specific activity of 90Sr in milk and vege...
4,2014-06-11,A survey was conducted to evaluate aflatoxin c...
...,...,...
12686,2020-09-29,The gluten-free food market is growing with th...
12687,2020-09-02,Hurricane Sandy made landfall in New Jersey on...
12688,2020-08-28,Prevalent risks in meat value-chains of sub-Sa...
12689,2020-06-03,The presence of metal contaminants in agricult...


In [7]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
from umap import UMAP
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean
import nltk
import re

In [11]:
# Function to group by a specific time unit (Year, Month, Day)
def group_by_time_unit(df, time_unit):
    """Derive a grouping key for every row from its 'Date' value.

    Parameters
    ----------
    df : DataFrame with a datetime-typed 'Date' column (accessed through `.dt`).
    time_unit : 'year', 'month' or 'day'.

    Returns
    -------
    Series aligned with `df`: the calendar year for 'year', a monthly period
    (YYYY-MM) for 'month', or the calendar date (YYYY-MM-DD) for 'day'.

    Raises
    ------
    ValueError
        If `time_unit` is not one of the three supported values.
    """
    # Reject unsupported units before touching the frame
    if time_unit not in ('year', 'month', 'day'):
        raise ValueError("Invalid time_unit. Choose 'year', 'month', or 'day'.")

    dates = df['Date'].dt
    if time_unit == 'year':
        return dates.year
    if time_unit == 'month':
        return dates.to_period('M')
    return dates.date

In [12]:
def time_difference(max_time, min_time, time_unit):
    """Return how many `time_unit`s separate `min_time` from `max_time`.

    Parameters
    ----------
    max_time, min_time : values matching `time_unit` -- plain numbers for
        'year', objects exposing `.year` and `.month` for 'month', and objects
        whose difference exposes `.days` for 'day'.
    time_unit : 'year', 'month' or 'day'.

    Returns
    -------
    `max_time - min_time` expressed in years, whole months or days.

    Raises
    ------
    ValueError
        If `time_unit` is not one of the three supported values.
    """
    if time_unit not in ('year', 'month', 'day'):
        raise ValueError("Invalid time unit for difference. Choose 'year', 'month', or 'day'.")

    if time_unit == 'year':
        # Years are plain integers, so a direct subtraction is enough
        return max_time - min_time

    if time_unit == 'day':
        elapsed = max_time - min_time
        return elapsed.days

    # 'month': twelve months per full year plus the month offset within the year
    years_apart = max_time.year - min_time.year
    months_apart = max_time.month - min_time.month
    return years_apart * 12 + months_apart

In [13]:
# Regular expression matching any character outside the 7-bit ASCII range.
# Note that this also matches accented Latin letters, not only non-Latin scripts.
non_latin_pattern = re.compile(r'[^\x00-\x7F]')

# Set the time unit for grouping (year, month, or day)
time_unit = 'year'  # Change this to 'year', 'month', or 'day'

# Build the cleaned frame in a single chain. Every step returns a new DataFrame,
# so no column is ever assigned on a filtered slice (which would raise
# SettingWithCopyWarning) and nothing is mutated in place.
Paper = (
    Paper
    # Convert the 'Abstract' column to string data type
    .assign(Abstract=lambda d: d['Abstract'].astype(str))
    # Keep only abstracts made up entirely of ASCII characters
    .loc[lambda d: ~d['Abstract'].str.contains(non_latin_pattern)]
    # Convert the 'Date' column to datetime; unparseable values become NaT ...
    .assign(Date=lambda d: pd.to_datetime(d['Date'], errors='coerce'))
    # ... and rows with such invalid dates are dropped
    .dropna(subset=['Date'])
    # Grouping key derived from 'Date' for the selected time unit
    .assign(Time=lambda d: group_by_time_unit(d, time_unit))
)

# Prepare the abstracts ('Abstract' is already a string column at this point)
abstracts = Paper["Abstract"]
text = abstracts.tolist()

In [None]:
# Prepare embeddings
sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = sentence_model.encode(text, show_progress_bar=False)

# UMAP is stochastic: without a fixed random_state the reduced embeddings, and
# therefore the topics and every DoV value derived from them, change between runs.
# The remaining arguments reproduce the UMAP configuration BERTopic builds by default.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    low_memory=False,
    random_state=42,
)

# Agglomerative clustering cut at a distance threshold instead of a fixed number of clusters
cluster_model = AgglomerativeClustering(distance_threshold=0.2, n_clusters=None)

# Train BERTopic on the pre-computed embeddings; the clustering model is passed
# through the hdbscan_model argument in place of the default clusterer
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=cluster_model).fit(text, embeddings)
topic_info = topic_model.get_topic_info()

# Per-document topic assignment, extended with a running ID and the time key of each paper.
# `text` and Paper["Time"] both come from the same rows of `Paper`, so they are aligned by position.
doc_info = topic_model.get_document_info(text)
doc_info["ID"] = doc_info.index
doc_info["Time"] = Paper["Time"].tolist()

In [None]:
# Split the data based on the selected time unit (one DataFrame per period, keyed by the period)
paper_split = dict(tuple(doc_info.groupby('Time')))

# Most recent period in the corpus (e.g., year, month, or day).
# Every period is weighted by its distance from this value.
max_time = doc_info['Time'].max()

# Set the time weight (tw): the weight of a period is 1 - tw * (time units before max_time)
tw = 0.05

# Create a list to store one single-row DataFrame of DoV values per time group
dfs = []

# Iterate over each time group's data
for time_value, group in doc_info.groupby('Time'):
    # Distance of this period from the most recent one. It must be measured against
    # the corpus-wide max_time: min/max taken inside a single group are always equal
    # (the group holds exactly one 'Time' value), which would make the difference 0
    # for every period and silently disable the time weight.
    time_diff = time_difference(max_time, time_value, time_unit)
    time_weight = 1 - tw * time_diff

    # Share of this period's documents assigned to each topic, scaled by the time weight
    topic_share = group['Name'].value_counts() / len(group)

    # One row (indexed by the period) with one column per topic seen in the period
    dfs.append((topic_share * time_weight).to_frame(name=time_value).T)

# Concatenate all the time-based DataFrames into a single DataFrame
# (rows: periods, columns: topics, NaN where a topic is absent from a period)
dov = pd.concat(dfs)

# Compute the geometric mean for each topic across the periods in which it occurs.
# NOTE: gmean is only meaningful for positive values; time_weight becomes
# non-positive once time_diff >= 1 / tw (20 time units with tw = 0.05).
dov_means = pd.DataFrame(
    {name: [gmean(values.dropna())] for name, values in dov.items()}
)

t_DoV = dov_means.melt(var_name="Name", value_name="dov")

# Merge with topic information
wsm_dov = pd.merge(t_DoV, topic_info, on='Name')
wsm_dov

In [None]:
# Persist the per-topic DoV table as CSV, without writing the row index
wsm_dov.to_csv(path_or_buf='data/wsm_dov.csv', index=False)

In [None]:
import plotly.express as px

# One point per topic: 'Count' on the x-axis against 'dov' on the y-axis,
# with the topic name shown on hover
fig = px.scatter(
    data_frame=wsm_dov,
    x='Count',
    y='dov',
    hover_data=['Name'],
)

# Render the figure
fig.show()

In [None]:
# Thresholds: minimum 'dov' and the inclusive range allowed for 'Count'
dov_min = 0.0025
count_min = 2
count_max = 5

# A topic is kept when its 'dov' reaches the minimum and its 'Count' lies inside the range
dov_ok = wsm_dov['dov'] >= dov_min
count_ok = wsm_dov['Count'].between(count_min, count_max)
weak_signal_df = wsm_dov.loc[dov_ok & count_ok]

# Names of the selected topics, taken from the 'Name' column
weak_signals = weak_signal_df['Name'].tolist()
weak_signal_df

In [None]:
# Persist the selected weak-signal topics as CSV, without writing the row index
weak_signal_df.to_csv(path_or_buf='data/weak_signal_df.csv', index=False)