In [1]:
!pip install -q bertopic pandas nltk scikit-learn matplotlib xlwt 

In [21]:
import pandas as pd
docs = pd.read_csv('/home/jupyter/WSM/data/papers.csv')
docs = docs.dropna()
docs

Unnamed: 0,Date,Abstract
0,2015-03-10,Garlic is valued more for its flavoring and us...
1,2015-03-10,The present investigation was undertaken to st...
2,2015-01-01,Upland areas of southeastern U.S. tidal creek ...
3,2014-12-10,The specific activity of 90Sr in milk and vege...
4,2014-06-11,A survey was conducted to evaluate aflatoxin c...
...,...,...
12686,2020-09-29,The gluten-free food market is growing with th...
12687,2020-09-02,Hurricane Sandy made landfall in New Jersey on...
12688,2020-08-28,Prevalent risks in meat value-chains of sub-Sa...
12689,2020-06-03,The presence of metal contaminants in agricult...


In [22]:
import pandas as pd
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
from umap import UMAP
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean
import nltk
import re

In [23]:
# Function to group by a specific time unit (Year, Month, Day)
def group_by_time_unit(df, time_unit):
    if time_unit == 'year':
        return df['Date'].dt.year
    elif time_unit == 'month':
        return df['Date'].dt.to_period('M')  # Month as YYYY-MM
    elif time_unit == 'day':
        return df['Date'].dt.date  # Exact day as YYYY-MM-DD
    else:
        raise ValueError("Invalid time_unit. Choose 'year', 'month', or 'day'.")

In [24]:
def time_difference(max_time, min_time, time_unit):
    if time_unit == 'year':
        return max_time - min_time  # Directly subtract the years since they are integers
    elif time_unit == 'month':
        return (max_time.year - min_time.year) * 12 + (max_time.month - min_time.month)  # Total months
    elif time_unit == 'day':
        return (max_time - min_time).days  # Difference in days
    else:
        raise ValueError("Invalid time unit for difference. Choose 'year', 'month', or 'day'.")

In [25]:
# Convert the 'Abstract' column to string data type
docs['Abstract'] = docs['Abstract'].astype(str)

# Define a regular expression pattern to match non-Latin characters
non_latin_pattern = re.compile(r'[^\x00-\x7F]')

# Apply the regular expression pattern to the 'Abstract' column
docs = docs[~docs['Abstract'].str.contains(non_latin_pattern)]

# Set the time unit for grouping (year, month, or day)
time_unit = 'year'  # Change this to 'year', 'month', or 'day'

# Convert the 'Date' column to datetime and drop invalid dates
docs['Date'] = pd.to_datetime(docs['Date'], errors='coerce')
docs.dropna(subset=['Date'], inplace=True)

# Group data based on the selected time unit
docs['Time'] = group_by_time_unit(docs, time_unit)

# Prepare the abstracts
abstracts = docs["Abstract"].astype('str')
text = abstracts.tolist()

In [26]:
# Prepare embeddings
sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = sentence_model.encode(text, show_progress_bar=False)
cluster_model = AgglomerativeClustering(distance_threshold=0.2, n_clusters=None)

# Train BERTopic
topic_model = BERTopic(hdbscan_model=cluster_model).fit(text, embeddings)
topic_info = topic_model.get_topic_info()

doc_info = topic_model.get_document_info(text)
doc_info["ID"] = doc_info.index
doc_info["Time"] = docs["Time"].tolist()

In [27]:
# Split the data based on the selected time unit
paper_split = dict(tuple(doc_info.groupby('Time')))

# Get the maximum time value (e.g., year, month, or day)
max_time = doc_info['Time'].max()

# Initialize an empty DataFrame
dov = pd.DataFrame()

# Create a list to store individual DataFrames for each time group (year, month, or day)
dfs = []

# Set the time weight (tw)
tw = 0.05

# Iterate over each time group's data
for time_value, group in doc_info.groupby('Time'):
    topic_count = group['Name'].value_counts().reset_index()
    topic_count.columns = ['Name', 'Count']
    topic_count['Time'] = time_value

    # Get min and max time for this group
    min_time = group['Time'].min()
    max_time = group['Time'].max()

    # Calculate the time difference (based on the time unit)
    time_diff = time_difference(max_time, min_time, time_unit)

    # Create a DataFrame for the current time's DoV values
    df = pd.DataFrame(index=[min_time], columns=topic_count['Name'])
    for _, row in topic_count.iterrows():
        topic = row['Name']
        frequency = row['Count']
        # Ensure the time difference is factored in correctly
        df[topic] = (frequency / len(group)) * (1 - tw * time_diff)

    dfs.append(df)

# Concatenate all the time-based DataFrames into a single DataFrame
dov = pd.concat(dfs)

# Compute the geometric mean for each topic across time units
dov_means = pd.DataFrame(columns=dov.columns)

for column in dov.columns:
    column_mean = gmean(dov[column].dropna())
    dov_means[column] = [column_mean]

t_DoV = dov_means.melt(var_name="Name", value_name="dov")

# Merge with topic information
wsm_dov = pd.merge(t_DoV, topic_info, on='Name')
wsm_dov

Unnamed: 0,Name,dov,Topic,Count,Representation,Representative_Docs
0,164_responsibility_labelling_cues_perceived,0.003392,164,11,"[responsibility, labelling, cues, perceived, f...",[Unsafe food handling behaviors are common amo...
1,85_radium_dose_clothing_sudoqu,0.006758,85,13,"[radium, dose, clothing, sudoqu, radioactive, ...","[The Fukushima nuclear accident (Japan, 11 Mar..."
2,146_furazolidone_coccidiostats_residues_residue,0.002868,146,11,"[furazolidone, coccidiostats, residues, residu...",[This article presents a review of the current...
3,582_foragers_social_collective_colony,0.004269,582,5,"[foragers, social, collective, colony, nestmat...",[Foraging is a result of innate and acquired m...
4,51_eels_dabs_bde47_polybrominated,0.003693,51,15,"[eels, dabs, bde47, polybrominated, biphenyls,...",[There is growing evidence that more people in...
...,...,...,...,...,...,...
1403,1292_coffee_silverskin_cs_unsustainable,0.000996,1292,1,"[coffee, silverskin, cs, unsustainable, roasti...",[The coffee supply chain is characterized by a...
1404,1291_kernels_like_maize_fungal,0.000996,1291,1,"[kernels, like, maize, fungal, mycotoxin, path...",[Mycotoxin contamination of maize kernels by f...
1405,1290_coldsmoked_supporting_salmon_capable,0.000996,1290,1,"[coldsmoked, supporting, salmon, capable, read...",[Cold-smoked salmon is a ready-to-eat food pro...
1406,1278_regulators_pathwayspecific_inhibit_biosyn...,0.015873,1278,1,"[regulators, pathwayspecific, inhibit, biosynt...",[Mycotoxin contamination in food poses health ...


In [28]:
wsm_dov.to_csv('data/wsm_dov.csv', index=False)

In [29]:
import plotly.express as px

# Create the scatter plot using Plotly
fig = px.scatter(wsm_dov, x='Count', y='dov', hover_data=['Name'])

# Show the plot
fig.show()

In [34]:
# Set the desired range for 'dov' and 'frequency'
dov_min = 0.004
count_min = 3
count_max = 6

# Filter the DataFrame based on the range of values
weak_signal_df = wsm_dov[(wsm_dov['dov'] >= dov_min) & (wsm_dov['Count'] >= count_min) & (wsm_dov['Count'] <= count_max)]

# Get the values from the 'term' column in the filtered DataFrame
weak_signals = weak_signal_df['Name'].values.tolist()
weak_signal_df

Unnamed: 0,Name,dov,Topic,Count,Representation,Representative_Docs
3,582_foragers_social_collective_colony,0.004269,582,5,"[foragers, social, collective, colony, nestmat...",[Foraging is a result of innate and acquired m...
7,700_pregnancy_hypospadias_glucose_discourses,0.004397,700,4,"[pregnancy, hypospadias, glucose, discourses, ...",[Overweight and obesity pre pregnancy or durin...
11,924_microscopically_bioinformaticsenabled_libr...,0.008287,924,3,"[microscopically, bioinformaticsenabled, libra...",[Since the advent of the use of matrix-assiste...
28,678_pgamended_peppers_ferralsols_moderately,0.005038,678,4,"[pgamended, peppers, ferralsols, moderately, p...",[Lead (Pb) contamination of soil poses severe ...
33,482_afb1_aflatoxin_indonesia_cancer,0.004123,482,6,"[afb1, aflatoxin, indonesia, cancer, ngkg, 001...",[This mini review article described the exposu...
35,442_lenses_device_maude_sentinel,0.004003,442,6,"[lenses, device, maude, sentinel, stroke, comp...",[All contact lenses (corrective/noncorrective)...
61,453_sanguineus_sachet_salivary_am,0.00516,453,6,"[sanguineus, sachet, salivary, am, ivermectin,...",[The purpose of this study was to develop an a...
74,847_buttonwood_site_stem_cadmium,0.006593,847,3,"[buttonwood, site, stem, cadmium, leaf, chicor...",[Soil contaminated with cadmium presents a pot...
93,690_sfr_bottles_asepticuht_1011,0.004046,690,4,"[sfr, bottles, asepticuht, 1011, stearothermop...",[Aseptic ultra-high-temperature (UHT)-type pro...
110,656_timetemperature_git_options_simulated,0.005065,656,4,"[timetemperature, git, options, simulated, buf...",[Even though a plethora of barriers are employ...


In [35]:
weak_signal_df.to_csv('data/weak_signal_df.csv', index=False)