In [11]:
import pandas as pd
import numpy as np
import pickle, re

from tqdm.auto import tqdm
# register tqdm with pandas
tqdm.pandas()

import ahocorasick

from helper.keyword_helper import get_clean_keywords, neo4j_fetch_data

In [2]:
# Neo4j connection settings and the keyword-dictionary directory.
# Each credential is taken from the environment when the corresponding variable
# (NEO4J_URL, NEO4J_USER, NEO4J_PASSWORD) is set, so real secrets do not have to
# be written into the notebook; the fallbacks are the local-development values
# this notebook used originally.
import os

NEO4J_CREDENTIALS = {
    "url": os.environ.get("NEO4J_URL", "bolt://localhost:37687"),
    "user": os.environ.get("NEO4J_USER", "neo4j"),
    "password": os.environ.get("NEO4J_PASSWORD", "neo4jpassword"),
}
# Directory containing the keyword dictionary CSV files (note the trailing slash).
DICT_DIRECTORY = "data/dictionaries/"

In [3]:
# Import the core and extended dictionaries.
# Paths are built from DICT_DIRECTORY (which already ends with a slash) so the
# dictionary location is configured in exactly one place.
core_dict = pd.read_csv(f"{DICT_DIRECTORY}core_keywords.csv")
extended_dict = pd.read_csv(f"{DICT_DIRECTORY}extended_keywords.csv")
extended_dict_neg = pd.read_csv(f"{DICT_DIRECTORY}extended_keywords_neg.csv")

In [13]:
# Get abstracts from papers in neo4j
query = """
MATCH (p:Paper)
RETURN p.id AS id, p.title AS title, p.abstract AS abstract
LIMIT 1000
"""
print("Fetching data...")
papers = neo4j_fetch_data(query, NEO4J_CREDENTIALS)
print("Done.")
display(papers.head())


def clean_abstract(text):
    """Return ``text`` lower-cased with every non-ASCII-letter replaced by a space.

    Values that are not strings (for example a paper whose abstract is missing)
    are mapped to an empty string instead of raising ``TypeError`` in ``re.sub``.
    """
    if not isinstance(text, str):
        return ""
    return re.sub(r"[^a-zA-Z]", " ", text).lower()


# Clean the abstracts with regex. Only keep alpha, then lower-case; both steps
# run in a single pass over the column.
papers["abstract_clean"] = papers["abstract"].progress_apply(clean_abstract)

display(papers.head())

Fetching data...
Done.


Unnamed: 0,id,title,abstract
0,4769585118369390189087340060926485915591163442...,Spectral feature mapping with mimic loss for r...,"For the task of speech enhancement, local lear..."
1,7976349412852653205399785718984636615103045675...,"Detecting truth, just on parts","We introduce and discuss, through a computatio..."
2,9304173010254316121188238355449324959015075236...,Failing to Learn: Autonomously Identifying Per...,One of the major open challenges in self-drivi...
3,6525473766675309646606181528323199963925323625...,Enhancing Evolutionary Conversion Rate Optimiz...,Conversion rate optimization means designing w...
4,1127488343483927770718834009113463814949406094...,Transferable Joint Attribute-Identity Deep Lea...,Most existing person re-identification (re-id)...


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,id,title,abstract,abstract_clean
0,4769585118369390189087340060926485915591163442...,Spectral feature mapping with mimic loss for r...,"For the task of speech enhancement, local lear...",for the task of speech enhancement local lear...
1,7976349412852653205399785718984636615103045675...,"Detecting truth, just on parts","We introduce and discuss, through a computatio...",we introduce and discuss through a computatio...
2,9304173010254316121188238355449324959015075236...,Failing to Learn: Autonomously Identifying Per...,One of the major open challenges in self-drivi...,one of the major open challenges in self drivi...
3,6525473766675309646606181528323199963925323625...,Enhancing Evolutionary Conversion Rate Optimiz...,Conversion rate optimization means designing w...,conversion rate optimization means designing w...
4,1127488343483927770718834009113463814949406094...,Transferable Joint Attribute-Identity Deep Lea...,Most existing person re-identification (re-id)...,most existing person re identification re id ...


In [15]:
# Topic modelling: fit an LDA model on the paper abstracts and print the topics.
import gensim
from gensim import corpora
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Corpora needed by the stop-word filter and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

documents = papers["abstract"].tolist()

# Preprocessing
stop = set(stopwords.words('english'))
lemma = WordNetLemmatizer()

# Ensure only strings are processed: lower-case, split on whitespace, drop
# English stop words, then lemmatize what remains.
texts = [[lemma.lemmatize(word) for word in document.lower().split() if word not in stop]
         for document in documents if isinstance(document, str)]

# Create a dictionary and corpus needed for Topic Modeling
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Apply LDA. A fixed random_state makes the fitted topics reproducible across
# runs; without it every execution of this cell yields different topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)

# Extract and display topics
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/wilinski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/wilinski/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(0, '0.014*"image" + 0.006*"method" + 0.006*"approach" + 0.006*"algorithm"')
(1, '0.013*"network" + 0.010*"model" + 0.006*"image" + 0.006*"learning"')
(2, '0.010*"model" + 0.008*"image" + 0.007*"network" + 0.006*"method"')
(3, '0.007*"model" + 0.006*"method" + 0.006*"image" + 0.005*"based"')
(4, '0.008*"learning" + 0.008*"image" + 0.007*"method" + 0.006*"class"')
(5, '0.011*"model" + 0.006*"learning" + 0.006*"show" + 0.005*"network"')
(6, '0.009*"model" + 0.008*"feature" + 0.007*"network" + 0.006*"approach"')
(7, '0.012*"method" + 0.011*"image" + 0.011*"network" + 0.007*"model"')
(8, '0.006*"image" + 0.004*"model" + 0.003*"recruitment" + 0.003*"using"')
(9, '0.008*"data" + 0.007*"method" + 0.007*"learning" + 0.007*"network"')


In [None]:
# Using R inside python
import rpy2
import rpy2.robjects.packages as rpackages
from rpy2.robjects.vectors import StrVector
from rpy2.robjects.packages import importr
utils = rpackages.importr('utils')
utils.chooseCRANmirror(ind=1)

# Install packages
# The trailing comma matters: ("seededlda") is just a string, and building a
# StrVector from a string iterates over its characters, so R would be asked to
# install packages named "s", "e", "e", ... instead of "seededlda".
packnames = ("seededlda",)
# Only install what is not already available, so re-running the cell does not
# reinstall the package every time.
names_to_install = [name for name in packnames if not rpackages.isinstalled(name)]
if names_to_install:
    utils.install_packages(StrVector(names_to_install))

# Load packages
seededlda = importr("seededlda")

# Create a list of seed words
